Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-23 13:09:00 +00:00
(converter) Add AnchorTextKeywords to EncyclopediaMarginaliaNuSideloader processing
The commit updates EncyclopediaMarginaliaNuSideloader to include AnchorTextKeywords in document processing, which aids search result relevance. It also removes old test-related functionality, along with a large but fairly useless test that was previously used to debug a specific problem and had become a detriment to the overall code quality.
parent eccb12b366
commit b6511fbfe2
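In outline, the change means that once SideloaderProcessing has produced a ProcessedDocument, the sideloader asks AnchorTextKeywords for keywords mined from the anchor text of links pointing at the document's URL, and folds those into the document's keyword set. A minimal sketch of that flow in isolation, using only calls visible in the diff below; the wrapper method itself is hypothetical:

    // Sketch of the enrichment step this commit introduces; it mirrors the
    // convertDocument() hunk in EncyclopediaMarginaliaNuSideloader below.
    // The surrounding helper method is invented for illustration.
    private ProcessedDocument enrichWithAnchorText(ProcessedDocument doc, DomainLinks domainLinks) {
        // Only documents that were processed fully get the extra keywords
        if (doc.isProcessedFully()) {
            // Keywords mined from the anchor text of links pointing at doc.url
            doc.words.addAnchorTerms(
                    anchorTextKeywords.getAnchorTextKeywords(domainLinks, doc.url)
            );
        }
        return doc;
    }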
SideloadSourceFactory.java

@@ -2,6 +2,7 @@ package nu.marginalia.converting.sideload;
 import com.google.gson.Gson;
 import com.google.inject.Inject;
+import nu.marginalia.atags.AnchorTextKeywords;
 import nu.marginalia.atags.source.AnchorTagsSourceFactory;
 import nu.marginalia.converting.sideload.dirtree.DirtreeSideloaderFactory;
 import nu.marginalia.converting.sideload.encyclopedia.EncyclopediaMarginaliaNuSideloader;
@@ -20,6 +21,7 @@ public class SideloadSourceFactory {
     private final SideloaderProcessing sideloaderProcessing;
     private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider;
     private final DocumentKeywordExtractor documentKeywordExtractor;
+    private final AnchorTextKeywords anchorTextKeywords;
     private final AnchorTagsSourceFactory anchorTagsSourceFactory;
     private final DirtreeSideloaderFactory dirtreeSideloaderFactory;
 
@@ -27,19 +29,20 @@ public class SideloadSourceFactory {
     public SideloadSourceFactory(Gson gson,
                                  SideloaderProcessing sideloaderProcessing,
                                  ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
-                                 DocumentKeywordExtractor documentKeywordExtractor,
+                                 DocumentKeywordExtractor documentKeywordExtractor, AnchorTextKeywords anchorTextKeywords,
                                  AnchorTagsSourceFactory anchorTagsSourceFactory,
                                  DirtreeSideloaderFactory dirtreeSideloaderFactory) {
         this.gson = gson;
         this.sideloaderProcessing = sideloaderProcessing;
         this.sentenceExtractorProvider = sentenceExtractorProvider;
         this.documentKeywordExtractor = documentKeywordExtractor;
+        this.anchorTextKeywords = anchorTextKeywords;
         this.anchorTagsSourceFactory = anchorTagsSourceFactory;
         this.dirtreeSideloaderFactory = dirtreeSideloaderFactory;
     }
 
     public SideloadSource sideloadEncyclopediaMarginaliaNu(Path pathToDbFile, String baseUrl) throws SQLException {
-        return new EncyclopediaMarginaliaNuSideloader(pathToDbFile, baseUrl, gson, anchorTagsSourceFactory, sideloaderProcessing);
+        return new EncyclopediaMarginaliaNuSideloader(pathToDbFile, baseUrl, gson, anchorTagsSourceFactory, anchorTextKeywords, sideloaderProcessing);
     }
 
     public Collection<? extends SideloadSource> sideloadDirtree(Path pathToYamlFile) throws IOException {
EncyclopediaMarginaliaNuSideloader.java

@@ -3,6 +3,7 @@ package nu.marginalia.converting.sideload.encyclopedia;
 import com.github.luben.zstd.ZstdInputStream;
 import com.google.gson.Gson;
 import lombok.SneakyThrows;
+import nu.marginalia.atags.AnchorTextKeywords;
 import nu.marginalia.atags.model.DomainLinks;
 import nu.marginalia.atags.source.AnchorTagsSourceFactory;
 import nu.marginalia.converting.model.DisqualifiedException;
@@ -35,7 +36,7 @@ import java.util.concurrent.atomic.AtomicBoolean;
 
 /** This is an experimental sideloader for encyclopedia.marginalia.nu's database;
  * (which serves as a way of loading wikipedia's zim files without binding to GPL2'd code)
- *
+ * <p>
  * See https://github.com/MarginaliaSearch/encyclopedia.marginalia.nu for extracting the data
  */
 public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoCloseable {
@@ -43,6 +44,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoCloseable {
     private final Connection connection;
     private final EdgeUrl baseUrl;
     private final Gson gson;
+    private final AnchorTextKeywords anchorTextKeywords;
     private final SideloaderProcessing sideloaderProcessing;
     private final AnchorTagsSourceFactory anchorTagsSourceFactory;
     private static final Logger logger = LoggerFactory.getLogger(EncyclopediaMarginaliaNuSideloader.class);
@@ -51,9 +53,11 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoCloseable {
                                              String baseUrl,
                                              Gson gson,
                                              AnchorTagsSourceFactory anchorTagsSourceFactory,
+                                             AnchorTextKeywords anchorTextKeywords,
                                              SideloaderProcessing sideloaderProcessing) throws SQLException {
         this.baseUrl = EdgeUrl.parse(baseUrl).orElseThrow(AssertionError::new);
         this.gson = gson;
+        this.anchorTextKeywords = anchorTextKeywords;
         this.sideloaderProcessing = sideloaderProcessing;
         String sqliteDbString = "jdbc:sqlite:" + pathToDbFile.toString();
 
@@ -103,7 +107,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoCloseable {
                 try {
                     docs.add(convertDocument(articleParts.parts, title, url, domainLinks));
                 } catch (URISyntaxException | DisqualifiedException e) {
-                    e.printStackTrace();
+                    logger.warn("Problem converting encyclopedia article " + url, e);
                 } finally {
                     sem.release();
                 }
@@ -113,7 +117,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoCloseable {
             stmt.close();
         }
         catch (Exception e) {
-            e.printStackTrace();
+            logger.warn("Problem converting encyclopedia article", e);
         }
         finally {
             isFinished.set(true);
@@ -142,30 +146,6 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoCloseable {
             logger.error("Failed to create anchor tags source", ex);
             return new DomainLinks();
         }
 
     }
-
-    ProcessedDocument processJust(String url) throws SQLException, IOException, URISyntaxException, DisqualifiedException {
-        var stmt = connection.prepareStatement("""
-                SELECT url,title,html
-                FROM articles
-                WHERE url=?
-                """);
-        stmt.setFetchSize(100);
-        stmt.setString(1, url);
-
-        var rs = stmt.executeQuery();
-        if (rs.next()) {
-            var articleParts = fromCompressedJson(rs.getBytes("html"), ArticleParts.class);
-            String title = rs.getString("title");
-
-            return convertDocument(articleParts.parts,
-                    title,
-                    URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8),
-                    new DomainLinks() // FIXME (2023-11-06): Sideloaded dirtrees don't have access to anchor tag data.
-            );
-        }
-        return null;
-    }
-
     private ProcessedDocument convertDocument(List<String> parts, String title, String url, DomainLinks domainLinks) throws URISyntaxException, DisqualifiedException {
@@ -180,13 +160,22 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoCloseable {
         }
         fullHtml.append("</body></html>");
 
-        return sideloaderProcessing
+        var doc = sideloaderProcessing
                 .processDocument(fullUrl,
                         fullHtml.toString(),
                         List.of("encyclopedia", "wiki"),
                         domainLinks,
                         GeneratorType.WIKI,
                         10_000_000);
+
+        // Add anchor text keywords
+        if (doc.isProcessedFully()) {
+            doc.words.addAnchorTerms(
+                    anchorTextKeywords.getAnchorTextKeywords(domainLinks, doc.url)
+            );
+        }
+
+        return doc;
     }
 
     private <T> T fromCompressedJson(byte[] stream, Class<T> type) throws IOException {
EncyclopediaMarginaliaNuSideloaderTest.java

@@ -1,38 +1,12 @@
 package nu.marginalia.converting.sideload.encyclopedia;
 
-import com.google.inject.AbstractModule;
-import com.google.inject.Guice;
-import gnu.trove.list.array.TLongArrayList;
-import nu.marginalia.atags.source.AnchorTagsSourceFactory;
-import nu.marginalia.atags.model.DomainLinks;
-import nu.marginalia.converting.ConverterModule;
-import nu.marginalia.converting.model.DisqualifiedException;
-import nu.marginalia.converting.processor.ConverterDomainTypes;
-import nu.marginalia.converting.sideload.SideloaderProcessing;
-import nu.marginalia.io.processed.DocumentRecordParquetFileReader;
-import nu.marginalia.io.processed.DocumentRecordParquetFileWriter;
-import nu.marginalia.model.crawl.HtmlFeature;
-import nu.marginalia.model.gson.GsonFactory;
-import nu.marginalia.model.idx.WordMetadata;
-import nu.marginalia.model.processed.DocumentRecord;
 import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
-import org.mockito.Mockito;
 
 import java.io.IOException;
-import java.net.URISyntaxException;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.sql.SQLException;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import static org.junit.jupiter.api.Assertions.*;
-import static org.mockito.Mockito.when;
 
 class EncyclopediaMarginaliaNuSideloaderTest {
     Path tempFile;
@@ -62,93 +36,5 @@ class EncyclopediaMarginaliaNuSideloaderTest {
         System.out.printf("%64s\n", Long.toBinaryString(Long.reverseBytes(0x1000000000000000L)));
         System.out.printf("%64s\n", Long.toBinaryString(0x10L));
     }
-    @Test
-    public void debugSpecificArticle() throws SQLException, IOException, URISyntaxException, DisqualifiedException {
-        Path pathToDbFile = Path.of("/home/vlofgren/Code/MarginaliaSearch/run/samples/articles.db");
-        if (!Files.exists(pathToDbFile)) {
-            // not really practical to ship a 40 Gb sqlite files on github
-            // be @vlofgren to run this test
-            return;
-        }
-        var domainTypesMock = Mockito.mock(ConverterDomainTypes.class);
-        when(domainTypesMock.isBlog(Mockito.any())).thenReturn(false);
-        var processing = Guice.createInjector(new ConverterModule(),
-                new AbstractModule() {
-                    public void configure() {
-                        bind(ConverterDomainTypes.class).toInstance(domainTypesMock);
-                    }
-                }
-        )
-                .getInstance(SideloaderProcessing.class);
-
-        var atagsFactory = Mockito.mock(AnchorTagsSourceFactory.class);
-        when(atagsFactory.create(Mockito.any())).thenReturn(domain -> new DomainLinks());
-
-        var sideloader = new EncyclopediaMarginaliaNuSideloader(
-                pathToDbFile,
-                "https://en.wikipedia.org/wiki/",
-                GsonFactory.get(),
-                atagsFactory,
-                processing
-        );
-
-        var document = sideloader.processJust("Don't_Tell_Me_(Madonna_song)");
-
-        System.out.println(document);
-
-        var keywordsBuilt = document.words.build();
-
-        var ptr = keywordsBuilt.newPointer();
-
-        Map<String, WordMetadata> dirtyAndBlues = new HashMap<>();
-
-        while (ptr.advancePointer()) {
-            String word = ptr.getKeyword();
-
-            System.out.println(word + ": " + Long.toHexString(Long.reverseBytes(ptr.getMetadata())));
-
-            if (Set.of("dirty", "blues").contains(word)) {
-                WordMetadata meta = new WordMetadata(ptr.getMetadata());
-
-                Assertions.assertNull(
-                        dirtyAndBlues.put(word, meta)
-                );
-            }
-        }
-
-        Assertions.assertTrue(dirtyAndBlues.containsKey("dirty"));
-        Assertions.assertTrue(dirtyAndBlues.containsKey("blues"));
-        Assertions.assertNotEquals(
-                dirtyAndBlues.get("dirty"),
-                dirtyAndBlues.get("blues")
-        );
-
-        try (var dw = new DocumentRecordParquetFileWriter(tempFile)) {
-            dw.write(new DocumentRecord(
-                    "encyclopedia.marginalia.nu",
-                    document.url.toString(),
-                    0,
-                    document.state.toString(),
-                    document.stateReason,
-                    document.details.title,
-                    document.details.description,
-                    HtmlFeature.encode(document.details.features),
-                    document.details.standard.name(),
-                    document.details.length,
-                    document.details.hashCode,
-                    (float) document.details.quality,
-                    document.details.metadata.encode(),
-                    document.details.pubYear,
-                    List.of(keywordsBuilt.keywords),
-                    new TLongArrayList(keywordsBuilt.metadata)
-            ));
-        }
-
-        var record = DocumentRecordParquetFileReader.streamKeywordsProjection(tempFile).findFirst().get();
-        String[] words = record.words.toArray(String[]::new);
-        long[] meta = record.metas.toArray();
-
-        assertArrayEquals(keywordsBuilt.keywords, words);
-        assertArrayEquals(keywordsBuilt.metadata, meta);
-    }
 }
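For callers, the visible effect is one extra constructor argument: AnchorTextKeywords now sits between the anchor tags source factory and the processing pipeline. A sketch of an updated call site, matching the wiring shown in SideloadSourceFactory above:

    // New six-argument constructor; argument order per the diff above
    var sideloader = new EncyclopediaMarginaliaNuSideloader(
            pathToDbFile, baseUrl, gson,
            anchorTagsSourceFactory, anchorTextKeywords, sideloaderProcessing);

This is also why the deleted debug test could not have survived the change as written: it constructed the sideloader with the old five-argument signature and exercised the now-removed processJust method.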