From b6511fbfe29c30c1e418435f946d51e26144238a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 9 Dec 2023 13:23:21 +0100 Subject: [PATCH] (converter) Add AnchorTextKeywords to EncyclopediaMarginaliaNuSideloader processing The commit updates EncyclopediaMarginaliaNuSideloader to include the AnchorTextKeywords in processing documents, aiding search result relevance. It also removes old test-related functionality and a large but fairly useless test previously used to debug a specific problem, to the detriment of the overall code quality. --- .../sideload/SideloadSourceFactory.java | 7 +- .../EncyclopediaMarginaliaNuSideloader.java | 45 +++---- ...ncyclopediaMarginaliaNuSideloaderTest.java | 114 ------------------ 3 files changed, 22 insertions(+), 144 deletions(-) diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java index debc460f..60f81d19 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/SideloadSourceFactory.java @@ -2,6 +2,7 @@ package nu.marginalia.converting.sideload; import com.google.gson.Gson; import com.google.inject.Inject; +import nu.marginalia.atags.AnchorTextKeywords; import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.converting.sideload.dirtree.DirtreeSideloaderFactory; import nu.marginalia.converting.sideload.encyclopedia.EncyclopediaMarginaliaNuSideloader; @@ -20,6 +21,7 @@ public class SideloadSourceFactory { private final SideloaderProcessing sideloaderProcessing; private final ThreadLocalSentenceExtractorProvider sentenceExtractorProvider; private final DocumentKeywordExtractor documentKeywordExtractor; + private final AnchorTextKeywords anchorTextKeywords; private final AnchorTagsSourceFactory anchorTagsSourceFactory; private final DirtreeSideloaderFactory dirtreeSideloaderFactory; @@ -27,19 +29,20 @@ public class SideloadSourceFactory { public SideloadSourceFactory(Gson gson, SideloaderProcessing sideloaderProcessing, ThreadLocalSentenceExtractorProvider sentenceExtractorProvider, - DocumentKeywordExtractor documentKeywordExtractor, + DocumentKeywordExtractor documentKeywordExtractor, AnchorTextKeywords anchorTextKeywords, AnchorTagsSourceFactory anchorTagsSourceFactory, DirtreeSideloaderFactory dirtreeSideloaderFactory) { this.gson = gson; this.sideloaderProcessing = sideloaderProcessing; this.sentenceExtractorProvider = sentenceExtractorProvider; this.documentKeywordExtractor = documentKeywordExtractor; + this.anchorTextKeywords = anchorTextKeywords; this.anchorTagsSourceFactory = anchorTagsSourceFactory; this.dirtreeSideloaderFactory = dirtreeSideloaderFactory; } public SideloadSource sideloadEncyclopediaMarginaliaNu(Path pathToDbFile, String baseUrl) throws SQLException { - return new EncyclopediaMarginaliaNuSideloader(pathToDbFile, baseUrl, gson, anchorTagsSourceFactory, sideloaderProcessing); + return new EncyclopediaMarginaliaNuSideloader(pathToDbFile, baseUrl, gson, anchorTagsSourceFactory, anchorTextKeywords, sideloaderProcessing); } public Collection sideloadDirtree(Path pathToYamlFile) throws IOException { diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java index aab62ef9..fc1f5015 100644 --- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java +++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java @@ -3,6 +3,7 @@ package nu.marginalia.converting.sideload.encyclopedia; import com.github.luben.zstd.ZstdInputStream; import com.google.gson.Gson; import lombok.SneakyThrows; +import nu.marginalia.atags.AnchorTextKeywords; import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.converting.model.DisqualifiedException; @@ -35,7 +36,7 @@ import java.util.concurrent.atomic.AtomicBoolean; /** This is an experimental sideloader for encyclopedia.marginalia.nu's database; * (which serves as a way of loading wikipedia's zim files without binding to GPL2'd code) - * + *

* See https://github.com/MarginaliaSearch/encyclopedia.marginalia.nu for extracting the data */ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoCloseable { @@ -43,6 +44,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC private final Connection connection; private final EdgeUrl baseUrl; private final Gson gson; + private final AnchorTextKeywords anchorTextKeywords; private final SideloaderProcessing sideloaderProcessing; private final AnchorTagsSourceFactory anchorTagsSourceFactory; private static final Logger logger = LoggerFactory.getLogger(EncyclopediaMarginaliaNuSideloader.class); @@ -51,9 +53,11 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC String baseUrl, Gson gson, AnchorTagsSourceFactory anchorTagsSourceFactory, + AnchorTextKeywords anchorTextKeywords, SideloaderProcessing sideloaderProcessing) throws SQLException { this.baseUrl = EdgeUrl.parse(baseUrl).orElseThrow(AssertionError::new); this.gson = gson; + this.anchorTextKeywords = anchorTextKeywords; this.sideloaderProcessing = sideloaderProcessing; String sqliteDbString = "jdbc:sqlite:" + pathToDbFile.toString(); @@ -103,7 +107,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC try { docs.add(convertDocument(articleParts.parts, title, url, domainLinks)); } catch (URISyntaxException | DisqualifiedException e) { - e.printStackTrace(); + logger.warn("Problem converting encyclopedia article " + url, e); } finally { sem.release(); } @@ -113,7 +117,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC stmt.close(); } catch (Exception e) { - e.printStackTrace(); + logger.warn("Problem converting encyclopedia article", e); } finally { isFinished.set(true); @@ -142,30 +146,6 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC logger.error("Failed to create anchor tags source", ex); return new DomainLinks(); } - - } - - ProcessedDocument processJust(String url) throws SQLException, IOException, URISyntaxException, DisqualifiedException { - var stmt = connection.prepareStatement(""" - SELECT url,title,html - FROM articles - WHERE url=? - """); - stmt.setFetchSize(100); - stmt.setString(1, url); - - var rs = stmt.executeQuery(); - if (rs.next()) { - var articleParts = fromCompressedJson(rs.getBytes("html"), ArticleParts.class); - String title = rs.getString("title"); - - return convertDocument(articleParts.parts, - title, - URLEncoder.encode(rs.getString("url"), StandardCharsets.UTF_8), - new DomainLinks() // FIXME (2023-11-06): Sideloaded dirtrees don't have access to anchor tag data. - ); - } - return null; } private ProcessedDocument convertDocument(List parts, String title, String url, DomainLinks domainLinks) throws URISyntaxException, DisqualifiedException { @@ -180,13 +160,22 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC } fullHtml.append(""); - return sideloaderProcessing + var doc = sideloaderProcessing .processDocument(fullUrl, fullHtml.toString(), List.of("encyclopedia", "wiki"), domainLinks, GeneratorType.WIKI, 10_000_000); + + // Add anchor text keywords + if (doc.isProcessedFully()) { + doc.words.addAnchorTerms( + anchorTextKeywords.getAnchorTextKeywords(domainLinks, doc.url) + ); + } + + return doc; } private T fromCompressedJson(byte[] stream, Class type) throws IOException { diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloaderTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloaderTest.java index 0b1b6904..fd4aa73a 100644 --- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloaderTest.java +++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloaderTest.java @@ -1,38 +1,12 @@ package nu.marginalia.converting.sideload.encyclopedia; -import com.google.inject.AbstractModule; -import com.google.inject.Guice; -import gnu.trove.list.array.TLongArrayList; -import nu.marginalia.atags.source.AnchorTagsSourceFactory; -import nu.marginalia.atags.model.DomainLinks; -import nu.marginalia.converting.ConverterModule; -import nu.marginalia.converting.model.DisqualifiedException; -import nu.marginalia.converting.processor.ConverterDomainTypes; -import nu.marginalia.converting.sideload.SideloaderProcessing; -import nu.marginalia.io.processed.DocumentRecordParquetFileReader; -import nu.marginalia.io.processed.DocumentRecordParquetFileWriter; -import nu.marginalia.model.crawl.HtmlFeature; -import nu.marginalia.model.gson.GsonFactory; -import nu.marginalia.model.idx.WordMetadata; -import nu.marginalia.model.processed.DocumentRecord; import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import org.mockito.Mockito; import java.io.IOException; -import java.net.URISyntaxException; import java.nio.file.Files; import java.nio.file.Path; -import java.sql.SQLException; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import static org.junit.jupiter.api.Assertions.*; -import static org.mockito.Mockito.when; class EncyclopediaMarginaliaNuSideloaderTest { Path tempFile; @@ -62,93 +36,5 @@ class EncyclopediaMarginaliaNuSideloaderTest { System.out.printf("%64s\n", Long.toBinaryString(Long.reverseBytes(0x1000000000000000L))); System.out.printf("%64s\n", Long.toBinaryString(0x10L)); } - @Test - public void debugSpecificArticle() throws SQLException, IOException, URISyntaxException, DisqualifiedException { - Path pathToDbFile = Path.of("/home/vlofgren/Code/MarginaliaSearch/run/samples/articles.db"); - if (!Files.exists(pathToDbFile)) { - // not really practical to ship a 40 Gb sqlite files on github - // be @vlofgren to run this test - return; - } - var domainTypesMock = Mockito.mock(ConverterDomainTypes.class); - when(domainTypesMock.isBlog(Mockito.any())).thenReturn(false); - var processing = Guice.createInjector(new ConverterModule(), - new AbstractModule() { - public void configure() { - bind(ConverterDomainTypes.class).toInstance(domainTypesMock); - } - } - ) - .getInstance(SideloaderProcessing.class); - var atagsFactory = Mockito.mock(AnchorTagsSourceFactory.class); - when(atagsFactory.create(Mockito.any())).thenReturn(domain -> new DomainLinks()); - - var sideloader = new EncyclopediaMarginaliaNuSideloader( - pathToDbFile, - "https://en.wikipedia.org/wiki/", - GsonFactory.get(), - atagsFactory, - processing - ); - - var document = sideloader.processJust("Don't_Tell_Me_(Madonna_song)"); - - System.out.println(document); - - var keywordsBuilt = document.words.build(); - - var ptr = keywordsBuilt.newPointer(); - - Map dirtyAndBlues = new HashMap<>(); - - while (ptr.advancePointer()) { - String word = ptr.getKeyword(); - - System.out.println(word + ": " + Long.toHexString(Long.reverseBytes(ptr.getMetadata()))); - - if (Set.of("dirty", "blues").contains(word)) { - WordMetadata meta = new WordMetadata(ptr.getMetadata()); - - Assertions.assertNull( - dirtyAndBlues.put(word, meta) - ); - } - } - - Assertions.assertTrue(dirtyAndBlues.containsKey("dirty")); - Assertions.assertTrue(dirtyAndBlues.containsKey("blues")); - Assertions.assertNotEquals( - dirtyAndBlues.get("dirty"), - dirtyAndBlues.get("blues") - ); - - try (var dw = new DocumentRecordParquetFileWriter(tempFile)) { - dw.write(new DocumentRecord( - "encyclopedia.marginalia.nu", - document.url.toString(), - 0, - document.state.toString(), - document.stateReason, - document.details.title, - document.details.description, - HtmlFeature.encode(document.details.features), - document.details.standard.name(), - document.details.length, - document.details.hashCode, - (float) document.details.quality, - document.details.metadata.encode(), - document.details.pubYear, - List.of(keywordsBuilt.keywords), - new TLongArrayList(keywordsBuilt.metadata) - )); - } - - var record = DocumentRecordParquetFileReader.streamKeywordsProjection(tempFile).findFirst().get(); - String[] words = record.words.toArray(String[]::new); - long[] meta = record.metas.toArray(); - - assertArrayEquals(keywordsBuilt.keywords, words); - assertArrayEquals(keywordsBuilt.metadata, meta); - } } \ No newline at end of file