(keywords) Clean up leaky abstractions, clean up tests

commit 5f427d2b4c
parent 8c0ce4fc1d
nu/marginalia/keyword/model/DocumentKeywords.java

@@ -3,26 +3,32 @@ package nu.marginalia.keyword.model;
 import nu.marginalia.model.idx.WordMetadata;
 
 import java.io.Serializable;
-import java.util.Arrays;
 
-public final class DocumentKeywords {
-    final String[] keywords;
-    final long[] metadata;
-
+public record DocumentKeywords(String[] keywords,
+                               long[] metadata)
+implements Serializable
+{
     public DocumentKeywords(String[] keywords,
                             long[] metadata)
     {
         this.keywords = keywords;
         this.metadata = metadata;
     }
 
     @Override
     public String toString() {
         StringBuilder sb = new StringBuilder();
         sb.append(getClass().getSimpleName());
         sb.append('[');
-        for (int i = 0; i < keywords.length; i++) {
+        var pointer = newPointer();
+        while (pointer.advancePointer()) {
             sb.append("\n\t ");
-            if (metadata[i] != 0) {
-                sb.append(keywords[i]).append("/").append(new WordMetadata(metadata[i]));
-            }
-            else {
-                sb.append(keywords[i]);
+
+            long metadata = pointer.getMetadata();
+            String keyword = pointer.getKeyword();
+            sb.append(keyword);
+
+            if (metadata != 0) {
+                sb.append("/").append(new WordMetadata(metadata));
             }
         }
         return sb.append("\n]").toString();

@@ -36,7 +42,11 @@ implements Serializable
         return keywords.length;
     }
 
-    public DocumentKeywords subList(int start, int end) {
-        return new DocumentKeywords(Arrays.copyOfRange(keywords, start, end), Arrays.copyOfRange(metadata, start, end));
+    /** Return a pointer for traversing this structure */
+    public DocumentKeywordsPointer newPointer() {
+        return new DocumentKeywordsPointer(this);
     }
 
 }
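With DocumentKeywords converted to a record, the keywords() and metadata() accessors are compiler-generated from the record components. A minimal sketch of what a caller sees (the array contents here are invented for illustration):

    var dk = new DocumentKeywords(new String[] { "example" }, new long[] { 42L });
    System.out.println(dk.keywords()[0]);   // "example", via the generated accessor
    System.out.println(dk.metadata()[0]);   // 42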
nu/marginalia/keyword/model/DocumentKeywordsPointer.java (new file)

@@ -0,0 +1,41 @@
+package nu.marginalia.keyword.model;
+
+/** Pointer into a {@link DocumentKeywords}.  It starts out before the first position;
+ *  advance it forward with advancePointer().
+ * */
+public class DocumentKeywordsPointer {
+    private int pos = -1;
+
+    private final DocumentKeywords keywords;
+
+    DocumentKeywordsPointer(DocumentKeywords keywords) {
+        this.keywords = keywords;
+    }
+
+    /** Number of positions remaining */
+    public int remaining() {
+        return keywords.size() - Math.max(0, pos);
+    }
+
+    /** Return the keyword associated with the current position */
+    public String getKeyword() {
+        return keywords.keywords()[pos];
+    }
+
+    /** Return the metadata associated with the current position */
+    public long getMetadata() {
+        return keywords.metadata()[pos];
+    }
+
+    /** Advance the current position,
+     *  returns false if this was the
+     *  last position */
+    public boolean advancePointer() {
+        return ++pos < keywords.size();
+    }
+
+    /** Returns true unless the pointer is beyond the last position in the keyword set */
+    public boolean hasMore() {
+        return pos + 1 < keywords.size();
+    }
+}
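A minimal usage sketch of the new pointer API, with a hand-built DocumentKeywords (the keyword strings and metadata values are invented for illustration):

    var keywords = new DocumentKeywords(
            new String[] { "marginalia", "search" },
            new long[]   { 0L, 5L });

    var pointer = keywords.newPointer();
    while (pointer.advancePointer()) {
        // getKeyword()/getMetadata() read the position the pointer was advanced to
        System.out.println(pointer.getKeyword() + "/" + pointer.getMetadata());
    }

Note that the pointer starts one step before the first position, so advancePointer() must succeed before the first getKeyword()/getMetadata() call.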
DocumentKeywordExtractorTest.java

@@ -17,10 +17,11 @@ import java.util.Objects;
 
 class DocumentKeywordExtractorTest {
 
+    DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
+    SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
+
     @Test
     public void testWordPattern() {
-        DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(null);
-
         Assertions.assertTrue(extractor.matchesWordPattern("test"));
         Assertions.assertTrue(extractor.matchesWordPattern("1234567890abcde"));
         Assertions.assertFalse(extractor.matchesWordPattern("1234567890abcdef"));

@@ -34,6 +35,24 @@ class DocumentKeywordExtractorTest {
         Assertions.assertFalse(extractor.matchesWordPattern("Stulpnagelstrasse"));
     }
 
+
+    @Test
+    public void testEmptyMetadata() throws URISyntaxException {
+        var dld = se.extractSentences("""
+                Some sample text, I'm not sure what even triggers this
+                """, "A title perhaps?");
+        var keywordBuilder = extractor.extractKeywords(dld, new EdgeUrl("https://www.example.com/invalid"));
+        var keywords = keywordBuilder.build();
+
+        var pointer = keywords.newPointer();
+        while (pointer.advancePointer()) {
+            if (pointer.getMetadata() == 0L) {
+                System.out.println("Aha! " + pointer.getKeyword());
+            }
+        }
+
+    }
+
     @Test
     public void testKeyboards() throws IOException, URISyntaxException {
         var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"),

@@ -42,9 +61,6 @@ class DocumentKeywordExtractorTest {
         var doc = Jsoup.parse(html);
         doc.filter(new DomPruningFilter(0.5));
 
-        DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
-        SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
-
         var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/"));
         System.out.println(keywords.getMetaForWord("mechanical"));
         System.out.println(keywords.getMetaForWord("keyboard"));
SentenceExtractorTest.java

@@ -26,17 +26,9 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
 
 @Tag("slow")
 class SentenceExtractorTest {
-    SentenceExtractor newSe;
-    SentenceExtractor legacySe;
     final LanguageModels lm = TestLanguageModels.getLanguageModels();
 
-    @BeforeEach
-    public void setUp() {
-
-        newSe = new SentenceExtractor(lm);
-        legacySe = new SentenceExtractor(lm);
-    }
-
+    SentenceExtractor se = new SentenceExtractor(lm);
 
     @SneakyThrows
     public static void main(String... args) throws IOException {

@@ -65,69 +57,16 @@ class SentenceExtractorTest {
         }
     }
 
-    @SneakyThrows
-    @Test
-    void testExtractSubject() {
-        var data = WmsaHome.getHomePath().resolve("test-data/");
-
-        System.out.println("Running");
-
-        SentenceExtractor se = new SentenceExtractor(lm);
-        KeywordExtractor keywordExtractor = new KeywordExtractor();
-
-        for (var file : Objects.requireNonNull(data.toFile().listFiles())) {
-            System.out.println(file);
-            var dld = se.extractSentences(Jsoup.parse(Files.readString(file.toPath())));
-            Map<String, Integer> counts = new HashMap<>();
-            for (var sentence : dld.sentences) {
-                for (WordSpan kw : keywordExtractor.getProperNames(sentence)) {
-                    if (kw.end + 2 >= sentence.length()) {
-                        continue;
-                    }
-                    if (sentence.separators[kw.end] == WordSeparator.COMMA
-                            || sentence.separators[kw.end + 1] == WordSeparator.COMMA)
-                        break;
-
-                    if (("VBZ".equals(sentence.posTags[kw.end]) || "VBP".equals(sentence.posTags[kw.end]))
-                            && ("DT".equals(sentence.posTags[kw.end + 1]) || "RB".equals(sentence.posTags[kw.end]) || sentence.posTags[kw.end].startsWith("VB"))
-                    ) {
-                        counts.merge(new WordRep(sentence, new WordSpan(kw.start, kw.end)).word, -1, Integer::sum);
-                    }
-                }
-            }
-
-            int best = counts.values().stream().mapToInt(Integer::valueOf).min().orElse(0);
-
-            counts.entrySet().stream().sorted(Map.Entry.comparingByValue())
-                    .filter(e -> e.getValue()<-2 && e.getValue()<best*0.75)
-                    .forEach(System.out::println);
-        }
-
-    }
-
-
-    @SneakyThrows
-    @Test
-    @Disabled
-    public void testSE() {
-        var result = newSe.extractSentences(
-                Jsoup.parse(Files.readString(Path.of("/home/vlofgren/man open (2) openat.html"))));
-
-        var dict = new TermFrequencyDict(lm);
-        System.out.println(new DocumentKeywordExtractor(dict).extractKeywords(result, new EdgeUrl("https://memex.marginalia.nu/")));
-    }
-
     @Test
     public void separatorExtraction() {
         seprateExtractor("Cookies, cream and shoes");
         seprateExtractor("Cookies");
         seprateExtractor("");
 
     }
 
     @Test
     public void testACDC() {
-        var ret = newSe.extractSentence("AC/DC is a rock band.");
+        var ret = se.extractSentence("AC/DC is a rock band.");
         assertEquals("AC/DC", ret.words[0]);
     }
 

@@ -139,7 +78,6 @@ class SentenceExtractorTest {
         List<String> words = new ArrayList<>();
         List<String> separators = new ArrayList<>();
 
-        int start = 0;
         int wordStart = 0;
         while (wordStart <= sentence.length()) {
             if (!matcher.find(wordStart)) {
nu/marginalia/loading/loader/KeywordListChunker.java (deleted)

@@ -1,34 +0,0 @@
-package nu.marginalia.loading.loader;
-
-
-import nu.marginalia.keyword.model.DocumentKeywords;
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-
-public class KeywordListChunker {
-
-    /** Chops data into a list of lists of max length size
-     *
-     * Caveat: Relies on subList and does not clone "data", so
-     * changes to the original list may affect the sub-lists
-     * in unspecified ways
-     *
-     * @see List#subList
-     */
-    public static List<DocumentKeywords> chopList(DocumentKeywords data, int size) {
-        if (data.isEmpty())
-            return Collections.emptyList();
-        else if (data.size() < size)
-            return List.of(data);
-
-        final List<DocumentKeywords> ret = new ArrayList<>(1 + data.size() / size);
-
-        for (int i = 0; i < data.size(); i+=size) {
-            ret.add(data.subList(i, Math.min(data.size(), i+size)));
-        }
-
-        return ret;
-    }
-}
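For reference, the removed helper split the index range [0, data.size()) into windows of at most size elements. A stand-alone sketch of the same arithmetic, with an invented total of 5 keywords and a chunk size of 2:

    int size = 2, total = 5;
    for (int i = 0; i < total; i += size) {
        // prints [0, 2) then [2, 4) then [4, 5)
        System.out.println("[" + i + ", " + Math.min(total, i + size) + ")");
    }

The same batching now happens inline in LoaderIndexJournalWriter below, driven by DocumentKeywordsPointer rather than subList().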
LoaderIndexJournalWriter.java

@@ -9,7 +9,6 @@ import nu.marginalia.hash.MurmurHash3_128;
 import nu.marginalia.index.journal.model.IndexJournalEntryData;
 import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
 import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl;
-import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
 import nu.marginalia.index.journal.writer.IndexJournalWriter;
 import nu.marginalia.keyword.model.DocumentKeywords;
 import nu.marginalia.model.idx.DocumentMetadata;

@@ -20,7 +19,6 @@ import org.slf4j.LoggerFactory;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.sql.SQLException;
-import java.util.Arrays;
 
 import static nu.marginalia.index.journal.model.IndexJournalEntryData.MAX_LENGTH;
 

@@ -30,6 +28,10 @@ public class LoaderIndexJournalWriter {
     private final IndexJournalWriter indexWriter;
     private static final Logger logger = LoggerFactory.getLogger(LoaderIndexJournalWriter.class);
 
+    private final MurmurHash3_128 hasher = new MurmurHash3_128();
+    private final long[] buffer = new long[MAX_LENGTH * 2];
+
+
     @Inject
     public LoaderIndexJournalWriter(FileStorageService fileStorageService) throws IOException, SQLException {
         var indexArea = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING);

@@ -42,14 +44,13 @@ public class LoaderIndexJournalWriter {
         indexWriter = new IndexJournalWriterPagingImpl(indexArea.asPath());
     }
 
-    MurmurHash3_128 hasher = new MurmurHash3_128();
-    long[] buffer = new long[MAX_LENGTH * 2];
     @SneakyThrows
     public void putWords(long combinedId,
                          int features,
                          DocumentMetadata metadata,
                          DocumentKeywords wordSet) {
-        if (wordSet.keywords().length == 0) {
+
+        if (wordSet.isEmpty()) {
             logger.info("Skipping zero-length word set for {}", combinedId);
             return;
         }

@@ -59,23 +60,24 @@ public class LoaderIndexJournalWriter {
             return;
         }
 
-        String[] words = wordSet.keywords();
-        long[] meta = wordSet.metadata();
+        var pointer = wordSet.newPointer();
 
-        for (int start = 0; start < words.length; ) {
-            int end = Math.min(start + MAX_LENGTH, words.length);
+        while (pointer.hasMore()) {
+            int i = 0;
 
-            for (int i = 0; i < end - start; i++) {
-                buffer[2*i] = hasher.hashNearlyASCII(words[start+i]);
-                buffer[2*i + 1] = meta[start+i];
+            while (i < buffer.length
+                && pointer.advancePointer())
+            {
+                final long hashedKeyword = hasher.hashNearlyASCII(pointer.getKeyword());
+
+                buffer[i++] = hashedKeyword;
+                buffer[i++] = pointer.getMetadata();
             }
 
-            var entry = new IndexJournalEntryData(2 * (end-start), buffer);
+            var entry = new IndexJournalEntryData(i, buffer);
             var header = new IndexJournalEntryHeader(combinedId, features, metadata.encode());
 
             indexWriter.put(header, entry);
-
-            start = end;
         }
 
    }
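The rewritten putWords pages a keyword set into journal entries without materializing sub-arrays: hasMore() drives the outer per-entry loop, and the inner loop advances the pointer until the buffer is full. A condensed sketch of the same pattern, with an invented 4-slot buffer and stand-ins (String.hashCode(), System.out.println) for hasher.hashNearlyASCII and indexWriter.put:

    var keywords = new DocumentKeywords(
            new String[] { "a", "b", "c" },
            new long[]   { 1L, 2L, 3L });

    long[] buffer = new long[4];   // MAX_LENGTH * 2 in the real code
    var pointer = keywords.newPointer();

    while (pointer.hasMore()) {            // one journal entry per iteration
        int i = 0;
        while (i < buffer.length && pointer.advancePointer()) {
            buffer[i++] = pointer.getKeyword().hashCode();  // stand-in for the murmur hash
            buffer[i++] = pointer.getMetadata();
        }
        System.out.println("entry with " + (i / 2) + " words");  // stand-in for indexWriter.put(...)
    }

Because i counts the longs actually written, a final partially-filled buffer still produces a correctly sized entry.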