(ngram) Correct the size value written during ngram lexicon generation; also trim trailing punctuation from extracted terms

2025-02-24 05:18:58 +00:00 · 2024-04-13 17:51:02 +02:00 · 2024-04-13 17:51:02 +02:00 · 1329d4abd8
commit 1329d4abd8
parent f064992137
2 changed files with 31 additions and 10 deletions
--- a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java
+++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java
@ -1,7 +1,6 @@
 package nu.marginalia.segmentation;

 import it.unimi.dsi.fastutil.longs.*;
-import nu.marginalia.hash.MurmurHash3_128;
 import nu.marginalia.util.SimpleBlockingThreadPool;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
@ -16,8 +15,6 @@ import java.util.concurrent.TimeUnit;

 public class NgramExtractorMain {
    public static void main(String... args) throws IOException, InterruptedException {
-        dumpCounts(Path.of("/home/vlofgren/Exports/wikipedia_en_all_nopic_2024-02.zim"),
-                Path.of("/tmp/ngram-counts.bin"));
    }

    private static List<String> getNgramTitleTerms(String title) {
@ -64,7 +61,6 @@ public class NgramExtractorMain {
    private static List<String> cleanTerms(List<String> terms) {
        // Trim the discovered terms
        terms.replaceAll(s -> {
-
            // Remove trailing parentheses and their contents
            if (s.endsWith(")")) {
                int idx = s.lastIndexOf('(');
@ -73,6 +69,10 @@ public class NgramExtractorMain {
                }
            }

+            return s;
+        });
+
+        terms.replaceAll(s -> {
            // Remove leading "list of "
            if (s.startsWith("list of ")) {
                return s.substring("list of ".length());
@ -81,6 +81,15 @@ public class NgramExtractorMain {
            return s;
        });

+        terms.replaceAll(s -> {
+            // Remove trailing punctuation
+            if (s.endsWith(".") || s.endsWith(",") || s.endsWith(":") || s.endsWith(";")) {
+                return s.substring(0, s.length() - 1);
+            }
+
+            return s;
+        });
+
        // Remove terms that are too short or too long
        terms.removeIf(s -> {
            if (!s.contains(" "))
--- a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java
+++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java
@ -21,6 +21,7 @@ import java.util.List;
 public class NgramLexicon {
    private final Long2IntOpenCustomHashMap counts;

+    private int size;
    private static final HasherGroup orderedHasher = HasherGroup.ordered();

    @Inject
@ -31,10 +32,16 @@ public class NgramLexicon {
                    (int) size,
                    new KeyIsAlreadyHashStrategy()
            );
+            counts.defaultReturnValue(0);

+            try {
                for (int i = 0; i < size; i++) {
                    counts.put(dis.readLong(), dis.readInt());
                }
+            }
+            catch (IOException ex) {
+                ex.printStackTrace();
+            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
@ -137,8 +144,12 @@ public class NgramLexicon {
    public void incOrderedTitle(long hashOrdered) {
        int value = counts.get(hashOrdered);

-        if (value < 0) value = -value + 1;
-        else value ++;
+        if (value <= 0) {
+            size ++;
+            value = -value;
+        }
+
+        value ++;

        counts.put(hashOrdered, value);
    }
@ -147,7 +158,7 @@ public class NgramLexicon {
        int value = counts.get(hashOrdered);

        if (value <= 0) value --;
-        else value ++;
+        else value++;

        counts.put(hashOrdered, value);
    }
@ -157,7 +168,8 @@ public class NgramLexicon {
                StandardOpenOption.CREATE,
                StandardOpenOption.TRUNCATE_EXISTING,
                StandardOpenOption.WRITE))) {
-            dos.writeInt(counts.size());
+
+            dos.writeInt(size);

            counts.forEach((k, v) -> {
                try {