(ngram) Correct size value in ngram lexicon generation, trim the terms better

This commit is contained in:
Viktor Lofgren 2024-04-13 17:51:02 +02:00
parent f064992137
commit 1329d4abd8
2 changed files with 31 additions and 10 deletions

View File

@ -1,7 +1,6 @@
package nu.marginalia.segmentation; package nu.marginalia.segmentation;
import it.unimi.dsi.fastutil.longs.*; import it.unimi.dsi.fastutil.longs.*;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.util.SimpleBlockingThreadPool; import nu.marginalia.util.SimpleBlockingThreadPool;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
@ -16,8 +15,6 @@ import java.util.concurrent.TimeUnit;
public class NgramExtractorMain { public class NgramExtractorMain {
public static void main(String... args) throws IOException, InterruptedException { public static void main(String... args) throws IOException, InterruptedException {
dumpCounts(Path.of("/home/vlofgren/Exports/wikipedia_en_all_nopic_2024-02.zim"),
Path.of("/tmp/ngram-counts.bin"));
} }
private static List<String> getNgramTitleTerms(String title) { private static List<String> getNgramTitleTerms(String title) {
@ -64,7 +61,6 @@ public class NgramExtractorMain {
private static List<String> cleanTerms(List<String> terms) { private static List<String> cleanTerms(List<String> terms) {
// Trim the discovered terms // Trim the discovered terms
terms.replaceAll(s -> { terms.replaceAll(s -> {
// Remove trailing parentheses and their contents // Remove trailing parentheses and their contents
if (s.endsWith(")")) { if (s.endsWith(")")) {
int idx = s.lastIndexOf('('); int idx = s.lastIndexOf('(');
@ -73,6 +69,10 @@ public class NgramExtractorMain {
} }
} }
return s;
});
terms.replaceAll(s -> {
// Remove leading "list of " // Remove leading "list of "
if (s.startsWith("list of ")) { if (s.startsWith("list of ")) {
return s.substring("list of ".length()); return s.substring("list of ".length());
@ -81,6 +81,15 @@ public class NgramExtractorMain {
return s; return s;
}); });
terms.replaceAll(s -> {
// Remove trailing punctuation
if (s.endsWith(".") || s.endsWith(",") || s.endsWith(":") || s.endsWith(";")) {
return s.substring(0, s.length() - 1);
}
return s;
});
// Remove terms that are too short or too long // Remove terms that are too short or too long
terms.removeIf(s -> { terms.removeIf(s -> {
if (!s.contains(" ")) if (!s.contains(" "))

View File

@ -21,6 +21,7 @@ import java.util.List;
public class NgramLexicon { public class NgramLexicon {
private final Long2IntOpenCustomHashMap counts; private final Long2IntOpenCustomHashMap counts;
private int size;
private static final HasherGroup orderedHasher = HasherGroup.ordered(); private static final HasherGroup orderedHasher = HasherGroup.ordered();
@Inject @Inject
@ -31,9 +32,15 @@ public class NgramLexicon {
(int) size, (int) size,
new KeyIsAlreadyHashStrategy() new KeyIsAlreadyHashStrategy()
); );
counts.defaultReturnValue(0);
for (int i = 0; i < size; i++) { try {
counts.put(dis.readLong(), dis.readInt()); for (int i = 0; i < size; i++) {
counts.put(dis.readLong(), dis.readInt());
}
}
catch (IOException ex) {
ex.printStackTrace();
} }
} catch (IOException e) { } catch (IOException e) {
throw new RuntimeException(e); throw new RuntimeException(e);
@ -137,8 +144,12 @@ public class NgramLexicon {
public void incOrderedTitle(long hashOrdered) { public void incOrderedTitle(long hashOrdered) {
int value = counts.get(hashOrdered); int value = counts.get(hashOrdered);
if (value < 0) value = -value + 1; if (value <= 0) {
else value ++; size ++;
value = -value;
}
value ++;
counts.put(hashOrdered, value); counts.put(hashOrdered, value);
} }
@ -147,7 +158,7 @@ public class NgramLexicon {
int value = counts.get(hashOrdered); int value = counts.get(hashOrdered);
if (value <= 0) value --; if (value <= 0) value --;
else value ++; else value++;
counts.put(hashOrdered, value); counts.put(hashOrdered, value);
} }
@ -157,7 +168,8 @@ public class NgramLexicon {
StandardOpenOption.CREATE, StandardOpenOption.CREATE,
StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.TRUNCATE_EXISTING,
StandardOpenOption.WRITE))) { StandardOpenOption.WRITE))) {
dos.writeInt(counts.size());
dos.writeInt(size);
counts.forEach((k, v) -> { counts.forEach((k, v) -> {
try { try {