(ngram) Correct size value in ngram lexicon generation, trim the terms better

This commit is contained in:
Viktor Lofgren 2024-04-13 17:51:02 +02:00
parent f064992137
commit 1329d4abd8
2 changed files with 31 additions and 10 deletions

View File

@ -1,7 +1,6 @@
package nu.marginalia.segmentation;
import it.unimi.dsi.fastutil.longs.*;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.util.SimpleBlockingThreadPool;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
@ -16,8 +15,6 @@ import java.util.concurrent.TimeUnit;
public class NgramExtractorMain {
/**
 * Command-line entry point. Invokes {@code dumpCounts} with two fixed,
 * hard-coded paths; the {@code args} array is not consulted.
 */
public static void main(String... args) throws IOException, InterruptedException {
    final Path zimFile = Path.of("/home/vlofgren/Exports/wikipedia_en_all_nopic_2024-02.zim");
    final Path countsFile = Path.of("/tmp/ngram-counts.bin");

    dumpCounts(zimFile, countsFile);
}
private static List<String> getNgramTitleTerms(String title) {
@ -64,7 +61,6 @@ public class NgramExtractorMain {
private static List<String> cleanTerms(List<String> terms) {
// Trim the discovered terms
terms.replaceAll(s -> {
// Remove trailing parentheses and their contents
if (s.endsWith(")")) {
int idx = s.lastIndexOf('(');
@ -73,6 +69,10 @@ public class NgramExtractorMain {
}
}
return s;
});
terms.replaceAll(s -> {
// Remove leading "list of "
if (s.startsWith("list of ")) {
return s.substring("list of ".length());
@ -81,6 +81,15 @@ public class NgramExtractorMain {
return s;
});
terms.replaceAll(s -> {
// Remove trailing punctuation
if (s.endsWith(".") || s.endsWith(",") || s.endsWith(":") || s.endsWith(";")) {
return s.substring(0, s.length() - 1);
}
return s;
});
// Remove terms that are too short or too long
terms.removeIf(s -> {
if (!s.contains(" "))

View File

@ -21,6 +21,7 @@ import java.util.List;
public class NgramLexicon {
private final Long2IntOpenCustomHashMap counts;
private int size;
private static final HasherGroup orderedHasher = HasherGroup.ordered();
@Inject
@ -31,9 +32,15 @@ public class NgramLexicon {
(int) size,
new KeyIsAlreadyHashStrategy()
);
counts.defaultReturnValue(0);
for (int i = 0; i < size; i++) {
counts.put(dis.readLong(), dis.readInt());
try {
for (int i = 0; i < size; i++) {
counts.put(dis.readLong(), dis.readInt());
}
}
catch (IOException ex) {
ex.printStackTrace();
}
} catch (IOException e) {
throw new RuntimeException(e);
@ -137,8 +144,12 @@ public class NgramLexicon {
/**
 * Registers one title occurrence of the n-gram identified by
 * {@code hashOrdered}.
 *
 * <p>The stored count for a key is read from {@code counts}; a key that is
 * absent reads as 0. A non-positive stored value means the key has not yet
 * been counted as a title term (negative values appear to be tallies kept by
 * the non-title increment path -- confirm against that method). On the first
 * title occurrence the value is flipped to its magnitude and {@code size} is
 * incremented, so that {@code size} tracks the number of keys holding a
 * positive count. The count is then incremented exactly once and stored back.
 *
 * @param hashOrdered ordered hash of the n-gram, used directly as the map key
 */
public void incOrderedTitle(long hashOrdered) {
    int value = counts.get(hashOrdered);

    // First title sighting of this key: it becomes a positive entry, so it
    // must be counted in size, and any negative tally is carried over as
    // its magnitude.
    if (value <= 0) {
        size++;
        value = -value;
    }

    // Exactly one increment per call; the superseded
    // "if (value < 0) ... else value++" step is intentionally gone, since
    // keeping it double-counted and prevented size from ever advancing.
    value++;

    counts.put(hashOrdered, value);
}
@ -147,7 +158,7 @@ public class NgramLexicon {
int value = counts.get(hashOrdered);
if (value <= 0) value --;
else value ++;
else value++;
counts.put(hashOrdered, value);
}
@ -157,7 +168,8 @@ public class NgramLexicon {
StandardOpenOption.CREATE,
StandardOpenOption.TRUNCATE_EXISTING,
StandardOpenOption.WRITE))) {
dos.writeInt(counts.size());
dos.writeInt(size);
counts.forEach((k, v) -> {
try {