(ngram) Use simple blocking pool instead of FJP; split on underscores in article names.

This commit is contained in:
Viktor Lofgren 2024-04-13 17:07:23 +02:00
parent 8a81a480a1
commit f064992137
2 changed files with 36 additions and 28 deletions

View File

@@ -23,6 +23,7 @@ dependencies {
     implementation project(':code:common:config')
     implementation project(':code:libraries:easy-lsh')
     implementation project(':code:libraries:array')
+    implementation project(':code:libraries:blocking-thread-pool')
     implementation libs.bundles.slf4j
     implementation libs.notnull

View File

@@ -2,6 +2,7 @@ package nu.marginalia.segmentation;
 import it.unimi.dsi.fastutil.longs.*;
 import nu.marginalia.hash.MurmurHash3_128;
+import nu.marginalia.util.SimpleBlockingThreadPool;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.openzim.ZIMTypes.ZIMFile;
@@ -11,12 +12,12 @@ import java.io.IOException;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
-import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;

 public class NgramExtractorMain {
     static MurmurHash3_128 hash = new MurmurHash3_128();

-    public static void main(String... args) {
+    public static void main(String... args) throws IOException, InterruptedException {
+        dumpCounts(Path.of("/home/vlofgren/Exports/wikipedia_en_all_nopic_2024-02.zim"),
+                Path.of("/tmp/ngram-counts.bin"));
     }

     private static List<String> getNgramTitleTerms(String title) {
@@ -102,36 +103,42 @@ public class NgramExtractorMain {
         var orderedHasher = HasherGroup.ordered();

-        try (var executor = Executors.newWorkStealingPool()) {
-            reader.forEachArticles((title, body) -> {
-                executor.submit(() -> {
-                    LongArrayList orderedHashesTitle = new LongArrayList();
-                    LongArrayList orderedHashesBody = new LongArrayList();
-
-                    for (var sent : getNgramTitleTerms(title)) {
-                        String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
-                        orderedHashesTitle.add(orderedHasher.rollingHash(terms));
-                    }
-
-                    for (var sent : getNgramBodyTerms(Jsoup.parse(body))) {
-                        String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
-                        orderedHashesBody.add(orderedHasher.rollingHash(terms));
-                    }
-
-                    synchronized (lexicon) {
-                        for (var hash : orderedHashesTitle) {
-                            lexicon.incOrderedTitle(hash);
-                        }
-                        for (var hash : orderedHashesBody) {
-                            lexicon.incOrderedBody(hash);
-                        }
-                    }
-                });
-            }, p -> true);
-        }
+        var pool = new SimpleBlockingThreadPool("ngram-extractor",
+                Math.clamp(2, Runtime.getRuntime().availableProcessors(), 32),
+                32
+        );
+
+        reader.forEachArticles((title, body) -> {
+            pool.submitQuietly(() -> {
+                LongArrayList orderedHashesTitle = new LongArrayList();
+                LongArrayList orderedHashesBody = new LongArrayList();
+
+                String normalizedTitle = title.replace('_', ' ');
+
+                for (var sent : getNgramTitleTerms(normalizedTitle)) {
+                    String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
+                    orderedHashesTitle.add(orderedHasher.rollingHash(terms));
+                }
+
+                for (var sent : getNgramBodyTerms(Jsoup.parse(body))) {
+                    String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
+                    orderedHashesBody.add(orderedHasher.rollingHash(terms));
+                }
+
+                synchronized (lexicon) {
+                    for (var hash : orderedHashesTitle) {
+                        lexicon.incOrderedTitle(hash);
+                    }
+                    for (var hash : orderedHashesBody) {
+                        lexicon.incOrderedBody(hash);
+                    }
+                }
+            });
+        }, p -> true);
+
+        pool.shutDown();
+        pool.awaitTermination(10, TimeUnit.DAYS);

         lexicon.saveCounts(countsOutputFile);