(ngram) Use simple blocking pool instead of FJP; split on underscores in article names.

This commit is contained in:
Viktor Lofgren 2024-04-13 17:07:23 +02:00
parent 8a81a480a1
commit f064992137
2 changed files with 36 additions and 28 deletions

View File

@ -23,6 +23,7 @@ dependencies {
implementation project(':code:common:config') implementation project(':code:common:config')
implementation project(':code:libraries:easy-lsh') implementation project(':code:libraries:easy-lsh')
implementation project(':code:libraries:array') implementation project(':code:libraries:array')
implementation project(':code:libraries:blocking-thread-pool')
implementation libs.bundles.slf4j implementation libs.bundles.slf4j
implementation libs.notnull implementation libs.notnull

View File

@ -2,6 +2,7 @@ package nu.marginalia.segmentation;
import it.unimi.dsi.fastutil.longs.*; import it.unimi.dsi.fastutil.longs.*;
import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.util.SimpleBlockingThreadPool;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.jsoup.nodes.Document; import org.jsoup.nodes.Document;
import org.openzim.ZIMTypes.ZIMFile; import org.openzim.ZIMTypes.ZIMFile;
@ -11,12 +12,12 @@ import java.io.IOException;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit;
public class NgramExtractorMain { public class NgramExtractorMain {
static MurmurHash3_128 hash = new MurmurHash3_128(); public static void main(String... args) throws IOException, InterruptedException {
dumpCounts(Path.of("/home/vlofgren/Exports/wikipedia_en_all_nopic_2024-02.zim"),
public static void main(String... args) { Path.of("/tmp/ngram-counts.bin"));
} }
private static List<String> getNgramTitleTerms(String title) { private static List<String> getNgramTitleTerms(String title) {
@ -102,21 +103,25 @@ public class NgramExtractorMain {
var orderedHasher = HasherGroup.ordered(); var orderedHasher = HasherGroup.ordered();
try (var executor = Executors.newWorkStealingPool()) { var pool = new SimpleBlockingThreadPool("ngram-extractor",
Math.clamp(2, Runtime.getRuntime().availableProcessors(), 32),
32
);
reader.forEachArticles((title, body) -> { reader.forEachArticles((title, body) -> {
executor.submit(() -> { pool.submitQuietly(() -> {
LongArrayList orderedHashesTitle = new LongArrayList(); LongArrayList orderedHashesTitle = new LongArrayList();
LongArrayList orderedHashesBody = new LongArrayList(); LongArrayList orderedHashesBody = new LongArrayList();
for (var sent : getNgramTitleTerms(title)) { String normalizedTitle = title.replace('_', ' ');
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
for (var sent : getNgramTitleTerms(normalizedTitle)) {
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
orderedHashesTitle.add(orderedHasher.rollingHash(terms)); orderedHashesTitle.add(orderedHasher.rollingHash(terms));
} }
for (var sent : getNgramBodyTerms(Jsoup.parse(body))) { for (var sent : getNgramBodyTerms(Jsoup.parse(body))) {
String[] terms = BasicSentenceExtractor.getStemmedParts(sent); String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
orderedHashesBody.add(orderedHasher.rollingHash(terms)); orderedHashesBody.add(orderedHasher.rollingHash(terms));
} }
@ -131,7 +136,9 @@ public class NgramExtractorMain {
}); });
}, p -> true); }, p -> true);
}
pool.shutDown();
pool.awaitTermination(10, TimeUnit.DAYS);
lexicon.saveCounts(countsOutputFile); lexicon.saveCounts(countsOutputFile);
} }