mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(ngram) Use simple blocking pool instead of FJP; split on underscores in article names.
This commit is contained in:
parent
8a81a480a1
commit
f064992137
@ -23,6 +23,7 @@ dependencies {
|
||||
implementation project(':code:common:config')
|
||||
implementation project(':code:libraries:easy-lsh')
|
||||
implementation project(':code:libraries:array')
|
||||
implementation project(':code:libraries:blocking-thread-pool')
|
||||
|
||||
implementation libs.bundles.slf4j
|
||||
implementation libs.notnull
|
||||
|
@ -2,6 +2,7 @@ package nu.marginalia.segmentation;
|
||||
|
||||
import it.unimi.dsi.fastutil.longs.*;
|
||||
import nu.marginalia.hash.MurmurHash3_128;
|
||||
import nu.marginalia.util.SimpleBlockingThreadPool;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.jsoup.nodes.Document;
|
||||
import org.openzim.ZIMTypes.ZIMFile;
|
||||
@ -11,12 +12,12 @@ import java.io.IOException;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
public class NgramExtractorMain {
|
||||
static MurmurHash3_128 hash = new MurmurHash3_128();
|
||||
|
||||
public static void main(String... args) {
|
||||
public static void main(String... args) throws IOException, InterruptedException {
|
||||
dumpCounts(Path.of("/home/vlofgren/Exports/wikipedia_en_all_nopic_2024-02.zim"),
|
||||
Path.of("/tmp/ngram-counts.bin"));
|
||||
}
|
||||
|
||||
private static List<String> getNgramTitleTerms(String title) {
|
||||
@ -102,21 +103,25 @@ public class NgramExtractorMain {
|
||||
|
||||
var orderedHasher = HasherGroup.ordered();
|
||||
|
||||
try (var executor = Executors.newWorkStealingPool()) {
|
||||
var pool = new SimpleBlockingThreadPool("ngram-extractor",
|
||||
Math.clamp(2, Runtime.getRuntime().availableProcessors(), 32),
|
||||
32
|
||||
);
|
||||
|
||||
reader.forEachArticles((title, body) -> {
|
||||
executor.submit(() -> {
|
||||
pool.submitQuietly(() -> {
|
||||
LongArrayList orderedHashesTitle = new LongArrayList();
|
||||
LongArrayList orderedHashesBody = new LongArrayList();
|
||||
|
||||
for (var sent : getNgramTitleTerms(title)) {
|
||||
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
|
||||
String normalizedTitle = title.replace('_', ' ');
|
||||
|
||||
for (var sent : getNgramTitleTerms(normalizedTitle)) {
|
||||
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
|
||||
orderedHashesTitle.add(orderedHasher.rollingHash(terms));
|
||||
}
|
||||
|
||||
for (var sent : getNgramBodyTerms(Jsoup.parse(body))) {
|
||||
String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
|
||||
|
||||
orderedHashesBody.add(orderedHasher.rollingHash(terms));
|
||||
}
|
||||
|
||||
@ -131,7 +136,9 @@ public class NgramExtractorMain {
|
||||
});
|
||||
|
||||
}, p -> true);
|
||||
}
|
||||
|
||||
pool.shutDown();
|
||||
pool.awaitTermination(10, TimeUnit.DAYS);
|
||||
|
||||
lexicon.saveCounts(countsOutputFile);
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user