Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-24 05:18:58 +00:00)
(ngram) Use simple blocking pool instead of FJP; split on underscores in article names.
parent 8a81a480a1
commit f064992137
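Why swap the pool: Executors.newWorkStealingPool() returns a ForkJoinPool, and its submit() never blocks, so the reader thread driving forEachArticles can outrun the workers and queue an unbounded backlog of parsed articles. A pool with a bounded task queue whose submit blocks when the queue is full applies backpressure to the producer instead. The sketch below illustrates that blocking-submit idea; it borrows the method names seen in the diff (submitQuietly, shutDown, awaitTermination) but is an illustration only, not the actual SimpleBlockingThreadPool implementation.

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.TimeUnit;

// Sketch only: a fixed set of worker threads drains a bounded queue,
// and submission blocks the producer while the queue is full.  That is
// the backpressure property the commit relies on.
class BlockingPoolSketch {
    private final BlockingQueue<Runnable> queue;
    private final List<Thread> workers = new ArrayList<>();
    private volatile boolean stopped = false;

    BlockingPoolSketch(String name, int threads, int queueSize) {
        queue = new ArrayBlockingQueue<>(queueSize);
        for (int i = 0; i < threads; i++) {
            Thread worker = new Thread(this::runWorker, name + "-" + i);
            worker.start();
            workers.add(worker);
        }
    }

    // Unlike ExecutorService.submit(), this blocks until queue space frees up.
    void submitQuietly(Runnable task) {
        try {
            queue.put(task);
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }

    void shutDown() {
        stopped = true;
    }

    boolean awaitTermination(long timeout, TimeUnit unit) throws InterruptedException {
        for (Thread worker : workers) {
            unit.timedJoin(worker, timeout); // simplified; a real pool would track one shared deadline
        }
        return workers.stream().noneMatch(Thread::isAlive);
    }

    private void runWorker() {
        try {
            // Keep draining until shut down AND the queue is empty.
            while (!stopped || !queue.isEmpty()) {
                Runnable task = queue.poll(100, TimeUnit.MILLISECONDS);
                if (task != null) {
                    task.run();
                }
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    }
}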
@@ -23,6 +23,7 @@ dependencies {
     implementation project(':code:common:config')
     implementation project(':code:libraries:easy-lsh')
     implementation project(':code:libraries:array')
+    implementation project(':code:libraries:blocking-thread-pool')
 
     implementation libs.bundles.slf4j
     implementation libs.notnull
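(The blocking-thread-pool library added above presumably hosts the SimpleBlockingThreadPool class that NgramExtractorMain imports in the hunks that follow.)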
@@ -2,6 +2,7 @@ package nu.marginalia.segmentation;
 
 import it.unimi.dsi.fastutil.longs.*;
 import nu.marginalia.hash.MurmurHash3_128;
+import nu.marginalia.util.SimpleBlockingThreadPool;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.openzim.ZIMTypes.ZIMFile;
@@ -11,12 +12,12 @@ import java.io.IOException;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
-import java.util.concurrent.Executors;
+import java.util.concurrent.TimeUnit;
 
 public class NgramExtractorMain {
     static MurmurHash3_128 hash = new MurmurHash3_128();
 
-    public static void main(String... args) {
+    public static void main(String... args) throws IOException, InterruptedException {
         dumpCounts(Path.of("/home/vlofgren/Exports/wikipedia_en_all_nopic_2024-02.zim"),
                 Path.of("/tmp/ngram-counts.bin"));
     }
 
     private static List<String> getNgramTitleTerms(String title) {
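With the work-stealing executor gone, the Executors import is unused; TimeUnit is needed for the pool.awaitTermination(10, TimeUnit.DAYS) call in the next hunk, whose blocking wait is also why main now declares throws InterruptedException.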
@@ -102,36 +103,42 @@ public class NgramExtractorMain {
 
         var orderedHasher = HasherGroup.ordered();
 
-        try (var executor = Executors.newWorkStealingPool()) {
-            reader.forEachArticles((title, body) -> {
-                executor.submit(() -> {
-                    LongArrayList orderedHashesTitle = new LongArrayList();
-                    LongArrayList orderedHashesBody = new LongArrayList();
-
-                    for (var sent : getNgramTitleTerms(title)) {
-                        String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
-                        orderedHashesTitle.add(orderedHasher.rollingHash(terms));
-                    }
-
-                    for (var sent : getNgramBodyTerms(Jsoup.parse(body))) {
-                        String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
-                        orderedHashesBody.add(orderedHasher.rollingHash(terms));
-                    }
-
-                    synchronized (lexicon) {
-                        for (var hash : orderedHashesTitle) {
-                            lexicon.incOrderedTitle(hash);
-                        }
-                        for (var hash : orderedHashesBody) {
-                            lexicon.incOrderedBody(hash);
-                        }
-                    }
-                });
-            }, p -> true);
-        }
+        var pool = new SimpleBlockingThreadPool("ngram-extractor",
+                Math.clamp(2, Runtime.getRuntime().availableProcessors(), 32),
+                32
+        );
+
+        reader.forEachArticles((title, body) -> {
+            pool.submitQuietly(() -> {
+                LongArrayList orderedHashesTitle = new LongArrayList();
+                LongArrayList orderedHashesBody = new LongArrayList();
+
+                String normalizedTitle = title.replace('_', ' ');
+
+                for (var sent : getNgramTitleTerms(normalizedTitle)) {
+                    String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
+                    orderedHashesTitle.add(orderedHasher.rollingHash(terms));
+                }
+
+                for (var sent : getNgramBodyTerms(Jsoup.parse(body))) {
+                    String[] terms = BasicSentenceExtractor.getStemmedParts(sent);
+                    orderedHashesBody.add(orderedHasher.rollingHash(terms));
+                }
+
+                synchronized (lexicon) {
+                    for (var hash : orderedHashesTitle) {
+                        lexicon.incOrderedTitle(hash);
+                    }
+                    for (var hash : orderedHashesBody) {
+                        lexicon.incOrderedBody(hash);
+                    }
+                }
+            });
+        }, p -> true);
+
+        pool.shutDown();
+        pool.awaitTermination(10, TimeUnit.DAYS);
 
         lexicon.saveCounts(countsOutputFile);
     }
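The underscore half of the commit is the normalizedTitle line above: ZIM article names join words with underscores, so without normalization a multi-word title would reach the sentence extractor as a single token and yield no useful ngrams. A small illustration with a hypothetical title (assuming getNgramTitleTerms splits on whitespace):

public class TitleNormalizationExample {
    public static void main(String[] args) {
        // Hypothetical ZIM-style article name: words joined by underscores.
        String title = "Alan_Turing_Memorial";

        // Replacing underscores restores the word boundaries that
        // sentence splitting and ngram extraction rely on.
        String normalizedTitle = title.replace('_', ' ');

        System.out.println(title);           // Alan_Turing_Memorial
        System.out.println(normalizedTitle); // Alan Turing Memorial
    }
}

The pool.shutDown() / pool.awaitTermination(10, TimeUnit.DAYS) pair at the end is the usual drain pattern: stop accepting new tasks, then wait (here with an effectively unbounded timeout) for the queued articles to finish before the counts are saved.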