(term-freq-exporter) Reduce thread count and memory usage

This commit is contained in:
Viktor Lofgren 2024-04-10 17:11:23 +02:00
parent 491d6bec46
commit 6bfe04b609
2 changed files with 13 additions and 15 deletions

View File

@@ -21,6 +21,7 @@ dependencies {
implementation project(':code:common:model')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:libraries:blocking-thread-pool')
implementation project(':code:features-crawl:link-parser')
implementation project(':code:features-convert:anchor-keywords')
implementation project(':code:process-models:crawling-model')

View File

@@ -14,6 +14,7 @@ import nu.marginalia.process.log.WorkLog;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorage;
import nu.marginalia.storage.model.FileStorageId;
import nu.marginalia.util.SimpleBlockingThreadPool;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.slf4j.Logger;
@@ -53,26 +54,22 @@ public class TermFrequencyExporter implements ExporterIf {
TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1);
AtomicInteger docCount = new AtomicInteger();
try (ForkJoinPool fjp = new ForkJoinPool(Math.max(2, Runtime.getRuntime().availableProcessors() / 2))) {
SimpleBlockingThreadPool sjp = new SimpleBlockingThreadPool("exporter", Math.clamp(2, 16, Runtime.getRuntime().availableProcessors() / 2), 4);
Path crawlerLogFile = inputDir.resolve("crawler.log");
for (var item : WorkLog.iterable(crawlerLogFile)) {
if (Thread.interrupted()) {
fjp.shutdownNow();
sjp.shutDownNow();
throw new InterruptedException();
}
Path crawlDataPath = inputDir.resolve(item.relPath());
fjp.execute(() -> processFile(crawlDataPath, counts, docCount, se.get()));
sjp.submitQuietly(() -> processFile(crawlDataPath, counts, docCount, se.get()));
}
while (!fjp.isQuiescent()) {
if (fjp.awaitQuiescence(10, TimeUnit.SECONDS))
break;
}
}
sjp.shutDown();
sjp.awaitTermination(10, TimeUnit.DAYS);
var tmpFile = Files.createTempFile(destStorage.asPath(), "freqs", ".dat.tmp",
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));