diff --git a/code/common/config/java/nu/marginalia/LanguageModels.java b/code/common/config/java/nu/marginalia/LanguageModels.java index b2e5c21d..e3b51820 100644 --- a/code/common/config/java/nu/marginalia/LanguageModels.java +++ b/code/common/config/java/nu/marginalia/LanguageModels.java @@ -24,58 +24,4 @@ public class LanguageModels { this.fasttextLanguageModel = fasttextLanguageModel; this.segments = segments; } - - public static LanguageModelsBuilder builder() { - return new LanguageModelsBuilder(); - } - - public static class LanguageModelsBuilder { - private Path termFrequencies; - private Path openNLPSentenceDetectionData; - private Path posRules; - private Path posDict; - private Path fasttextLanguageModel; - private Path segments; - - LanguageModelsBuilder() { - } - - public LanguageModelsBuilder termFrequencies(Path termFrequencies) { - this.termFrequencies = termFrequencies; - return this; - } - - public LanguageModelsBuilder openNLPSentenceDetectionData(Path openNLPSentenceDetectionData) { - this.openNLPSentenceDetectionData = openNLPSentenceDetectionData; - return this; - } - - public LanguageModelsBuilder posRules(Path posRules) { - this.posRules = posRules; - return this; - } - - public LanguageModelsBuilder posDict(Path posDict) { - this.posDict = posDict; - return this; - } - - public LanguageModelsBuilder fasttextLanguageModel(Path fasttextLanguageModel) { - this.fasttextLanguageModel = fasttextLanguageModel; - return this; - } - - public LanguageModelsBuilder segments(Path segments) { - this.segments = segments; - return this; - } - - public LanguageModels build() { - return new LanguageModels(this.termFrequencies, this.openNLPSentenceDetectionData, this.posRules, this.posDict, this.fasttextLanguageModel, this.segments); - } - - public String toString() { - return "LanguageModels.LanguageModelsBuilder(termFrequencies=" + this.termFrequencies + ", openNLPSentenceDetectionData=" + this.openNLPSentenceDetectionData + ", posRules=" + this.posRules + 
", posDict=" + this.posDict + ", fasttextLanguageModel=" + this.fasttextLanguageModel + ", segments=" + this.segments + ")"; - } - } } diff --git a/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java index ae611b12..59409c50 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java @@ -35,6 +35,7 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.sql.SQLException; +import java.util.ArrayList; import java.util.Collection; import java.util.List; import java.util.Optional; @@ -202,13 +203,19 @@ public class ConverterMain extends ProcessMainClass { heartbeat.setProgress(processedDomains.get() / (double) totalDomains); logger.info("Processing small items"); - int numBigTasks = 0; + + // We separate the large and small domains to reduce the number of critical sections, + // as the large domains have a separate processing track that doesn't store everything + // in memory + + final List<Path> bigTasks = new ArrayList<>(); + // First process the small items for (var dataPath : WorkLog.iterableMap(crawlDir.getLogFile(), new CrawlDataLocator(crawlDir.getDir(), batchingWorkLog))) { if (SerializableCrawlDataStream.getSizeHint(dataPath) >= SIDELOAD_THRESHOLD) { - numBigTasks ++; + bigTasks.add(dataPath); continue; } @@ -239,15 +246,8 @@ public class ConverterMain extends ProcessMainClass { try (var hb = heartbeat.createAdHocTaskHeartbeat("Large Domains")) { int bigTaskIdx = 0; // Next the big items domain-by-domain - for (var dataPath : WorkLog.iterableMap(crawlDir.getLogFile(), - new CrawlDataLocator(crawlDir.getDir(), batchingWorkLog))) - { - int sizeHint = SerializableCrawlDataStream.getSizeHint(dataPath); - if (sizeHint < SIDELOAD_THRESHOLD) { - continue; - } - - 
hb.progress(dataPath.toFile().getName(), bigTaskIdx++, numBigTasks); + for (var dataPath : bigTasks) { + hb.progress(dataPath.toFile().getName(), bigTaskIdx++, bigTasks.size()); try { // SerializableCrawlDataStream is autocloseable, we can't try-with-resources because then it will be @@ -255,7 +255,7 @@ public class ConverterMain extends ProcessMainClass { // will close it after it's consumed. var stream = SerializableCrawlDataStream.openDataStream(dataPath); - ConverterBatchWritableIf writable = processor.simpleProcessing(stream, sizeHint); + ConverterBatchWritableIf writable = processor.simpleProcessing(stream, SerializableCrawlDataStream.getSizeHint(dataPath)); converterWriter.accept(writable); } diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/classifier/adblock/AdblockSimulator.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/classifier/adblock/AdblockSimulator.java index 74eecdd0..bba4f417 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/classifier/adblock/AdblockSimulator.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/classifier/adblock/AdblockSimulator.java @@ -116,7 +116,7 @@ public class AdblockSimulator { // Refrain from cleaning up this code, it's very hot code and needs to be fast. - // This version is about 100x faster than the a "clean" first stab implementation. + // This version is about 100x faster than a "clean" first stab implementation. 
class RuleVisitor implements NodeFilter { public boolean sawAds; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java index e6a87089..c67860d2 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentGeneratorExtractor.java @@ -23,7 +23,7 @@ public class DocumentGeneratorExtractor { var tags = doc.select("meta[name=generator]"); - if (tags.size() == 0) { + if (tags.isEmpty()) { // Some sites have a comment in the head instead of a meta tag return fingerprintServerTech(doc, responseHeaders); } diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java index bdab33c3..f296a526 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java @@ -127,7 +127,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC } fullHtml.append(""); - var doc = sideloaderProcessing + return sideloaderProcessing .processDocument(fullUrl, fullHtml.toString(), List.of("encyclopedia", "wiki"), @@ -137,8 +137,6 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC anchorTextKeywords.getAnchorTextKeywords(domainLinks, new EdgeUrl(fullUrl)), LocalDate.now().getYear(), 10_000_000); - - return doc; } private String normalizeUtf8(String url) {