(misc) Tidying

This commit is contained in:
Viktor Lofgren 2025-01-29 15:17:04 +01:00
parent 32c6dd9e6a
commit 5fbc8ef998
5 changed files with 15 additions and 71 deletions

View File

@ -24,58 +24,4 @@ public class LanguageModels {
this.fasttextLanguageModel = fasttextLanguageModel; this.fasttextLanguageModel = fasttextLanguageModel;
this.segments = segments; this.segments = segments;
} }
/**
 * Creates an empty builder for assembling a {@link LanguageModels} instance.
 *
 * @return a new {@code LanguageModelsBuilder} on which no paths have been set
 */
public static LanguageModelsBuilder builder() {
    final LanguageModelsBuilder emptyBuilder = new LanguageModelsBuilder();
    return emptyBuilder;
}
/**
 * Fluent builder for {@link LanguageModels}.
 *
 * <p>Each setter stores one model file path and returns this builder so calls
 * can be chained. Any path that is never set stays {@code null} and is passed
 * to the {@code LanguageModels} constructor as such by {@link #build()}.
 */
public static class LanguageModelsBuilder {
    private Path termFrequencies;
    private Path openNLPSentenceDetectionData;
    private Path posRules;
    private Path posDict;
    private Path fasttextLanguageModel;
    private Path segments;

    /** Package-private: instances are handed out via {@code LanguageModels.builder()}. */
    LanguageModelsBuilder() {
    }

    /**
     * @param termFrequencies path to store for the term frequencies model
     * @return this builder
     */
    public LanguageModelsBuilder termFrequencies(Path termFrequencies) {
        this.termFrequencies = termFrequencies;
        return this;
    }

    /**
     * @param openNLPSentenceDetectionData path to store for the OpenNLP sentence detection data
     * @return this builder
     */
    public LanguageModelsBuilder openNLPSentenceDetectionData(Path openNLPSentenceDetectionData) {
        this.openNLPSentenceDetectionData = openNLPSentenceDetectionData;
        return this;
    }

    /**
     * @param posRules path to store for the POS rules
     * @return this builder
     */
    public LanguageModelsBuilder posRules(Path posRules) {
        this.posRules = posRules;
        return this;
    }

    /**
     * @param posDict path to store for the POS dictionary
     * @return this builder
     */
    public LanguageModelsBuilder posDict(Path posDict) {
        this.posDict = posDict;
        return this;
    }

    /**
     * @param fasttextLanguageModel path to store for the fasttext language model
     * @return this builder
     */
    public LanguageModelsBuilder fasttextLanguageModel(Path fasttextLanguageModel) {
        this.fasttextLanguageModel = fasttextLanguageModel;
        return this;
    }

    /**
     * @param segments path to store for the segments data
     * @return this builder
     */
    public LanguageModelsBuilder segments(Path segments) {
        this.segments = segments;
        return this;
    }

    /**
     * Constructs a {@link LanguageModels} from the paths collected so far.
     *
     * @return a new {@code LanguageModels}; arguments are passed in the order
     *         termFrequencies, openNLPSentenceDetectionData, posRules, posDict,
     *         fasttextLanguageModel, segments
     */
    public LanguageModels build() {
        return new LanguageModels(this.termFrequencies, this.openNLPSentenceDetectionData, this.posRules, this.posDict, this.fasttextLanguageModel, this.segments);
    }

    /**
     * Renders the builder's current state, listing every path field by name.
     */
    @Override
    public String toString() {
        return "LanguageModels.LanguageModelsBuilder(termFrequencies=" + this.termFrequencies + ", openNLPSentenceDetectionData=" + this.openNLPSentenceDetectionData + ", posRules=" + this.posRules + ", posDict=" + this.posDict + ", fasttextLanguageModel=" + this.fasttextLanguageModel + ", segments=" + this.segments + ")";
    }
}
} }

View File

@ -35,6 +35,7 @@ import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.sql.SQLException; import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.List; import java.util.List;
import java.util.Optional; import java.util.Optional;
@ -202,13 +203,19 @@ public class ConverterMain extends ProcessMainClass {
heartbeat.setProgress(processedDomains.get() / (double) totalDomains); heartbeat.setProgress(processedDomains.get() / (double) totalDomains);
logger.info("Processing small items"); logger.info("Processing small items");
int numBigTasks = 0;
// We separate the large and small domains to reduce the number of critical sections,
// as the large domains have a separate processing track that doesn't store everything
// in memory
final List<Path> bigTasks = new ArrayList<>();
// First process the small items // First process the small items
for (var dataPath : WorkLog.iterableMap(crawlDir.getLogFile(), for (var dataPath : WorkLog.iterableMap(crawlDir.getLogFile(),
new CrawlDataLocator(crawlDir.getDir(), batchingWorkLog))) new CrawlDataLocator(crawlDir.getDir(), batchingWorkLog)))
{ {
if (SerializableCrawlDataStream.getSizeHint(dataPath) >= SIDELOAD_THRESHOLD) { if (SerializableCrawlDataStream.getSizeHint(dataPath) >= SIDELOAD_THRESHOLD) {
numBigTasks ++; bigTasks.add(dataPath);
continue; continue;
} }
@ -239,15 +246,8 @@ public class ConverterMain extends ProcessMainClass {
try (var hb = heartbeat.createAdHocTaskHeartbeat("Large Domains")) { try (var hb = heartbeat.createAdHocTaskHeartbeat("Large Domains")) {
int bigTaskIdx = 0; int bigTaskIdx = 0;
// Next the big items domain-by-domain // Next the big items domain-by-domain
for (var dataPath : WorkLog.iterableMap(crawlDir.getLogFile(), for (var dataPath : bigTasks) {
new CrawlDataLocator(crawlDir.getDir(), batchingWorkLog))) hb.progress(dataPath.toFile().getName(), bigTaskIdx++, bigTasks.size());
{
int sizeHint = SerializableCrawlDataStream.getSizeHint(dataPath);
if (sizeHint < SIDELOAD_THRESHOLD) {
continue;
}
hb.progress(dataPath.toFile().getName(), bigTaskIdx++, numBigTasks);
try { try {
// SerializableCrawlDataStream is autocloseable, we can't try-with-resources because then it will be // SerializableCrawlDataStream is autocloseable, we can't try-with-resources because then it will be
@ -255,7 +255,7 @@ public class ConverterMain extends ProcessMainClass {
// will close it after it's consumed. // will close it after it's consumed.
var stream = SerializableCrawlDataStream.openDataStream(dataPath); var stream = SerializableCrawlDataStream.openDataStream(dataPath);
ConverterBatchWritableIf writable = processor.simpleProcessing(stream, sizeHint); ConverterBatchWritableIf writable = processor.simpleProcessing(stream, SerializableCrawlDataStream.getSizeHint(dataPath));
converterWriter.accept(writable); converterWriter.accept(writable);
} }

View File

@ -116,7 +116,7 @@ public class AdblockSimulator {
// Refrain from cleaning up this code, it's very hot code and needs to be fast. // Refrain from cleaning up this code, it's very hot code and needs to be fast.
// This version is about 100x faster than the a "clean" first stab implementation. // This version is about 100x faster than a "clean" first stab implementation.
class RuleVisitor implements NodeFilter { class RuleVisitor implements NodeFilter {
public boolean sawAds; public boolean sawAds;

View File

@ -23,7 +23,7 @@ public class DocumentGeneratorExtractor {
var tags = doc.select("meta[name=generator]"); var tags = doc.select("meta[name=generator]");
if (tags.size() == 0) { if (tags.isEmpty()) {
// Some sites have a comment in the head instead of a meta tag // Some sites have a comment in the head instead of a meta tag
return fingerprintServerTech(doc, responseHeaders); return fingerprintServerTech(doc, responseHeaders);
} }

View File

@ -127,7 +127,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
} }
fullHtml.append("</div></body></html>"); fullHtml.append("</div></body></html>");
var doc = sideloaderProcessing return sideloaderProcessing
.processDocument(fullUrl, .processDocument(fullUrl,
fullHtml.toString(), fullHtml.toString(),
List.of("encyclopedia", "wiki"), List.of("encyclopedia", "wiki"),
@ -137,8 +137,6 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
anchorTextKeywords.getAnchorTextKeywords(domainLinks, new EdgeUrl(fullUrl)), anchorTextKeywords.getAnchorTextKeywords(domainLinks, new EdgeUrl(fullUrl)),
LocalDate.now().getYear(), LocalDate.now().getYear(),
10_000_000); 10_000_000);
return doc;
} }
private String normalizeUtf8(String url) { private String normalizeUtf8(String url) {