mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
(misc) Tidying
This commit is contained in:
parent
32c6dd9e6a
commit
5fbc8ef998
@ -24,58 +24,4 @@ public class LanguageModels {
|
|||||||
this.fasttextLanguageModel = fasttextLanguageModel;
|
this.fasttextLanguageModel = fasttextLanguageModel;
|
||||||
this.segments = segments;
|
this.segments = segments;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static LanguageModelsBuilder builder() {
|
|
||||||
return new LanguageModelsBuilder();
|
|
||||||
}
|
|
||||||
|
|
||||||
public static class LanguageModelsBuilder {
|
|
||||||
private Path termFrequencies;
|
|
||||||
private Path openNLPSentenceDetectionData;
|
|
||||||
private Path posRules;
|
|
||||||
private Path posDict;
|
|
||||||
private Path fasttextLanguageModel;
|
|
||||||
private Path segments;
|
|
||||||
|
|
||||||
LanguageModelsBuilder() {
|
|
||||||
}
|
|
||||||
|
|
||||||
public LanguageModelsBuilder termFrequencies(Path termFrequencies) {
|
|
||||||
this.termFrequencies = termFrequencies;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public LanguageModelsBuilder openNLPSentenceDetectionData(Path openNLPSentenceDetectionData) {
|
|
||||||
this.openNLPSentenceDetectionData = openNLPSentenceDetectionData;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public LanguageModelsBuilder posRules(Path posRules) {
|
|
||||||
this.posRules = posRules;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public LanguageModelsBuilder posDict(Path posDict) {
|
|
||||||
this.posDict = posDict;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public LanguageModelsBuilder fasttextLanguageModel(Path fasttextLanguageModel) {
|
|
||||||
this.fasttextLanguageModel = fasttextLanguageModel;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public LanguageModelsBuilder segments(Path segments) {
|
|
||||||
this.segments = segments;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public LanguageModels build() {
|
|
||||||
return new LanguageModels(this.termFrequencies, this.openNLPSentenceDetectionData, this.posRules, this.posDict, this.fasttextLanguageModel, this.segments);
|
|
||||||
}
|
|
||||||
|
|
||||||
public String toString() {
|
|
||||||
return "LanguageModels.LanguageModelsBuilder(termFrequencies=" + this.termFrequencies + ", openNLPSentenceDetectionData=" + this.openNLPSentenceDetectionData + ", posRules=" + this.posRules + ", posDict=" + this.posDict + ", fasttextLanguageModel=" + this.fasttextLanguageModel + ", segments=" + this.segments + ")";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -35,6 +35,7 @@ import java.io.IOException;
|
|||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
|
import java.util.ArrayList;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
@ -202,13 +203,19 @@ public class ConverterMain extends ProcessMainClass {
|
|||||||
heartbeat.setProgress(processedDomains.get() / (double) totalDomains);
|
heartbeat.setProgress(processedDomains.get() / (double) totalDomains);
|
||||||
|
|
||||||
logger.info("Processing small items");
|
logger.info("Processing small items");
|
||||||
int numBigTasks = 0;
|
|
||||||
|
// We separate the large and small domains to reduce the number of critical sections,
|
||||||
|
// as the large domains have a separate processing track that doesn't store everything
|
||||||
|
// in memory
|
||||||
|
|
||||||
|
final List<Path> bigTasks = new ArrayList<>();
|
||||||
|
|
||||||
// First process the small items
|
// First process the small items
|
||||||
for (var dataPath : WorkLog.iterableMap(crawlDir.getLogFile(),
|
for (var dataPath : WorkLog.iterableMap(crawlDir.getLogFile(),
|
||||||
new CrawlDataLocator(crawlDir.getDir(), batchingWorkLog)))
|
new CrawlDataLocator(crawlDir.getDir(), batchingWorkLog)))
|
||||||
{
|
{
|
||||||
if (SerializableCrawlDataStream.getSizeHint(dataPath) >= SIDELOAD_THRESHOLD) {
|
if (SerializableCrawlDataStream.getSizeHint(dataPath) >= SIDELOAD_THRESHOLD) {
|
||||||
numBigTasks ++;
|
bigTasks.add(dataPath);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -239,15 +246,8 @@ public class ConverterMain extends ProcessMainClass {
|
|||||||
try (var hb = heartbeat.createAdHocTaskHeartbeat("Large Domains")) {
|
try (var hb = heartbeat.createAdHocTaskHeartbeat("Large Domains")) {
|
||||||
int bigTaskIdx = 0;
|
int bigTaskIdx = 0;
|
||||||
// Next the big items domain-by-domain
|
// Next the big items domain-by-domain
|
||||||
for (var dataPath : WorkLog.iterableMap(crawlDir.getLogFile(),
|
for (var dataPath : bigTasks) {
|
||||||
new CrawlDataLocator(crawlDir.getDir(), batchingWorkLog)))
|
hb.progress(dataPath.toFile().getName(), bigTaskIdx++, bigTasks.size());
|
||||||
{
|
|
||||||
int sizeHint = SerializableCrawlDataStream.getSizeHint(dataPath);
|
|
||||||
if (sizeHint < SIDELOAD_THRESHOLD) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
hb.progress(dataPath.toFile().getName(), bigTaskIdx++, numBigTasks);
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// SerializableCrawlDataStream is autocloseable, we can't try-with-resources because then it will be
|
// SerializableCrawlDataStream is autocloseable, we can't try-with-resources because then it will be
|
||||||
@ -255,7 +255,7 @@ public class ConverterMain extends ProcessMainClass {
|
|||||||
// will close it after it's consumed.
|
// will close it after it's consumed.
|
||||||
|
|
||||||
var stream = SerializableCrawlDataStream.openDataStream(dataPath);
|
var stream = SerializableCrawlDataStream.openDataStream(dataPath);
|
||||||
ConverterBatchWritableIf writable = processor.simpleProcessing(stream, sizeHint);
|
ConverterBatchWritableIf writable = processor.simpleProcessing(stream, SerializableCrawlDataStream.getSizeHint(dataPath));
|
||||||
|
|
||||||
converterWriter.accept(writable);
|
converterWriter.accept(writable);
|
||||||
}
|
}
|
||||||
|
@ -116,7 +116,7 @@ public class AdblockSimulator {
|
|||||||
|
|
||||||
|
|
||||||
// Refrain from cleaning up this code, it's very hot code and needs to be fast.
|
// Refrain from cleaning up this code, it's very hot code and needs to be fast.
|
||||||
// This version is about 100x faster than the a "clean" first stab implementation.
|
// This version is about 100x faster than a "clean" first stab implementation.
|
||||||
|
|
||||||
class RuleVisitor implements NodeFilter {
|
class RuleVisitor implements NodeFilter {
|
||||||
public boolean sawAds;
|
public boolean sawAds;
|
||||||
|
@ -23,7 +23,7 @@ public class DocumentGeneratorExtractor {
|
|||||||
|
|
||||||
var tags = doc.select("meta[name=generator]");
|
var tags = doc.select("meta[name=generator]");
|
||||||
|
|
||||||
if (tags.size() == 0) {
|
if (tags.isEmpty()) {
|
||||||
// Some sites have a comment in the head instead of a meta tag
|
// Some sites have a comment in the head instead of a meta tag
|
||||||
return fingerprintServerTech(doc, responseHeaders);
|
return fingerprintServerTech(doc, responseHeaders);
|
||||||
}
|
}
|
||||||
|
@ -127,7 +127,7 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
|||||||
}
|
}
|
||||||
fullHtml.append("</div></body></html>");
|
fullHtml.append("</div></body></html>");
|
||||||
|
|
||||||
var doc = sideloaderProcessing
|
return sideloaderProcessing
|
||||||
.processDocument(fullUrl,
|
.processDocument(fullUrl,
|
||||||
fullHtml.toString(),
|
fullHtml.toString(),
|
||||||
List.of("encyclopedia", "wiki"),
|
List.of("encyclopedia", "wiki"),
|
||||||
@ -137,8 +137,6 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
|||||||
anchorTextKeywords.getAnchorTextKeywords(domainLinks, new EdgeUrl(fullUrl)),
|
anchorTextKeywords.getAnchorTextKeywords(domainLinks, new EdgeUrl(fullUrl)),
|
||||||
LocalDate.now().getYear(),
|
LocalDate.now().getYear(),
|
||||||
10_000_000);
|
10_000_000);
|
||||||
|
|
||||||
return doc;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private String normalizeUtf8(String url) {
|
private String normalizeUtf8(String url) {
|
||||||
|
Loading…
Reference in New Issue
Block a user