From a56953c7989968a4582b117c9df54ed11fc99064 Mon Sep 17 00:00:00 2001
From: Viktor Lofgren
Date: Mon, 24 Jul 2023 15:25:09 +0200
Subject: [PATCH] (converter, WIP) Refactor converter to not have to load everything into RAM.

---
 .../crawling/io/CrawledDomainReader.java      |  11 +-
 .../crawling/model/CrawledDocument.java       |   7 +-
 .../src/main/java/plan/CrawlPlan.java         |  32 +++-
 .../marginalia/converting/ConverterMain.java  |  21 +--
 .../compiler/DocumentsCompiler.java           |   1 -
 .../converting/compiler/UrlsCompiler.java     |  35 ++--
 .../converting/model/ProcessedDomain.java     |  13 +-
 .../processor/DocumentProcessor.java          |  17 +-
 .../converting/processor/DomainProcessor.java | 156 ++++++++----------
 .../processor/logic/FeatureExtractor.java     |   5 +-
 .../AbstractDocumentProcessorPlugin.java      |   8 +-
 .../plugin/HtmlDocumentProcessorPlugin.java   |   7 +-
 .../PlainTextDocumentProcessorPlugin.java     |   5 +-
 .../converting/ConvertingIntegrationTest.java |  23 ++-
 ...CrawlingThenConvertingIntegrationTest.java |   6 +-
 .../crawl/retreival/CrawlDataReference.java   |   3 +-
 .../crawl/retreival/CrawlerRetreiver.java     |   8 +-
 .../retreival/fetcher/HttpFetcherImpl.java    |   4 +-
 .../retreival/CrawlerMockFetcherTest.java     |   5 +-
 .../tools/experiments/AdblockExperiment.java  |   2 +-
 .../experiments/DebugConverterExperiment.java |   2 +-
 .../SentenceStatisticsExperiment.java         |   2 +-
 .../experiments/SiteStatisticsExperiment.java |  12 +-
 .../tools/experiments/TopicExperiment.java    |   2 +-
 .../tools/TermFrequencyExtractor.java         |   2 +-
 run/env/service.env                           |   2 +-
 26 files changed, 194 insertions(+), 197 deletions(-)

diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java
index c3dddb3c..1753f7c9 100644
--- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java
+++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/io/CrawledDomainReader.java
@@ -31,11 +31,9 @@ public class CrawledDomainReader {
     public CrawledDomainReader() {
     }
 
-    public Iterator<SerializableCrawlData> createIterator(Path basePath, CrawlingSpecification spec) throws IOException {
+    public Iterator<SerializableCrawlData> createIterator(Path fullPath) throws IOException {
 
-        final var path = CrawlerOutputFile.getOutputFile(basePath, spec.id, spec.domain);
-
-        BufferedReader br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(path.toFile()))));
+        BufferedReader br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(fullPath.toFile()))));
 
         return new Iterator<>() {
             SerializableCrawlData next;
@@ -71,6 +69,11 @@ public class CrawledDomainReader {
             }
         };
     }
+
+    public Iterator<SerializableCrawlData> createIterator(Path basePath, CrawlingSpecification spec) throws IOException {
+
+        return createIterator(CrawlerOutputFile.getOutputFile(basePath, spec.id, spec.domain));
+    }
 
     public CrawledDomain read(Path path) throws IOException {
         DomainDataAssembler domainData = new DomainDataAssembler();
diff --git a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java
index 0066ddf2..94d13235 100644
--- a/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java
+++ b/code/process-models/crawling-model/src/main/java/nu/marginalia/crawling/model/CrawledDocument.java
@@ -21,7 +21,7 @@ public class CrawledDocument implements SerializableCrawlData {
     public String crawlerStatusDesc;
 
     public String headers;
-    public BigString documentBody;
+    public String documentBody;
     public String documentBodyHash;
 
     public String canonicalUrl;
@@ -35,9 +35,4 @@ public class CrawledDocument implements SerializableCrawlData {
         return SERIAL_IDENTIFIER;
     }
 
-    /** Remove all large data from this object to save memory */
-    public void dispose() {
-        documentBody = null;
-        headers = null;
-    }
 }
diff --git a/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java b/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java
index 655525d6..f1d71f37 100644
--- a/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java
+++ b/code/process-models/crawling-model/src/main/java/plan/CrawlPlan.java
@@ -5,22 +5,18 @@ import lombok.NoArgsConstructor;
 import lombok.ToString;
 import nu.marginalia.crawling.io.CrawledDomainReader;
 import nu.marginalia.crawling.model.CrawledDomain;
+import nu.marginalia.crawling.model.SerializableCrawlData;
 import nu.marginalia.crawling.model.spec.CrawlerSpecificationLoader;
 import nu.marginalia.crawling.model.spec.CrawlingSpecification;
 import nu.marginalia.process.log.WorkLog;
-import nu.marginalia.process.log.WorkLogEntry;
-import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.Iterator;
-import java.util.function.Consumer;
 import java.util.function.Predicate;
-import java.util.stream.Stream;
 import java.util.Optional;
 
 @AllArgsConstructor @NoArgsConstructor @ToString
@@ -122,4 +118,30 @@ public class CrawlPlan {
             return reader.readOptionally(path);
         });
     }
+
+
+    public Iterable<Iterator<SerializableCrawlData>> crawlDataIterable(Predicate<String> idPredicate) {
+        final CrawledDomainReader reader = new CrawledDomainReader();
+
+        return WorkLog.iterableMap(crawl.getLogFile(),
+                entry -> {
+                    if (!idPredicate.test(entry.id())) {
+                        return Optional.empty();
+                    }
+
+                    var path = getCrawledFilePath(entry.path());
+
+                    if (!Files.exists(path)) {
+                        logger.warn("File not found: {}", path);
+                        return Optional.empty();
+                    }
+
+                    try {
+                        return Optional.of(reader.createIterator(path));
+                    }
+                    catch (IOException ex) {
+                        return Optional.empty();
+                    }
+                });
+    }
 }
diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java
index 55c022ba..be617817 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/ConverterMain.java
@@ -4,6 +4,7 @@ import com.google.gson.Gson;
 import com.google.inject.Guice;
 import com.google.inject.Inject;
 import com.google.inject.Injector;
+import nu.marginalia.crawling.model.SerializableCrawlData;
 import nu.marginalia.db.storage.FileStorageService;
 import nu.marginalia.mq.MessageQueueFactory;
 import nu.marginalia.mq.MqMessage;
@@ -23,12 +24,12 @@ import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.sql.SQLException;
+import java.util.Iterator;
 import java.util.List;
 import java.util.Optional;
 import java.util.UUID;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
-import java.util.function.Predicate;
 
 import static nu.marginalia.mqapi.ProcessInboxNames.CONVERTER_INBOX;
 
@@ -101,20 +102,14 @@ public class ConverterMain {
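+        // Note (explanatory, added in editing): each unit of work below is a lazy
+        // stream of crawl data records for one domain rather than a fully
+        // materialized CrawledDomain, which is what keeps memory use flat.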
         int totalDomains = plan.countCrawledDomains();
         AtomicInteger processedDomains = new AtomicInteger(0);
 
-        var pipe = new ParallelPipe<CrawledDomain, ProcessingInstructions>("Converter", 16, 4, 2) {
+        var pipe = new ParallelPipe<Iterator<SerializableCrawlData>, ProcessingInstructions>("Converter", 16, 4, 2) {
 
             @Override
-            protected ProcessingInstructions onProcess(CrawledDomain domainData) {
-                Thread.currentThread().setName("Converter:Processor["+domainData.domain+"] - " + domainData.size());
-                try {
-                    var processed = processor.process(domainData);
-                    var compiled = compiler.compile(processed);
+            protected ProcessingInstructions onProcess(Iterator<SerializableCrawlData> dataStream) {
+                var processed = processor.process(dataStream);
+                var compiled = compiler.compile(processed);
 
-                    return new ProcessingInstructions(domainData.id, compiled);
-                }
-                finally {
-                    Thread.currentThread().setName("Converter:Processor[IDLE]");
-                }
+                return new ProcessingInstructions(processed.id, compiled);
             }
 
             @Override
@@ -140,7 +135,7 @@ public class ConverterMain {
                 processedDomains.set(processLog.countFinishedJobs());
                 heartbeat.setProgress(processedDomains.get() / (double) totalDomains);
 
-                for (var domain : plan.domainsIterable(id -> !processLog.isJobFinished(id)))
+                for (var domain : plan.crawlDataIterable(id -> !processLog.isJobFinished(id)))
                 {
                     pipe.accept(domain);
                 }
diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java
index 36b112fa..3849f015 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/DocumentsCompiler.java
@@ -3,7 +3,6 @@ package nu.marginalia.converting.compiler;
 
 import nu.marginalia.converting.instruction.Instruction;
 import nu.marginalia.converting.instruction.instructions.LoadKeywords;
 import nu.marginalia.converting.instruction.instructions.LoadProcessedDocument;
-import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWithError;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.model.crawl.HtmlFeature;
diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java
index 4d05a35d..d5184cfc 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/compiler/UrlsCompiler.java
@@ -6,6 +6,8 @@ import nu.marginalia.converting.instruction.instructions.LoadUrl;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.util.ArrayList;
 import java.util.HashSet;
@@ -15,30 +17,39 @@ import java.util.Set;
 public class UrlsCompiler {
 
     private static final int MAX_INTERNAL_LINKS = 25;
+    private final Logger logger = LoggerFactory.getLogger(getClass());
 
     public void compile(List<Instruction> ret, List<ProcessedDocument> documents) {
         Set<EdgeUrl> seenUrls = new HashSet<>(documents.size()*4);
         Set<EdgeDomain> seenDomains = new HashSet<>(documents.size());
 
         for (var doc : documents) {
+            if (doc.url == null) {
+                logger.warn("Discovered document with null URL");
+                continue;
+            }
+
             seenUrls.add(doc.url);
 
-            if (doc.details != null) {
+            if (doc.details == null) {
+                continue;
+            }
 
-                for (var url : doc.details.linksExternal) {
-                    if (seenDomains.add(url.domain)) {
-                        seenUrls.add(url);
-                    }
+            // Add *some* external links; to avoid loading too many and gunking up the database with nonsense,
+            // only permit this once per external domain per crawled domain
+            for (var url : doc.details.linksExternal) {
+                if (seenDomains.add(url.domain)) {
+                    seenUrls.add(url);
                 }
+            }
 
-                if (doc.isOk()) {
-                    // Don't load more than a few from linksInternal, grows too big for no reason
-                    var linksToAdd = new ArrayList<>(doc.details.linksInternal);
-                    if (linksToAdd.size() > MAX_INTERNAL_LINKS) {
-                        linksToAdd.subList(MAX_INTERNAL_LINKS, linksToAdd.size()).clear();
-                    }
-                    seenUrls.addAll(linksToAdd);
+            if (doc.isOk()) {
+                // Don't load more than a few from linksInternal; it grows too big for no reason
+                var linksToAdd = new ArrayList<>(doc.details.linksInternal);
+                if (linksToAdd.size() > MAX_INTERNAL_LINKS) {
+                    linksToAdd.subList(MAX_INTERNAL_LINKS, linksToAdd.size()).clear();
                 }
+                seenUrls.addAll(linksToAdd);
             }
         }
diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java
index 95b66a02..e445d5b2 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/model/ProcessedDomain.java
@@ -6,7 +6,6 @@ import nu.marginalia.model.crawl.DomainIndexingState;
 
 import java.util.List;
 import java.util.Optional;
-import java.util.OptionalDouble;
 
 @ToString
 public class ProcessedDomain {
@@ -16,17 +15,7 @@ public class ProcessedDomain {
     public DomainIndexingState state;
     public EdgeDomain redirect;
     public String ip;
-
-    public OptionalDouble averageQuality() {
-        if (documents == null) {
-            return OptionalDouble.empty();
-        }
-        return documents.stream()
-                .map(ProcessedDocument::quality)
-                .filter(OptionalDouble::isPresent)
-                .mapToDouble(OptionalDouble::getAsDouble)
-                .average();
-    }
+    public String id;
 
     public int size() {
         return Optional.ofNullable(documents).map(List::size).orElse(1);
diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java
index b7ac1767..82e9c5d7 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DocumentProcessor.java
@@ -2,7 +2,6 @@ package nu.marginalia.converting.processor;
 
 import com.google.inject.Inject;
 import nu.marginalia.crawling.model.CrawledDocument;
-import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.crawling.model.CrawlerDocumentStatus;
 import nu.marginalia.model.crawl.UrlIndexingState;
 import nu.marginalia.converting.model.DisqualifiedException;
@@ -38,11 +37,14 @@ public class DocumentProcessor {
         processorPlugins.add(plainTextDocumentProcessorPlugin);
     }
 
-    public ProcessedDocument process(CrawledDocument crawledDocument, CrawledDomain crawledDomain) {
+    public ProcessedDocument process(CrawledDocument crawledDocument) {
         ProcessedDocument ret = new ProcessedDocument();
 
         try {
-            processDocument(crawledDocument, crawledDomain, ret);
+            // We must always provide the URL, even if we don't process the document
+            ret.url = getDocumentUrl(crawledDocument);
+
+            processDocument(crawledDocument, ret);
         }
         catch (DisqualifiedException ex) {
             ret.state = UrlIndexingState.DISQUALIFIED;
@@ -53,13 +55,12 @@ public class DocumentProcessor {
             ret.state = UrlIndexingState.DISQUALIFIED;
             ret.stateReason = DisqualifiedException.DisqualificationReason.PROCESSING_EXCEPTION.toString();
             logger.info("Failed to convert " + crawledDocument.url, ex);
-            ex.printStackTrace();
         }
 
         return ret;
     }
 
-    private void processDocument(CrawledDocument crawledDocument, CrawledDomain crawledDomain, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException {
+    private void processDocument(CrawledDocument crawledDocument, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException {
 
         var crawlerStatus = CrawlerDocumentStatus.valueOf(crawledDocument.crawlerStatus);
         if (crawlerStatus != CrawlerDocumentStatus.OK) {
@@ -74,15 +75,11 @@ public class DocumentProcessor {
             throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.CONTENT_TYPE);
         }
 
-
-        ret.url = getDocumentUrl(crawledDocument);
         ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus);
 
         final var plugin = findPlugin(crawledDocument);
 
-        AbstractDocumentProcessorPlugin.DetailsWithWords detailsWithWords = plugin.createDetails(crawledDomain, crawledDocument);
-
-        crawledDocument.dispose();
+        AbstractDocumentProcessorPlugin.DetailsWithWords detailsWithWords = plugin.createDetails(crawledDocument);
 
         ret.details = detailsWithWords.details();
         ret.words = detailsWithWords.words();
diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java
index dcdda943..64682319 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/DomainProcessor.java
@@ -1,18 +1,18 @@
 package nu.marginalia.converting.processor;
 
-import com.google.common.base.Strings;
 import com.google.inject.Inject;
+import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.processor.logic.links.LinkGraph;
-import nu.marginalia.crawling.model.CrawledDocument;
-import nu.marginalia.crawling.model.CrawledDomain;
-import nu.marginalia.crawling.model.CrawlerDocumentStatus;
-import nu.marginalia.crawling.model.CrawlerDomainStatus;
+import nu.marginalia.crawling.model.*;
 import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.converting.model.ProcessedDomain;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.converting.processor.logic.links.TopKeywords;
 import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator;
+import nu.marginalia.model.crawl.HtmlFeature;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.util.*;
 
@@ -21,6 +21,8 @@ public class DomainProcessor {
     private final SiteWords siteWords;
     private final LshDocumentDeduplicator documentDeduplicator;
 
+    private final Logger logger = LoggerFactory.getLogger(getClass());
+
     @Inject
     public DomainProcessor(DocumentProcessor documentProcessor,
                            SiteWords siteWords,
@@ -30,44 +32,85 @@ public class DomainProcessor {
         this.documentDeduplicator = documentDeduplicator;
     }
 
-    public ProcessedDomain process(CrawledDomain crawledDomain) {
+    public ProcessedDomain process(Iterator<SerializableCrawlData> dataStream) {
         var ret = new ProcessedDomain();
 
+        List<ProcessedDocument> docs = new ArrayList<>();
+        boolean cookies = false;
+        String ip = "";
 
+        while (dataStream.hasNext()) {
+            var data = dataStream.next();
 
-        ret.domain = new EdgeDomain(crawledDomain.domain);
-        ret.ip = crawledDomain.ip;
+            if (data instanceof CrawledDomain crawledDomain) {
+                ret.domain = new EdgeDomain(crawledDomain.domain);
+                ret.ip = crawledDomain.ip;
+                ret.id = crawledDomain.id;
 
-        if (crawledDomain.redirectDomain != null) {
-            ret.redirect = new EdgeDomain(crawledDomain.redirectDomain);
-        }
+                cookies = Objects.requireNonNullElse(crawledDomain.cookies, Collections.emptyList()).size() > 0;
+                ip = crawledDomain.ip;
 
-        if (crawledDomain.doc != null) {
-            ret.documents = new ArrayList<>(crawledDomain.doc.size());
-
-            fixBadCanonicalTags(crawledDomain.doc);
-
-            for (var doc : crawledDomain.doc) {
-                var processedDoc = documentProcessor.process(doc, crawledDomain);
-
-                if (processedDoc.url != null) {
-                    ret.documents.add(processedDoc);
+                if (crawledDomain.redirectDomain != null) {
+                    ret.redirect = new EdgeDomain(crawledDomain.redirectDomain);
                 }
-
+                ret.documents = docs;
+                ret.state = getState(crawledDomain.crawlerStatus);
             }
+            else if (data instanceof CrawledDocument doc) {
+                try {
+                    if (doc.url == null)
+                        continue;
+                    fixBadCanonicalTag(doc);
 
-            documentDeduplicator.deduplicate(ret.documents);
-
-            calculateStatistics(ret);
-        }
-        else {
-            ret.documents = Collections.emptyList();
+                    docs.add(documentProcessor.process(doc));
+                }
+                catch (Exception ex) {
+                    logger.warn("Failed to process " + doc.url, ex);
+                }
+            }
         }
 
-        ret.state = getState(crawledDomain.crawlerStatus);
+        // Add late keywords and features from domain-level information
+
+        List<String> terms = new ArrayList<>();
+        terms.add("ip:"+ip);
+        if (cookies)
+            terms.add(HtmlFeature.COOKIES.getKeyword());
+
+        for (var document : ret.documents) {
+            if (document.details == null)
+                continue;
+
+            if (cookies)
+                document.details.features.add(HtmlFeature.COOKIES);
+
+            document.words.addAllSyntheticTerms(terms);
+        }
+
+        documentDeduplicator.deduplicate(ret.documents);
+        calculateStatistics(ret);
 
         return ret;
     }
 
+    private void fixBadCanonicalTag(CrawledDocument doc) {
+        // Some sites have a canonical tag that points to a different domain,
+        // but our loader cannot support this, so we point these back to the
+        // original url.
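+        //
+        // For example, a document at https://www.example.com/foo whose canonical
+        // tag points at https://cdn.example.net/foo has its canonical URL
+        // rewritten back to https://www.example.com/foo by the check below.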
+
+        var canonicalOpt = EdgeUrl.parse(doc.canonicalUrl);
+        if (canonicalOpt.isEmpty()) return;
+
+        var urlOpt = EdgeUrl.parse(doc.url);
+        if (urlOpt.isEmpty()) return;
+
+        var urlActual = urlOpt.get();
+        var canonicalActual = canonicalOpt.get();
+
+        if (!Objects.equals(urlActual.domain, canonicalActual.domain)) {
+            doc.canonicalUrl = doc.url;
+        }
+    }
+
     private void calculateStatistics(ProcessedDomain ret) {
         LinkGraph linkGraph = new LinkGraph();
         TopKeywords topKeywords = new TopKeywords();
@@ -91,61 +134,6 @@ public class DomainProcessor {
         siteWords.flagAdjacentWords(topKeywords, invertedLinkGraph, ret);
     }
 
-
-    private void fixBadCanonicalTags(List<CrawledDocument> docs) {
-        Map<String, Set<String>> seenCanonicals = new HashMap<>();
-        Set<String> seenUrls = new HashSet<>();
-
-        // Sometimes sites set a blanket canonical link to their root page
-        // this removes such links from consideration
-
-        for (var document : docs) {
-            if (!Strings.isNullOrEmpty(document.canonicalUrl)
-                    && !Objects.equals(document.canonicalUrl, document.url)) {
-                seenCanonicals.computeIfAbsent(document.canonicalUrl, url -> new HashSet<>()).add(document.documentBodyHash);
-            }
-            seenUrls.add(document.url);
-        }
-
-        for (var document : docs) {
-            if (!Strings.isNullOrEmpty(document.canonicalUrl)
-                    && !Objects.equals(document.canonicalUrl, document.url)
-                    && seenCanonicals.getOrDefault(document.canonicalUrl, Collections.emptySet()).size() > 1) {
-
-                if (seenUrls.add(document.canonicalUrl)) {
-                    document.canonicalUrl = document.url;
-                }
-                else {
-                    document.crawlerStatus = CrawlerDocumentStatus.BAD_CANONICAL.name();
-                }
-            }
-        }
-
-        for (var document : docs) {
-            if (!Strings.isNullOrEmpty(document.canonicalUrl)
-                    && !Objects.equals(document.canonicalUrl, document.url)
-                    && seenCanonicals.getOrDefault(document.canonicalUrl, Collections.emptySet()).size() > 1) {
-                document.canonicalUrl = document.url;
-            }
-        }
-
-        // Ignore canonical URL if it points to a different domain
-        // ... this confuses the hell out of the loader
-        for (var document : docs) {
-            if (Strings.isNullOrEmpty(document.canonicalUrl))
-                continue;
-
-            Optional<EdgeUrl> cUrl = EdgeUrl.parse(document.canonicalUrl);
-            Optional<EdgeUrl> dUrl = EdgeUrl.parse(document.url);
-
-            if (cUrl.isPresent() && dUrl.isPresent()
-                    && !Objects.equals(cUrl.get().domain, dUrl.get().domain))
-            {
-                document.canonicalUrl = document.url;
-            }
-        }
-    }
-
     private DomainIndexingState getState(String crawlerStatus) {
         return switch (CrawlerDomainStatus.valueOf(crawlerStatus)) {
             case OK -> DomainIndexingState.ACTIVE;
diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java
index 57a98879..c431e94b 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java
@@ -65,7 +65,7 @@ public class FeatureExtractor {
         this.googleAnwersSpamDetector = googleAnwersSpamDetector;
     }
 
-    public Set<HtmlFeature> getFeatures(CrawledDomain domain, Document doc, DocumentLanguageData dld) {
+    public Set<HtmlFeature> getFeatures(Document doc, DocumentLanguageData dld) {
         final Set<HtmlFeature> features = new HashSet<>();
 
         final Elements scriptTags = doc.getElementsByTag("script");
@@ -279,9 +279,6 @@ public class FeatureExtractor {
             }
         }
 
-        if (!domain.cookies.isEmpty())
-            features.add(HtmlFeature.COOKIES);
-
         if (recipeDetector.testP(dld) > 0.5)
             features.add(HtmlFeature.CATEGORY_FOOD);
 
         // these should be mutually exclusive
diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java
index c49d365f..14fd12ad 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java
@@ -19,7 +19,7 @@ public abstract class AbstractDocumentProcessorPlugin {
 
     protected LanguageFilter languageFilter = new LanguageFilter();
 
-    public abstract DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument) throws DisqualifiedException, URISyntaxException;
+    public abstract DetailsWithWords createDetails(CrawledDocument crawledDocument) throws DisqualifiedException, URISyntaxException;
     public abstract boolean isApplicable(CrawledDocument doc);
 
     protected void checkDocumentLanguage(DocumentLanguageData dld) throws DisqualifiedException {
@@ -44,12 +44,6 @@ public abstract class AbstractDocumentProcessorPlugin {
             tagWords.add(key + ":" + value.toString().toLowerCase());
         }
 
-        public MetaTagsBuilder addDomainCrawlData(CrawledDomain domain) {
-            add("ip", domain.ip);
-
-            return this;
-        }
-
         public MetaTagsBuilder addUrl(EdgeUrl url) {
             add("proto", url.proto);
             add("site", url.domain);
diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
index c2119688..8fb2b801 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java
@@ -94,10 +94,10 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
     }
 
     @Override
-    public DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument)
+    public DetailsWithWords createDetails(CrawledDocument crawledDocument)
             throws DisqualifiedException, URISyntaxException {
 
-        String documentBody = crawledDocument.documentBody.decode();
+        String documentBody = crawledDocument.documentBody;
 
         if (languageFilter.isBlockedUnicodeRange(documentBody)) {
             throw new DisqualifiedException(DisqualificationReason.LANGUAGE);
@@ -141,7 +141,7 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
             throw new DisqualifiedException(DisqualificationReason.QUALITY);
         }
 
-        final Set<HtmlFeature> features = featureExtractor.getFeatures(crawledDomain, doc, dld);
+        final Set<HtmlFeature> features = featureExtractor.getFeatures(doc, dld);
 
         ret.features = features;
         ret.hashCode = dld.localitySensitiveHashCode();
@@ -159,7 +159,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin
         ret.generator = generatorParts.type();
 
         var tagWords = new MetaTagsBuilder()
-                .addDomainCrawlData(crawledDomain)
                 .addPubDate(pubDate)
                 .addUrl(url)
                 .addFeatures(features)
diff --git a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java
index e7d0a9a1..1dac05f1 100644
--- a/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java
+++ b/code/processes/converting-process/src/main/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java
@@ -55,10 +55,10 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP
     }
 
     @Override
-    public DetailsWithWords createDetails(CrawledDomain crawledDomain, CrawledDocument crawledDocument)
+    public DetailsWithWords createDetails(CrawledDocument crawledDocument)
            throws DisqualifiedException, URISyntaxException {
 
-        String documentBody = crawledDocument.documentBody.decode();
+        String documentBody = crawledDocument.documentBody;
 
         if (languageFilter.isBlockedUnicodeRange(documentBody)) {
             throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LANGUAGE);
@@ -97,7 +97,6 @@ public class PlainTextDocumentProcessorP
         DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url);
 
         var tagWords = new MetaTagsBuilder()
-                .addDomainCrawlData(crawledDomain)
                 .addPubDate(pubDate)
                 .addUrl(url)
                 .addFeatures(ret.features)
diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java
index 67aa5299..8cf3a397 100644
--- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java
+++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/ConvertingIntegrationTest.java
@@ -9,6 +9,7 @@ import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.processor.DomainProcessor;
 import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.crawling.model.CrawledDomain;
+import nu.marginalia.crawling.model.SerializableCrawlData;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.crawl.DomainIndexingState;
 import nu.marginalia.model.crawl.PubDate;
@@ -40,18 +41,17 @@ public class ConvertingIntegrationTest {
     public void testEmptyDomain() {
         var docs = new ArrayList<CrawledDocument>();
 
-        var ret = domainProcessor.process(
-                new CrawledDomain("123", "memex.marginalia.nu", null, "OK", "-", "127.0.0.1",
-                        docs, Collections.emptyList()));
+        var domain = new CrawledDomain("123", "memex.marginalia.nu", null, "OK", "-", "127.0.0.1",
+                docs, Collections.emptyList());
+        var ret = domainProcessor.process(asSerializableCrawlData(domain));
 
         assertEquals(ret.state, DomainIndexingState.ACTIVE);
         assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu"));
         assertTrue(ret.documents.isEmpty());
     }
 
-
     @Test
     public void testMemexMarginaliaNuDateInternalConsistency() throws IOException {
-        var ret = domainProcessor.process(readMarginaliaWorkingSet());
+        var ret = domainProcessor.process(asSerializableCrawlData(readMarginaliaWorkingSet()));
         ret.documents.stream().filter(ProcessedDocument::isProcessedFully).forEach(doc -> {
             int year = PubDate.fromYearByte(doc.details.metadata.year());
             Integer yearMeta = doc.details.pubYear;
@@ -64,7 +64,7 @@ public class ConvertingIntegrationTest {
 
     @Test
     public void testMemexMarginaliaNu() throws IOException {
-        var ret = domainProcessor.process(readMarginaliaWorkingSet());
+        var ret = domainProcessor.process(asSerializableCrawlData(readMarginaliaWorkingSet()));
         assertEquals(ret.state, DomainIndexingState.ACTIVE);
         assertEquals(ret.domain, new EdgeDomain("memex.marginalia.nu"));
 
@@ -110,7 +110,7 @@ public class ConvertingIntegrationTest {
                     "OK",
                     "",
                     "",
-                    BigString.encode(readClassPathFile(p.toString())),
+                    readClassPathFile(p.toString()),
                     Double.toString(Math.random()),
                     "https://memex.marginalia.nu/" + file,
                     null,
@@ -133,4 +133,13 @@ public class ConvertingIntegrationTest {
         return new String(Objects.requireNonNull(ClassLoader.getSystemResourceAsStream(s)).readAllBytes());
     }
 
+
+    private Iterator<SerializableCrawlData> asSerializableCrawlData(CrawledDomain domain) {
+        List<SerializableCrawlData> data = new ArrayList<>();
+        if (domain.doc != null) {
+            data.addAll(domain.doc);
+        }
+        data.add(domain);
+        return data.iterator();
+    }
 }
diff --git a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
index 890a1081..9a79e9e9 100644
--- a/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
+++ b/code/processes/converting-process/src/test/java/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
@@ -55,7 +55,11 @@ public class CrawlingThenConvertingIntegrationTest {
 
         CrawledDomain domain = crawl(specs);
 
-        var output = domainProcessor.process(domain);
+        List<SerializableCrawlData> data = new ArrayList<>();
+        data.add(domain);
+        data.addAll(domain.doc);
+
+        var output = domainProcessor.process(data.iterator());
 
         for (var doc : output.documents) {
             if (doc.isOk()) {
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java
index 8f331a65..adb25752 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlDataReference.java
@@ -43,8 +43,7 @@ public class CrawlDataReference {
         return EasyLSH.hammingDistance(contentHashOne, contentHashOther) < 4;
     }
 
-    private long contentHash(BigString documentBody) {
-        String content = documentBody.decode();
+    private long contentHash(String content) {
         EasyLSH hash = new EasyLSH();
         int next = 0;
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
index ebdbd4f0..87251059 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
@@ -323,7 +323,7 @@ public class CrawlerRetreiver {
             return;
 
         // Sniff the software based on the sample document
-        var doc = Jsoup.parse(sample.documentBody.decode());
+        var doc = Jsoup.parse(sample.documentBody);
         crawlFrontier.setLinkFilter(linkFilterSelector.selectFilter(doc));
 
         for (var link : doc.getElementsByTag("link")) {
@@ -400,11 +400,9 @@ public class CrawlerRetreiver {
         CrawledDocument doc = reference.replaceOn304(fetchedDoc);
 
         if (doc.documentBody != null) {
-            var decoded = doc.documentBody.decode();
+            doc.documentBodyHash = createHash(doc.documentBody);
 
-            doc.documentBodyHash = createHash(decoded);
-
-            var parsedDoc = Jsoup.parse(decoded);
+            var parsedDoc = Jsoup.parse(doc.documentBody);
             EdgeUrl url = new EdgeUrl(doc.url);
 
             findLinks(url, parsedDoc);
diff --git a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
index 025c0aa9..02cba42c 100644
--- a/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
+++ b/code/processes/crawling-process/src/main/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
@@ -295,7 +295,7 @@ public class HttpFetcherImpl implements HttpFetcher {
                 .canonicalUrl(canonical)
                 .httpStatus(rsp.code())
                 .url(responseUrl.toString())
-                .documentBody(BigString.encode(strData))
+                .documentBody(strData)
                 .build();
     }
 
@@ -402,7 +402,7 @@ public class HttpFetcherImpl implements HttpFetcher {
 
     private SimpleRobotRules parseRobotsTxt(CrawledDocument doc) {
         return robotsParser.parseContent(doc.url,
-                doc.documentBody.decode().getBytes(),
+                doc.documentBody.getBytes(),
                 doc.contentType,
                 userAgent);
     }
diff --git a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
index ae8e4679..fee1d44a 100644
--- a/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
+++ b/code/processes/crawling-process/src/test/java/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java
@@ -43,13 +43,12 @@ public class CrawlerMockFetcherTest {
                 .contentType("text/html")
                 .httpStatus(200)
                 .crawlerStatus(CrawlerDocumentStatus.OK.name())
-                .documentBody(BigString.encode(documentData))
+                .documentBody(documentData)
                 .build());
     }
 
     @SneakyThrows
     private void registerUrlClasspathData(EdgeUrl url, String path) {
-        var data = BigString.encode(CommonTestData.loadTestData(path));
 
         mockData.put(url, CrawledDocument.builder()
                 .crawlId("1")
@@ -57,7 +56,7 @@ public class CrawlerMockFetcherTest {
                 .contentType("text/html")
                 .httpStatus(200)
                 .crawlerStatus(CrawlerDocumentStatus.OK.name())
-                .documentBody(data)
+                .documentBody(CommonTestData.loadTestData(path))
                 .build());
     }
 
diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/AdblockExperiment.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/AdblockExperiment.java
index 4228ed6b..da2a9272 100644
--- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/AdblockExperiment.java
+++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/AdblockExperiment.java
@@ -32,7 +32,7 @@ public class AdblockExperiment extends Experiment {
     }
 
     private void processDocument(CrawledDocument doc) {
-        Document parsedDocument = Jsoup.parse(doc.documentBody.decode());
+        Document parsedDocument = Jsoup.parse(doc.documentBody);
 
         if (simulator.hasAds(parsedDocument)) {
             System.out.println(doc.url);
diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java
index 452be709..3a318dc3 100644
--- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java
+++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java
@@ -26,7 +26,7 @@ public class DebugConverterExperiment extends Experiment {
         for (var doc : domain.doc) {
             if (doc.documentBody == null) continue;
 
-            var parsed = Jsoup.parse(doc.documentBody.decode());
+            var parsed = Jsoup.parse(doc.documentBody);
 
             var tagExtractor = new BlogSpecialization.BlogTagExtractor();
             parsed.traverse(tagExtractor);
diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java
index 7bf2f784..44f3cf18 100644
--- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java
+++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java
@@ -41,7 +41,7 @@ public class SentenceStatisticsExperiment extends Experiment {
         for (var doc : domain.doc) {
             if (doc.documentBody == null) continue;
 
-            var parsed = Jsoup.parse(doc.documentBody.decode());
+            var parsed = Jsoup.parse(doc.documentBody);
 
             parsed.body().filter(new DomPruningFilter(0.5));
 
diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java
index 3ac38b40..2882d0f2 100644
--- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java
+++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java
@@ -31,12 +31,12 @@ public class SiteStatisticsExperiment extends Experiment {
 
     @Override
     public boolean process(CrawledDomain domain) {
-        var ret = domainProcessor.process(domain);
-
-        ret.documents.stream()
-                .filter(ProcessedDocument::isProcessedFully)
-                .sorted(Comparator.comparing(doc -> doc.details.metadata.topology()))
-                .forEach(doc -> System.out.println(doc.url + ":" + doc.details.metadata));
+//        var ret = domainProcessor.process(domain);
+//
+//        ret.documents.stream()
+//                .filter(ProcessedDocument::isProcessedFully)
+//                .sorted(Comparator.comparing(doc -> doc.details.metadata.topology()))
+//                .forEach(doc -> System.out.println(doc.url + ":" + doc.details.metadata));
 
         return true;
     }
diff --git a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/TopicExperiment.java b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/TopicExperiment.java
index e70df91c..f81bbcd2 100644
--- a/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/TopicExperiment.java
+++ b/code/tools/experiment-runner/src/main/java/nu/marginalia/tools/experiments/TopicExperiment.java
@@ -37,7 +37,7 @@ public class TopicExperiment extends Experiment {
         for (var doc : domain.doc) {
             if (doc.documentBody == null) continue;
 
-            var parsed = Jsoup.parse(doc.documentBody.decode());
+            var parsed = Jsoup.parse(doc.documentBody);
 
             parsed.body().filter(new DomPruningFilter(0.5));
             var dld = se.extractSentences(parsed);
diff --git a/code/tools/term-frequency-extractor/src/main/java/nu/marginalia/tools/TermFrequencyExtractor.java b/code/tools/term-frequency-extractor/src/main/java/nu/marginalia/tools/TermFrequencyExtractor.java
index ece6a507..c5a52dd3 100644
--- a/code/tools/term-frequency-extractor/src/main/java/nu/marginalia/tools/TermFrequencyExtractor.java
+++ b/code/tools/term-frequency-extractor/src/main/java/nu/marginalia/tools/TermFrequencyExtractor.java
@@ -58,7 +58,7 @@ public class TermFrequencyExtractor {
                 continue;
 
             docCount.incrementAndGet();
-            Document parsed = Jsoup.parse(doc.documentBody.decode());
+            Document parsed = Jsoup.parse(doc.documentBody);
             parsed.body().filter(new DomPruningFilter(0.5));
 
             DocumentLanguageData dld = se.get().extractSentences(parsed);
diff --git a/run/env/service.env b/run/env/service.env
index 5553f603..ac745577 100644
--- a/run/env/service.env
+++ b/run/env/service.env
@@ -1,4 +1,4 @@
 WMSA_HOME=run/
 CONTROL_SERVICE_OPTS="-DdistPath=/dist"
 CONVERTER_OPTS="-ea -Xmx16G -XX:-CompactStrings -XX:+UseParallelGC -XX:GCTimeRatio=14 -XX:ParallelGCThreads=15"
-CRAWLER_OPTS="-Dbigstring.disabled=true -Xmx16G -XX:+UseParallelGC -XX:GCTimeRatio=14 -XX:ParallelGCThreads=15"
\ No newline at end of file
+CRAWLER_OPTS="-Xmx16G -XX:+UseParallelGC -XX:GCTimeRatio=14 -XX:ParallelGCThreads=15"
\ No newline at end of file
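
Usage sketch (illustrative, not part of the patch): after this change the converter
consumes one lazy Iterator<SerializableCrawlData> per crawled domain, so a whole
domain never needs to be resident in memory at once. A minimal consumer of the new
API might look roughly like the following, assuming the plan, processor, compiler
and processLog fields shown in the ConverterMain diff above; error handling is elided.

    // One iterator per unfinished domain; each is read lazily from the
    // zstd-compressed crawl data as DomainProcessor consumes it.
    for (Iterator<SerializableCrawlData> dataStream
            : plan.crawlDataIterable(id -> !processLog.isJobFinished(id))) {
        ProcessedDomain processed = processor.process(dataStream);
        var compiled = compiler.compile(processed);
        // ... emit the compiled instructions, then mark processed.id
        // as finished in the work log
    }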