From 3714104976061aefc84ab359c72f7c07965e6bde Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 17 Dec 2024 15:40:24 +0100 Subject: [PATCH] Add loader for slop data in converter. Also alter CrawledDocument to not require String parsing of the underlying byte[] data. This should reduce the number of large memory allocations quite significantly, hopefully reducing the GC churn a bit. --- .../processor/DocumentProcessor.java | 5 +- .../converting/processor/DomainProcessor.java | 183 +++++++++--------- .../processor/logic/DocumentValuator.java | 2 +- .../processor/logic/FeatureExtractor.java | 5 +- .../AbstractDocumentProcessorPlugin.java | 4 +- .../plugin/HtmlDocumentProcessorPlugin.java | 41 ++-- .../PlainTextDocumentProcessorPlugin.java | 2 +- .../sideload/SideloaderProcessing.java | 3 +- .../converting/ConvertingIntegrationTest.java | 4 +- .../marginalia/contenttype/ContentType.java | 31 +++ .../contenttype/DocumentBodyToString.java | 40 ++-- .../crawl/fetcher/warc/WarcRecorder.java | 6 +- .../crawl/retreival/CrawlDataReference.java | 8 +- .../crawl/retreival/CrawlerRetreiver.java | 9 +- .../retreival/revisit/CrawlerRevisitor.java | 17 +- .../revisit/DocumentWithReference.java | 12 +- .../io/SerializableCrawlDataStream.java | 42 +++- .../ParquetSerializableCrawlDataStream.java | 8 +- .../SlopSerializableCrawlDataStream.java | 17 +- .../model/body/DocumentBodyExtractor.java | 2 +- .../model/body/HttpFetchResult.java | 12 +- .../model/crawldata/CrawledDocument.java | 58 +++++- ...edDocumentParquetRecordFileWriterTest.java | 4 +- .../slop/SlopCrawlDataRecordTest.java | 97 +++++++--- .../retreival/fetcher/WarcRecorderTest.java | 4 +- .../retreival/CrawlerMockFetcherTest.java | 2 +- .../nu/marginalia/extractor/AtagExporter.java | 13 +- .../nu/marginalia/extractor/FeedExporter.java | 5 +- .../extractor/TermFrequencyExporter.java | 5 +- .../livecrawler/LiveCrawlDataSet.java | 16 +- .../livecrawler/LiveCrawlerMain.java | 2 +- .../livecrawler/LiveCrawlDataSetTest.java | 2 +- .../tools/ExperimentRunnerMain.java | 8 - .../tools/experiments/AdblockExperiment.java | 45 ----- .../experiments/DebugConverterExperiment.java | 46 ----- .../ExportExternalLinksExperiment.java | 71 ------- .../SentenceStatisticsExperiment.java | 70 ------- .../experiments/SiteStatisticsExperiment.java | 43 ---- .../tools/experiments/TestExperiment.java | 16 -- .../tools/experiments/TopicExperiment.java | 64 ------ .../test/nu/marginalia/IntegrationTest.java | 2 +- settings.gradle | 2 +- 42 files changed, 402 insertions(+), 626 deletions(-) delete mode 100644 code/tools/experiment-runner/java/nu/marginalia/tools/experiments/AdblockExperiment.java delete mode 100644 code/tools/experiment-runner/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java delete mode 100644 code/tools/experiment-runner/java/nu/marginalia/tools/experiments/ExportExternalLinksExperiment.java delete mode 100644 code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java delete mode 100644 code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java delete mode 100644 code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TestExperiment.java delete mode 100644 code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TopicExperiment.java diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentProcessor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentProcessor.java 
index d1e4d495..62da1c63 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentProcessor.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentProcessor.java @@ -19,6 +19,7 @@ import nu.marginalia.model.idx.WordFlags; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; import java.net.URISyntaxException; import java.util.ArrayList; import java.util.List; @@ -91,7 +92,7 @@ public class DocumentProcessor { DocumentClass documentClass, DocumentDecorator documentDecorator, DomainLinks externalDomainLinks, - ProcessedDocument ret) throws URISyntaxException, DisqualifiedException + ProcessedDocument ret) throws URISyntaxException, IOException, DisqualifiedException { var crawlerStatus = CrawlerDocumentStatus.valueOf(crawledDocument.crawlerStatus); @@ -109,7 +110,7 @@ public class DocumentProcessor { ret.state = crawlerStatusToUrlState(crawledDocument.crawlerStatus, crawledDocument.httpStatus); - final var plugin = findPlugin(crawledDocument); + AbstractDocumentProcessorPlugin plugin = findPlugin(crawledDocument); EdgeUrl url = new EdgeUrl(crawledDocument.url); LinkTexts linkTexts = anchorTextKeywords.getAnchorTextKeywords(externalDomainLinks, url); diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java index 619f917d..0da8db92 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -65,15 +65,15 @@ public class DomainProcessor { if (sizeHint > SIDELOAD_THRESHOLD) { // If the file is too big, we run a processing mode that doesn't // require loading the entire dataset into RAM - return sideloadProcessing(dataStream, sizeHint); + return simpleProcessing(dataStream, sizeHint); } return fullProcessing(dataStream); } - public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint, Collection<String> extraKeywords) { + public SimpleProcessing simpleProcessing(SerializableCrawlDataStream dataStream, int sizeHint, Collection<String> extraKeywords) { try { - return new SideloadProcessing(dataStream, sizeHint, extraKeywords); + return new SimpleProcessing(dataStream, sizeHint, extraKeywords); } catch (Exception ex) { logger.warn("Failed to process domain sideload", ex); @@ -81,9 +81,9 @@ public class DomainProcessor { } } - public SideloadProcessing sideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint) { + public SimpleProcessing simpleProcessing(SerializableCrawlDataStream dataStream, int sizeHint) { try { - return new SideloadProcessing(dataStream, sizeHint); + return new SimpleProcessing(dataStream, sizeHint); } catch (Exception ex) { logger.warn("Failed to process domain sideload", ex); @@ -91,93 +91,6 @@ public class DomainProcessor { } } - public class SideloadProcessing implements ConverterBatchWritableIf, SideloadSource { - private final SerializableCrawlDataStream dataStream; - private final ProcessedDomain domain; - private final DocumentDecorator documentDecorator; - private final Set<String> processedUrls = new HashSet<>(); - private final DomainLinks externalDomainLinks; - private final LshDocumentDeduplicator deduplicator = new LshDocumentDeduplicator(); - private static final ProcessingIterator.Factory iteratorFactory = ProcessingIterator.factory(8,
- Integer.getInteger("java.util.concurrent.ForkJoinPool.common.parallelism", Runtime.getRuntime().availableProcessors()) - ); - - SideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint) throws IOException { - this(dataStream, sizeHint, List.of()); - } - - SideloadProcessing(SerializableCrawlDataStream dataStream, int sizeHint, Collection<String> extraKeywords) throws IOException { - this.dataStream = dataStream; - - if (!dataStream.hasNext() || !(dataStream.next() instanceof CrawledDomain crawledDomain)) - { - throw new IllegalStateException("First record must be a domain, was " + dataStream.next().getClass().getSimpleName()); - } - - domain = new ProcessedDomain(); - domain.sizeloadSizeAdvice = sizeHint == 0 ? 10_000 : sizeHint; - - documentDecorator = new DocumentDecorator(); - documentDecorator.addTerms(extraKeywords); - - processDomain(crawledDomain, domain, documentDecorator); - - externalDomainLinks = anchorTagsSource.getAnchorTags(domain.domain); - } - - @Override - public ProcessedDomain getDomain() { - return domain; - } - - @Override - public Iterator<ProcessedDocument> getDocumentsStream() { - return iteratorFactory.create((taskConsumer) -> { - while (dataStream.hasNext()) - { - if (!(dataStream.next() instanceof CrawledDocument doc)) - continue; - if (doc.url == null || !processedUrls.add(doc.url)) - continue; - - - taskConsumer.accept(() -> { - var processedDoc = documentProcessor.process(doc, domain.domain, externalDomainLinks, documentDecorator); - - synchronized (deduplicator) { - deduplicator.markIfDuplicate(processedDoc); - } - - if (processedDoc.isProcessedFully()) { - // This is a bit sketchy, but we need to set the size and topology to something - processedDoc.details.metadata = processedDoc.details.metadata.withSizeAndTopology( - 10_000, externalDomainLinks.countForUrl(processedDoc.url)); - } - - return processedDoc; - }); - } - }); - } - - @Override - public void write(ConverterBatchWriter writer) throws IOException { - writer.writeSideloadSource(this); - } - - @Override - public String id() { - return domain.domain.toString(); - } - - @Override - public void close() throws Exception { - dataStream.close(); - deduplicator.close(); - } - } - - @Nullable public ProcessedDomain fullProcessing(SerializableCrawlDataStream dataStream) { try { @@ -209,7 +122,7 @@ public class DomainProcessor { continue; if (doc.url == null) continue; - if (doc.documentBody.isBlank()) + if (doc.documentBodyBytes.length == 0) continue; if (!processedUrls.add(doc.url)) continue; @@ -236,6 +149,90 @@ public class DomainProcessor { } } + /** The simple processing track processes documents individually, and does not perform any domain-level analysis. + * This is needed to process extremely large domains, which would otherwise eat up too much RAM.
+ */ + public class SimpleProcessing implements ConverterBatchWritableIf, SideloadSource { + private final SerializableCrawlDataStream dataStream; + private final ProcessedDomain domain; + private final DocumentDecorator documentDecorator; + private final Set<String> processedUrls = new HashSet<>(); + private final DomainLinks externalDomainLinks; + private final LshDocumentDeduplicator deduplicator = new LshDocumentDeduplicator(); + private static final ProcessingIterator.Factory iteratorFactory = ProcessingIterator.factory(8, + Integer.getInteger("java.util.concurrent.ForkJoinPool.common.parallelism", Runtime.getRuntime().availableProcessors()) + ); + + SimpleProcessing(SerializableCrawlDataStream dataStream, int sizeHint) throws IOException { + this(dataStream, sizeHint, List.of()); + } + + SimpleProcessing(SerializableCrawlDataStream dataStream, int sizeHint, Collection<String> extraKeywords) throws IOException { + this.dataStream = dataStream; + + if (!dataStream.hasNext() || !(dataStream.next() instanceof CrawledDomain crawledDomain)) + { + throw new IllegalStateException("First record must be a domain, was " + dataStream.next().getClass().getSimpleName()); + } + + domain = new ProcessedDomain(); + domain.sizeloadSizeAdvice = sizeHint == 0 ? 10_000 : sizeHint; + + documentDecorator = new DocumentDecorator(); + documentDecorator.addTerms(extraKeywords); + + processDomain(crawledDomain, domain, documentDecorator); + + externalDomainLinks = anchorTagsSource.getAnchorTags(domain.domain); + } + + @Override + public ProcessedDomain getDomain() { + return domain; + } + + @Override + public Iterator<ProcessedDocument> getDocumentsStream() { + return dataStream.map((next) -> { + if (!(next instanceof CrawledDocument doc)) + return Optional.empty(); + + if (doc.url == null || !processedUrls.add(doc.url)) + return Optional.empty(); + + var processedDoc = documentProcessor.process(doc, domain.domain, externalDomainLinks, documentDecorator); + + synchronized (deduplicator) { + deduplicator.markIfDuplicate(processedDoc); + } + + if (processedDoc.isProcessedFully()) { + // This is a bit sketchy, but we need to set the size and topology to something + processedDoc.details.metadata = processedDoc.details.metadata.withSizeAndTopology( + 10_000, externalDomainLinks.countForUrl(processedDoc.url)); + } + + return Optional.of(processedDoc); + }); + } + + @Override + public void write(ConverterBatchWriter writer) throws IOException { + writer.writeSideloadSource(this); + } + + @Override + public String id() { + return domain.domain.toString(); + } + + @Override + public void close() throws Exception { + dataStream.close(); + deduplicator.close(); + } + } + private void processDomain(CrawledDomain crawledDomain, ProcessedDomain domain, DocumentDecorator decorator) diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentValuator.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentValuator.java index 1c959dee..c96602e8 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentValuator.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentValuator.java @@ -24,7 +24,7 @@ public class DocumentValuator { double scriptPenalty = getScriptPenalty(parsedDocument); double chatGptPenalty = getChatGptContentFarmPenalty(parsedDocument); - int rawLength = crawledDocument.documentBody.length(); + int rawLength = crawledDocument.documentBodyBytes.length; if (textLength == 0) { throw
new DisqualifiedException(DisqualifiedException.DisqualificationReason.LENGTH); diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java index f2bac097..b1a8bbb4 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java @@ -218,7 +218,10 @@ public class FeatureExtractor { } } - if (features.contains(HtmlFeature.JS) && adblockSimulator.hasAds(doc.clone())) { + if (features.contains(HtmlFeature.JS) + // remove while disabled to get rid of expensive clone() call: + // adblockSimulator.hasAds(doc.clone()) + ) { features.add(HtmlFeature.ADVERTISEMENT); } diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java index b03468ca..36a76ab3 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java @@ -14,6 +14,7 @@ import nu.marginalia.model.crawldata.CrawledDocument; import nu.marginalia.model.html.HtmlStandard; import javax.annotation.Nullable; +import java.io.IOException; import java.net.URISyntaxException; import java.util.HashSet; import java.util.List; @@ -25,7 +26,7 @@ public abstract class AbstractDocumentProcessorPlugin { this.languageFilter = languageFilter; } - public abstract DetailsWithWords createDetails(CrawledDocument crawledDocument, LinkTexts linkTexts, DocumentClass documentClass) throws DisqualifiedException, URISyntaxException; + public abstract DetailsWithWords createDetails(CrawledDocument crawledDocument, LinkTexts linkTexts, DocumentClass documentClass) throws DisqualifiedException, URISyntaxException, IOException; public abstract boolean isApplicable(CrawledDocument doc); protected void checkDocumentLanguage(DocumentLanguageData dld) throws DisqualifiedException { @@ -86,6 +87,7 @@ public abstract class AbstractDocumentProcessorPlugin { return this; } + public MetaTagsBuilder addPubDate(PubDate pubDate) { if (pubDate.year() > 1900) { diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index 94056f03..b5201dbd 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -6,6 +6,7 @@ import nu.marginalia.converting.model.DisqualifiedException; import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.model.GeneratorType; import nu.marginalia.converting.model.ProcessedDocumentDetails; +import nu.marginalia.converting.processor.AcceptableAds; import nu.marginalia.converting.processor.DocumentClass; import nu.marginalia.converting.processor.MetaRobotsTag; import nu.marginalia.converting.processor.logic.*; @@ -32,11 +33,11 @@ import 
nu.marginalia.model.crawldata.CrawledDocument; import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.model.idx.DocumentMetadata; -import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.IOException; import java.net.URISyntaxException; import java.util.EnumSet; import java.util.HashSet; @@ -51,7 +52,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin private final double minDocumentQuality; private final FeatureExtractor featureExtractor; - private final TitleExtractor titleExtractor; private final DocumentKeywordExtractor keywordExtractor; private final PubDateSniffer pubDateSniffer; @@ -74,7 +74,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin @Named("min-document-quality") Double minDocumentQuality, LanguageFilter languageFilter, FeatureExtractor featureExtractor, - TitleExtractor titleExtractor, DocumentKeywordExtractor keywordExtractor, PubDateSniffer pubDateSniffer, DocumentLengthLogic documentLengthLogic, @@ -89,7 +88,6 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin this.minDocumentQuality = minDocumentQuality; this.featureExtractor = featureExtractor; - this.titleExtractor = titleExtractor; this.keywordExtractor = keywordExtractor; this.pubDateSniffer = pubDateSniffer; this.metaRobotsTag = metaRobotsTag; @@ -108,19 +106,17 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin public DetailsWithWords createDetails(CrawledDocument crawledDocument, LinkTexts linkTexts, DocumentClass documentClass) - throws DisqualifiedException, URISyntaxException { + throws DisqualifiedException, URISyntaxException, IOException { - String documentBody = crawledDocument.documentBody; - - if (languageFilter.isBlockedUnicodeRange(documentBody)) { + if (languageFilter.isBlockedUnicodeRange(crawledDocument.documentBody(512))) { throw new DisqualifiedException(DisqualificationReason.LANGUAGE); } - if (documentBody.length() > MAX_DOCUMENT_LENGTH_BYTES) { // 128kb - documentBody = documentBody.substring(0, MAX_DOCUMENT_LENGTH_BYTES); - } + Document doc = crawledDocument.parseBody(); - Document doc = Jsoup.parse(documentBody); + if (AcceptableAds.hasAcceptableAdsTag(doc)) { + throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.ACCEPTABLE_ADS); + } if (!metaRobotsTag.allowIndexingByMetaTag(doc)) { throw new DisqualifiedException(DisqualificationReason.FORBIDDEN); @@ -138,32 +134,33 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin } var prunedDoc = specialization.prune(doc); - DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(prunedDoc); - checkDocumentLanguage(dld); - - var ret = new ProcessedDocumentDetails(); final int length = getLength(doc); final HtmlStandard standard = getHtmlStandard(doc); final double quality = documentValuator.getQuality(crawledDocument, standard, doc, length); + if (isDisqualified(documentClass, url, quality, doc.title())) { + throw new DisqualifiedException(DisqualificationReason.QUALITY); + } + + DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(prunedDoc); + + checkDocumentLanguage(dld); + documentLengthLogic.validateLength(dld, specialization.lengthModifier() * documentClass.lengthLimitModifier()); + + var ret = new ProcessedDocumentDetails(); + ret.length = length; ret.standard = standard; ret.title = 
specialization.getTitle(doc, dld, crawledDocument.url); - documentLengthLogic.validateLength(dld, specialization.lengthModifier() * documentClass.lengthLimitModifier()); - final Set<HtmlFeature> features = featureExtractor.getFeatures(url, doc, documentHeaders, dld); ret.features = features; ret.quality = documentValuator.adjustQuality(quality, features); ret.hashCode = dld.localitySensitiveHashCode(); - if (isDisqualified(documentClass, url, quality, ret.title)) { - throw new DisqualifiedException(DisqualificationReason.QUALITY); - } - PubDate pubDate = pubDateSniffer.getPubDate(documentHeaders, url, doc, standard, true); EnumSet<DocumentFlags> documentFlags = documentFlags(features, generatorParts.type()); diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java index 2007a5ed..838fa6c6 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java @@ -71,7 +71,7 @@ public class PlainTextDocumentProcessorP DocumentClass documentClass) throws DisqualifiedException, URISyntaxException { - String documentBody = crawledDocument.documentBody; + String documentBody = crawledDocument.documentBody(); if (languageFilter.isBlockedUnicodeRange(documentBody)) { throw new DisqualifiedException(DisqualifiedException.DisqualificationReason.LANGUAGE); diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java index 95729851..3418fcbe 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java @@ -19,6 +19,7 @@ import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.WordFlags; import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; import java.time.LocalDateTime; import java.util.EnumSet; import java.util.List; @@ -50,7 +51,7 @@ public class SideloaderProcessing { "OK", "NP", "", - body, + body.getBytes(StandardCharsets.UTF_8), false, null, null diff --git a/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java index 7cc451b4..0e5bbf39 100644 --- a/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java +++ b/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java @@ -98,7 +98,7 @@ public class ConvertingIntegrationTest { @Test public void testMemexMarginaliaNuSideloadProcessing() throws IOException { - var ret = domainProcessor.sideloadProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()), 100); + var ret = domainProcessor.simpleProcessing(asSerializableCrawlData(readMarginaliaWorkingSet()), 100); assertNotNull(ret); assertEquals("memex.marginalia.nu", ret.id()); @@ -146,7 +146,7 @@ public class ConvertingIntegrationTest { "OK", "", "", - readClassPathFile(p.toString()), + readClassPathFile(p.toString()).getBytes(), false, null, null
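The String-to-byte[] switch above is the heart of this patch: consumers now decode only as much of the body as they need. A minimal sketch of that decode-on-demand idea (the class and sample data here are invented for illustration, and UTF-8 is assumed; the real logic is CrawledDocument.documentBody(int), further down in this patch, which picks the charset from the content type):

    import java.nio.charset.StandardCharsets;

    class BodySampleSketch {
        // Decode at most sampleSize bytes instead of materializing the whole
        // document as a String; a multi-byte UTF-8 sequence may be cut at the
        // boundary, which is tolerable for sampling/filtering purposes.
        static String sample(byte[] body, int sampleSize) {
            int n = Math.min(sampleSize, body.length);
            return new String(body, 0, n, StandardCharsets.UTF_8);
        }

        public static void main(String[] args) {
            byte[] body = "<html><head><title>hi</title></head></html>"
                    .getBytes(StandardCharsets.UTF_8);
            // e.g. a unicode-range language filter only needs a small prefix:
            System.out.println(sample(body, 16)); // prints "<html><head><tit"
        }
    }

This mirrors how HtmlDocumentProcessorPlugin now calls crawledDocument.documentBody(512) for the language check instead of decoding the full body.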
diff --git a/code/processes/crawling-process/ft-content-type/java/nu/marginalia/contenttype/ContentType.java b/code/processes/crawling-process/ft-content-type/java/nu/marginalia/contenttype/ContentType.java index 86b6aa68..35f47b5c 100644 --- a/code/processes/crawling-process/ft-content-type/java/nu/marginalia/contenttype/ContentType.java +++ b/code/processes/crawling-process/ft-content-type/java/nu/marginalia/contenttype/ContentType.java @@ -2,6 +2,11 @@ package nu.marginalia.contenttype; import org.apache.commons.lang3.StringUtils; +import java.nio.charset.Charset; +import java.nio.charset.IllegalCharsetNameException; +import java.nio.charset.StandardCharsets; +import java.nio.charset.UnsupportedCharsetException; + /** Content type and charset of a document * @param contentType The content type, e.g. "text/html" * @param charset The charset, e.g. "UTF-8" @@ -18,6 +23,32 @@ public record ContentType(String contentType, String charset) { return new ContentType(contentType, charset); } + public Charset asCharset() { + try { + int eqAt = charset.indexOf('='); + String charset = this.charset; + if (eqAt >= 0) { + charset = charset.substring(eqAt + 1); + } + if (Charset.isSupported(charset)) { + return Charset.forName(charset); + } else { + return StandardCharsets.UTF_8; + } + } + catch (IllegalCharsetNameException ex) { + // Fall back to UTF-8 if we don't understand what this is. It's *probably* fine? Maybe? + return StandardCharsets.UTF_8; + } + catch (UnsupportedCharsetException ex) { + // This is usually like Macintosh Latin + // (https://en.wikipedia.org/wiki/Macintosh_Latin_encoding) + // + // It's close enough to 8859-1 to serve + return StandardCharsets.ISO_8859_1; + } + } + public boolean is(String contentType) { return this.contentType.equalsIgnoreCase(contentType); } diff --git a/code/processes/crawling-process/ft-content-type/java/nu/marginalia/contenttype/DocumentBodyToString.java b/code/processes/crawling-process/ft-content-type/java/nu/marginalia/contenttype/DocumentBodyToString.java index 8187871e..22949489 100644 --- a/code/processes/crawling-process/ft-content-type/java/nu/marginalia/contenttype/DocumentBodyToString.java +++ b/code/processes/crawling-process/ft-content-type/java/nu/marginalia/contenttype/DocumentBodyToString.java @@ -1,9 +1,12 @@ package nu.marginalia.contenttype; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; + +import java.io.ByteArrayInputStream; +import java.io.IOException; import java.nio.charset.Charset; -import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.StandardCharsets; -import java.nio.charset.UnsupportedCharsetException; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; @@ -23,24 +26,25 @@ public class DocumentBodyToString { return new String(data, charset); } + public static Document getParsedData(ContentType type, byte[] data, String url) throws IOException { + final Charset charset; + + if (type.charset() == null || type.charset().isBlank()) { + charset = StandardCharsets.UTF_8; + } else { + charset = charsetMap.computeIfAbsent(type, DocumentBodyToString::computeCharset); + } + + ByteArrayInputStream bais = new ByteArrayInputStream(data); + + return Jsoup.parse(bais, charset.name(), url); + } + private static Charset computeCharset(ContentType type) { - try { - if (type.charset() == null || type.charset().isBlank()) - return StandardCharsets.UTF_8; - else { - return Charset.forName(type.charset()); - } - } - catch (IllegalCharsetNameException ex) { - // Fall back to UTF-8 if we don't 
understand what this is. It's *probably* fine? Maybe? + if (type.charset() == null || type.charset().isBlank()) return StandardCharsets.UTF_8; - } - catch (UnsupportedCharsetException ex) { - // This is usually like Macintosh Latin - // (https://en.wikipedia.org/wiki/Macintosh_Latin_encoding) - // - // It's close enough to 8859-1 to serve - return StandardCharsets.ISO_8859_1; + else { + return type.asCharset(); } } } diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcRecorder.java b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcRecorder.java index 2c099e30..a2cfa4e6 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcRecorder.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/fetcher/warc/WarcRecorder.java @@ -184,7 +184,7 @@ public class WarcRecorder implements AutoCloseable { writer.write(item); } - private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, String documentBody, @Nullable String headers, ContentTags contentTags) { + private void saveOldResponse(EdgeUrl url, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags contentTags) { try { WarcDigestBuilder responseDigestBuilder = new WarcDigestBuilder(); WarcDigestBuilder payloadDigestBuilder = new WarcDigestBuilder(); @@ -194,7 +194,7 @@ public class WarcRecorder implements AutoCloseable { if (documentBody == null) { bytes = new byte[0]; } else { - bytes = documentBody.getBytes(); + bytes = documentBody; } // Create a synthesis of custom headers and the original headers @@ -263,7 +263,7 @@ public class WarcRecorder implements AutoCloseable { * an E-Tag or Last-Modified header, and the server responds with a 304 Not Modified. In this * scenario we want to record the data as it was in the previous crawl, but not re-fetch it. 
*/ - public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, String documentBody, @Nullable String headers, ContentTags ctags) { + public void writeReferenceCopy(EdgeUrl url, String contentType, int statusCode, byte[] documentBody, @Nullable String headers, ContentTags ctags) { saveOldResponse(url, contentType, statusCode, documentBody, headers, ctags); } diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java index 98133bcf..8e1838a9 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java @@ -58,7 +58,7 @@ public class CrawlDataReference implements AutoCloseable { return null; } - public static boolean isContentBodySame(String one, String other) { + public static boolean isContentBodySame(byte[] one, byte[] other) { final long contentHashOne = contentHash(one); final long contentHashOther = contentHash(other); @@ -66,7 +66,7 @@ public class CrawlDataReference implements AutoCloseable { return EasyLSH.hammingDistance(contentHashOne, contentHashOther) < 4; } - private static long contentHash(String content) { + private static long contentHash(byte[] content) { EasyLSH hash = new EasyLSH(); int next = 0; @@ -74,8 +74,8 @@ public class CrawlDataReference implements AutoCloseable { // In a naive best-effort fashion, extract the text // content of the document and feed it into the LSH - for (int i = 0; i < content.length(); i++) { - char c = content.charAt(i); + for (byte b : content) { + char c = (char) b; if (c == '<') { isInTag = true; } else if (c == '>') { diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index ace2059b..df5cd098 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -17,7 +17,6 @@ import nu.marginalia.link_parser.LinkParser; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.body.HttpFetchResult; -import org.jsoup.Jsoup; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -283,14 +282,14 @@ public class CrawlerRetreiver implements AutoCloseable { else if (fetchedDoc instanceof HttpFetchResult.Result304Raw && reference.doc() != null) { var doc = reference.doc(); - warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBody, doc.headers, contentTags); + warcRecorder.writeReferenceCopy(top, doc.contentType, doc.httpStatus, doc.documentBodyBytes, doc.headers, contentTags); fetchedDoc = new HttpFetchResult.Result304ReplacedWithReference(doc.url, new ContentType(doc.contentType, "UTF-8"), - doc.documentBody); + doc.documentBodyBytes); - if (doc.documentBody != null) { - var parsed = Jsoup.parse(doc.documentBody); + if (doc.documentBodyBytes != null) { + var parsed = doc.parseBody(); crawlFrontier.enqueueLinksFromDocument(top, parsed); crawlFrontier.addVisited(top); diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java index 7f3c6c22..67835eeb 100644 --- 
a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java @@ -1,6 +1,5 @@ package nu.marginalia.crawl.retreival.revisit; -import com.google.common.base.Strings; import crawlercommons.robots.SimpleRobotRules; import nu.marginalia.crawl.fetcher.ContentTags; import nu.marginalia.crawl.fetcher.warc.WarcRecorder; @@ -11,7 +10,8 @@ import nu.marginalia.crawl.retreival.DomainCrawlFrontier; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.body.HttpFetchResult; import nu.marginalia.model.crawldata.CrawledDocument; -import org.jsoup.Jsoup; + +import java.io.IOException; /** This class encapsulates the logic for re-visiting a domain that has already been crawled. * We may use information from the previous crawl to inform the next crawl, specifically the @@ -70,7 +70,7 @@ public class CrawlerRevisitor { // unlikely to produce anything meaningful for us. if (doc.httpStatus != 200) continue; - if (Strings.isNullOrEmpty(doc.documentBody)) + if (!doc.hasBody()) continue; if (!crawlFrontier.filterLink(url)) @@ -117,14 +117,19 @@ public class CrawlerRevisitor { // fashion to make sure we eventually catch changes over time // and ensure we discover new links - // Hoover up any links from the document - crawlFrontier.enqueueLinksFromDocument(url, Jsoup.parse(doc.documentBody)); + try { + // Hoover up any links from the document + crawlFrontier.enqueueLinksFromDocument(url, doc.parseBody()); + } + catch (IOException ex) { + // + } // Add a WARC record so we don't repeat this warcRecorder.writeReferenceCopy(url, doc.contentType, doc.httpStatus, - doc.documentBody, + doc.documentBodyBytes, doc.headers, new ContentTags(doc.etagMaybe, doc.lastModifiedMaybe) ); diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java index c24ef754..83a775f0 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java @@ -2,8 +2,6 @@ package nu.marginalia.crawl.retreival.revisit; import nu.marginalia.crawl.fetcher.ContentTags; import nu.marginalia.crawl.retreival.CrawlDataReference; -import nu.marginalia.model.body.DocumentBodyExtractor; -import nu.marginalia.model.body.DocumentBodyResult; import nu.marginalia.model.body.HttpFetchResult; import nu.marginalia.model.crawldata.CrawledDocument; @@ -35,21 +33,17 @@ public record DocumentWithReference( return false; if (doc == null) return false; - if (doc.documentBody == null) + if (doc.documentBodyBytes.length == 0) return false; - if (!(DocumentBodyExtractor.asString(resultOk) instanceof DocumentBodyResult.Ok bodyOk)) { - return false; - } - - return CrawlDataReference.isContentBodySame(doc.documentBody, bodyOk.body()); + return CrawlDataReference.isContentBodySame(doc.documentBodyBytes, resultOk.bytesRaw()); } public ContentTags getContentTags() { if (null == doc) return ContentTags.empty(); - if (doc.documentBody == null || doc.httpStatus != 200) + if (doc.documentBodyBytes.length == 0 || doc.httpStatus != 200) return ContentTags.empty(); String lastmod = doc.getLastModified(); diff --git a/code/processes/crawling-process/model/java/nu/marginalia/io/SerializableCrawlDataStream.java 
b/code/processes/crawling-process/model/java/nu/marginalia/io/SerializableCrawlDataStream.java index 1d7006d9..2fdc6d2d 100644 --- a/code/processes/crawling-process/model/java/nu/marginalia/io/SerializableCrawlDataStream.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/io/SerializableCrawlDataStream.java @@ -2,10 +2,14 @@ package nu.marginalia.io; import nu.marginalia.model.crawldata.SerializableCrawlData; import org.jetbrains.annotations.Nullable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Path; import java.util.Iterator; +import java.util.Optional; +import java.util.function.Function; /** Closable iterator exceptional over serialized crawl data * The data may appear in any order, and the iterator must be closed. * @see CrawledDomainReader * */ public interface SerializableCrawlDataStream extends AutoCloseable { - + Logger logger = LoggerFactory.getLogger(SerializableCrawlDataStream.class); SerializableCrawlData next() throws IOException; @@ -26,6 +30,41 @@ public interface SerializableCrawlDataStream extends AutoCloseable { @Nullable default Path path() { return null; } + default <T> Iterator<T> map(Function<SerializableCrawlData, Optional<T>> mapper) { + return new Iterator<>() { + T next = null; + + public boolean hasNext() { + if (next != null) + return true; + try { + while (SerializableCrawlDataStream.this.hasNext()) { + var val = mapper.apply(SerializableCrawlDataStream.this.next()); + if (val.isPresent()) { + next = val.get(); + return true; + } + } + } + catch (IOException ex) { + logger.error("Error during stream", ex); + } + + return false; + } + + public T next() { + if (next == null && !hasNext()) + throw new IllegalStateException("No more data to read"); + + T ret = next; + next = null; + return ret; + } + }; + + } + // Dummy iterator over nothing static SerializableCrawlDataStream empty() { return new SerializableCrawlDataStream() { @@ -46,7 +85,6 @@ public interface SerializableCrawlDataStream extends AutoCloseable { public boolean hasNext() { return iterator.hasNext(); } public void close() {} }; - } } diff --git a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java index b8dea5bc..4a970384 100644 --- a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java @@ -1,7 +1,6 @@ package nu.marginalia.io.crawldata.format; import nu.marginalia.contenttype.ContentType; -import nu.marginalia.contenttype.DocumentBodyToString; import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.io.SerializableCrawlDataStream; import nu.marginalia.model.EdgeUrl; @@ -18,6 +17,7 @@ import java.nio.file.Path; import java.util.*; import java.util.stream.Stream; +@Deprecated public class ParquetSerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { private static final Logger logger = LoggerFactory.getLogger(ParquetSerializableCrawlDataStream.class); @@ -124,9 +124,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial } else if (nextRecord.body != null) { try { - bodyString = DocumentBodyToString.getStringData( - ContentType.parse(nextRecord.contentType), - nextRecord.body); +
ContentType.parse(nextRecord.contentType); } catch (Exception ex) { logger.error("Failed to convert body to string", ex); status = CrawlerDocumentStatus.BAD_CHARSET; @@ -147,7 +145,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial status.toString(), "", nextRecord.headers, - bodyString, + nextRecord.body, // this field isn't actually used, maybe we can skip calculating it? nextRecord.cookies, lastModified, diff --git a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/SlopSerializableCrawlDataStream.java b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/SlopSerializableCrawlDataStream.java index 08d92045..5d0f7ff8 100644 --- a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/SlopSerializableCrawlDataStream.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/SlopSerializableCrawlDataStream.java @@ -1,7 +1,6 @@ package nu.marginalia.io.crawldata.format; import nu.marginalia.contenttype.ContentType; -import nu.marginalia.contenttype.DocumentBodyToString; import nu.marginalia.io.SerializableCrawlDataStream; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawldata.*; @@ -36,7 +35,14 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa reader = new SlopCrawlDataRecord.FilteringReader(file) { @Override public boolean filter(String url, int status, String contentType) { - return contentType.toLowerCase().startsWith("text/"); + String ctLc = contentType.toLowerCase(); + + if (ctLc.startsWith("text/")) + return true; + else if (ctLc.startsWith("x-marginalia/")) + return true; + + return false; } }; } @@ -121,7 +127,6 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa } private void createDocumentRecord(SlopCrawlDataRecord nextRecord) { - String bodyString = ""; CrawlerDocumentStatus status = CrawlerDocumentStatus.OK; if (nextRecord.contentType().startsWith("x-marginalia/advisory;state=content-type-failed-probe")) { @@ -136,9 +141,7 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa } else if (nextRecord.body() != null) { try { - bodyString = DocumentBodyToString.getStringData( - ContentType.parse(nextRecord.contentType()), - nextRecord.body()); + ContentType.parse(nextRecord.contentType()); } catch (Exception ex) { logger.error("Failed to convert body to string", ex); status = CrawlerDocumentStatus.BAD_CHARSET; @@ -156,7 +159,7 @@ public class SlopSerializableCrawlDataStream implements AutoCloseable, Serializa status.toString(), "", nextRecord.headers(), - bodyString, + nextRecord.body(), // this field isn't actually used, maybe we can skip calculating it? 
nextRecord.cookies(), null, diff --git a/code/processes/crawling-process/model/java/nu/marginalia/model/body/DocumentBodyExtractor.java b/code/processes/crawling-process/model/java/nu/marginalia/model/body/DocumentBodyExtractor.java index ebd3d33e..ceb71545 100644 --- a/code/processes/crawling-process/model/java/nu/marginalia/model/body/DocumentBodyExtractor.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/model/body/DocumentBodyExtractor.java @@ -18,7 +18,7 @@ public class DocumentBodyExtractor { return asBytes(fetchOk); } else if (result instanceof HttpFetchResult.Result304ReplacedWithReference retained) { - return new DocumentBodyResult.Ok<>(retained.contentType(), retained.body().getBytes()); + return new DocumentBodyResult.Ok<>(retained.contentType(), retained.body()); } return new DocumentBodyResult.Error<>(CrawlerDocumentStatus.ERROR, "Fetch Result Not Ok"); diff --git a/code/processes/crawling-process/model/java/nu/marginalia/model/body/HttpFetchResult.java b/code/processes/crawling-process/model/java/nu/marginalia/model/body/HttpFetchResult.java index d3fd41b0..c34c9327 100644 --- a/code/processes/crawling-process/model/java/nu/marginalia/model/body/HttpFetchResult.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/model/body/HttpFetchResult.java @@ -114,20 +114,10 @@ public sealed interface HttpFetchResult { * * @see Result304Raw for the case where the document has not yet been replaced with the reference data. */ - record Result304ReplacedWithReference(String url, ContentType contentType, String body) implements HttpFetchResult { - + record Result304ReplacedWithReference(String url, ContentType contentType, byte[] body) implements HttpFetchResult { public boolean isOk() { return true; } - - public Optional<Document> parseDocument() { - try { - return Optional.of(Jsoup.parse(body)); - } - catch (Exception ex) { - return Optional.empty(); - } - } } /** Fetching resulted in an exception */ diff --git a/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java index 656e4b0f..86fcb6d2 100644 --- a/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java @@ -1,8 +1,16 @@ package nu.marginalia.model.crawldata; +import nu.marginalia.contenttype.ContentType; +import nu.marginalia.contenttype.DocumentBodyToString; import nu.marginalia.model.EdgeUrl; import org.apache.commons.lang3.StringUtils; import org.jetbrains.annotations.Nullable; +import org.jsoup.nodes.Document; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Objects; public final class CrawledDocument implements SerializableCrawlData { public String crawlId; @@ -19,8 +27,37 @@ public final class CrawledDocument implements SerializableCrawlData { @Nullable public String headers; - public String documentBody; + public String documentBody() { + return DocumentBodyToString.getStringData( + ContentType.parse(contentType), + documentBodyBytes); + } + public Document parseBody() throws IOException { + return DocumentBodyToString.getParsedData( + ContentType.parse(contentType), + documentBodyBytes, + url); + } + + public boolean hasBody() { + return documentBodyBytes.length > 0; + } + + public String documentBody(int sampleSize) { + if (sampleSize >= documentBodyBytes.length) {
return documentBody(); + } + + byte[] bytes = new byte[sampleSize]; + System.arraycopy(documentBodyBytes, 0, bytes, 0, sampleSize); + + return DocumentBodyToString.getStringData( + ContentType.parse(contentType), + bytes); + } + + public byte[] documentBodyBytes; /** * This is not guaranteed to be set in all versions of the format, * information may come in CrawledDomain instead @@ -30,7 +67,7 @@ public final class CrawledDocument implements SerializableCrawlData { public String lastModifiedMaybe; public String etagMaybe; - public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, String documentBody, Boolean hasCookies, String lastModifiedMaybe, String etagMaybe) { + public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, byte[] documentBodyBytes, Boolean hasCookies, String lastModifiedMaybe, String etagMaybe) { this.crawlId = crawlId; this.url = url; this.contentType = contentType; @@ -39,7 +76,7 @@ public final class CrawledDocument implements SerializableCrawlData { this.crawlerStatus = crawlerStatus; this.crawlerStatusDesc = crawlerStatusDesc; this.headers = headers; - this.documentBody = documentBody; + this.documentBodyBytes = Objects.requireNonNullElse(documentBodyBytes, new byte[] {}); this.hasCookies = hasCookies; this.lastModifiedMaybe = lastModifiedMaybe; this.etagMaybe = etagMaybe; @@ -106,7 +143,7 @@ public final class CrawledDocument implements SerializableCrawlData { } public String toString() { - return "CrawledDocument(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBody=" + this.documentBody + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")"; + return "CrawledDocument(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBody=" + documentBody() + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")"; } public static class CrawledDocumentBuilder { @@ -118,7 +155,7 @@ public final class CrawledDocument implements SerializableCrawlData { private String crawlerStatus; private String crawlerStatusDesc; private @Nullable String headers; - private String documentBody; + private byte[] documentBodyBytes = new byte[0]; private String recrawlState; private Boolean hasCookies; private String lastModifiedMaybe; @@ -168,10 +205,13 @@ public final class CrawledDocument implements SerializableCrawlData { } public CrawledDocumentBuilder documentBody(String documentBody) { - this.documentBody = documentBody; + this.documentBodyBytes = documentBody.getBytes(StandardCharsets.UTF_8); + return this; + } + public CrawledDocumentBuilder documentBodyBytes(byte[] documentBodyBytes) { + this.documentBodyBytes = documentBodyBytes; return this; } - @Deprecated public CrawledDocumentBuilder recrawlState(String recrawlState) { this.recrawlState = 
recrawlState; @@ -194,11 +234,11 @@ public final class CrawledDocument implements SerializableCrawlData { } public CrawledDocument build() { - return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBody, this.hasCookies, this.lastModifiedMaybe, this.etagMaybe); + return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBodyBytes, this.hasCookies, this.lastModifiedMaybe, this.etagMaybe); } public String toString() { - return "CrawledDocument.CrawledDocumentBuilder(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBody=" + this.documentBody + ", recrawlState=" + this.recrawlState + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")"; + return "CrawledDocument.CrawledDocumentBuilder(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBodyBytes=" + Arrays.toString(this.documentBodyBytes) + ", recrawlState=" + this.recrawlState + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")"; } } } diff --git a/code/processes/crawling-process/model/test/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java b/code/processes/crawling-process/model/test/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java index cef2a0f2..9809078d 100644 --- a/code/processes/crawling-process/model/test/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java +++ b/code/processes/crawling-process/model/test/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java @@ -80,7 +80,7 @@ class CrawledDocumentParquetRecordFileWriterTest { var document = (CrawledDocument) secondItem; assertEquals("https://www.marginalia.nu/", document.url); assertEquals("text/html", document.contentType); - assertEquals("hello world", document.documentBody); + assertEquals("hello world", document.documentBody()); assertEquals(200, document.httpStatus); } @@ -103,7 +103,7 @@ class CrawledDocumentParquetRecordFileWriterTest { System.out.println(doc.url); System.out.println(doc.contentType); System.out.println(doc.httpStatus); - System.out.println(doc.documentBody.length()); + System.out.println(doc.documentBody().length()); } } } catch (IOException e) { diff --git a/code/processes/crawling-process/model/test/nu/marginalia/slop/SlopCrawlDataRecordTest.java b/code/processes/crawling-process/model/test/nu/marginalia/slop/SlopCrawlDataRecordTest.java index e1fb2799..d2f84c30 100644 --- a/code/processes/crawling-process/model/test/nu/marginalia/slop/SlopCrawlDataRecordTest.java +++ b/code/processes/crawling-process/model/test/nu/marginalia/slop/SlopCrawlDataRecordTest.java @@ -1,10 +1,11 @@ package nu.marginalia.slop; -import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader; +import 
nu.marginalia.contenttype.ContentType; +import org.jsoup.Jsoup; import org.junit.jupiter.api.Test; +import java.io.ByteArrayInputStream; import java.io.IOException; -import java.nio.file.Files; import java.nio.file.Path; import java.time.Duration; import java.time.Instant; @@ -14,39 +15,71 @@ class SlopCrawlDataRecordTest { @Test public void test() throws IOException { - Files.deleteIfExists(Path.of("/tmp/steam.slop.zip")); - SlopCrawlDataRecord.convertFromParquet( - Path.of("/home/vlofgren/Downloads/_storage_crawl-data__23-10-21T15_08_43.750_3b_41_3b41e714-store.steampowered.com.parquet"), - Path.of("/tmp/steam.slop.zip") - ); +// Files.deleteIfExists(Path.of("/tmp/steam.slop.zip")); +// SlopCrawlDataRecord.convertFromParquet( +// Path.of("/home/vlofgren/Downloads/_storage_crawl-data__23-10-21T15_08_43.750_3b_41_3b41e714-store.steampowered.com.parquet"), +// Path.of("/tmp/steam.slop.zip") +// ); +// long st = 0; +// try (var reader = new SlopCrawlDataRecord.FilteringReader(Path.of("/tmp/steam.slop.zip")) { +// public boolean filter(String url, int status, String contentType) { +// return contentType.startsWith("text/html"); +// } +// }) { +// Instant start = Instant.now(); +// while (reader.hasRemaining()) { +// var next = reader.get(); +// byte[] body = next.body(); +// +// st += Jsoup.parse(new String(body)).title().length(); +// } +// System.out.println(st + " " + Duration.between(start, Instant.now())); +// } - System.out.println("BEGIN Slop"); - for (int i = 0; i < 6; i++) { - int sz = 0; - Instant start = Instant.now(); - try (var reader = new SlopCrawlDataRecord.Reader(Path.of("/tmp/steam.slop.zip")) { - public boolean filter(String url, int status, String contentType) { - return contentType.startsWith("text/html"); - } - }) { - while (reader.hasRemaining()) { - sz += reader.get().httpStatus(); - } + long st = 0; + try (var reader = new SlopCrawlDataRecord.FilteringReader(Path.of("/tmp/steam.slop.zip")) { + public boolean filter(String url, int status, String contentType) { + return contentType.startsWith("text/html"); } - Instant end = Instant.now(); - System.out.println("END Iter " + sz + " " + Duration.between(start, end)); - } - System.out.println("END Slop"); - - System.out.println("BEGIN Parquet"); - for (int i = 0; i < 6; i++) { + }) { Instant start = Instant.now(); - int sz = CrawledDocumentParquetRecordFileReader.stream(Path.of("/home/vlofgren/Downloads/_storage_crawl-data__23-10-21T15_08_43.750_3b_41_3b41e714-store.steampowered.com.parquet")) - .filter(record -> record.contentType.startsWith("text/html")) - .mapToInt(record -> record.httpStatus).sum(); - Instant end = Instant.now(); - System.out.println("END Iter " + sz + " " + Duration.between(start, end)); + while (reader.hasRemaining()) { + var next = reader.get(); + byte[] body = next.body(); + + st += Jsoup.parse(new ByteArrayInputStream(body), ContentType.parse(next.contentType()).asCharset().name(), next.url()).title().length(); + + } + System.out.println(Duration.between(start, Instant.now())); } - System.out.println("END Parquet"); + +// System.out.println("BEGIN Slop"); +// for (int i = 0; i < 6; i++) { +// int sz = 0; +// Instant start = Instant.now(); +// try (var reader = new SlopCrawlDataRecord.Reader(Path.of("/tmp/steam.slop.zip")) { +// public boolean filter(String url, int status, String contentType) { +// return contentType.startsWith("text/html"); +// } +// }) { +// while (reader.hasRemaining()) { +// sz += reader.get().httpStatus(); +// } +// } +// Instant end = Instant.now(); +// 
System.out.println("END Iter " + sz + " " + Duration.between(start, end)); +// } +// System.out.println("END Slop"); +// +// System.out.println("BEGIN Parquet"); +// for (int i = 0; i < 6; i++) { +// Instant start = Instant.now(); +// int sz = CrawledDocumentParquetRecordFileReader.stream(Path.of("/home/vlofgren/Downloads/_storage_crawl-data__23-10-21T15_08_43.750_3b_41_3b41e714-store.steampowered.com.parquet")) +// .filter(record -> record.contentType.startsWith("text/html")) +// .mapToInt(record -> record.httpStatus).sum(); +// Instant end = Instant.now(); +// System.out.println("END Iter " + sz + " " + Duration.between(start, end)); +// } +// System.out.println("END Parquet"); } } \ No newline at end of file diff --git a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java index b2a0f2bc..3731fa0c 100644 --- a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java +++ b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java @@ -80,7 +80,7 @@ class WarcRecorderTest { recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"), "text/html", 200, - "test", + "test".getBytes(), null, ContentTags.empty()); } @@ -116,7 +116,7 @@ class WarcRecorderTest { recorder.writeReferenceCopy(new EdgeUrl("https://www.marginalia.nu/"), "text/html", 200, - "test", + "test".getBytes(), null, ContentTags.empty()); } diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java index ea9bcf60..edac6521 100644 --- a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java +++ b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java @@ -131,7 +131,7 @@ public class CrawlerMockFetcherTest { public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder recorder, ContentTags tags, ProbeType probeType) { logger.info("Fetching {}", url); if (mockData.containsKey(url)) { - byte[] bodyBytes = mockData.get(url).documentBody.getBytes(); + byte[] bodyBytes = mockData.get(url).documentBodyBytes; try { return new HttpFetchResult.ResultOk( diff --git a/code/processes/export-task-process/java/nu/marginalia/extractor/AtagExporter.java b/code/processes/export-task-process/java/nu/marginalia/extractor/AtagExporter.java index 320171bb..f31d39eb 100644 --- a/code/processes/export-task-process/java/nu/marginalia/extractor/AtagExporter.java +++ b/code/processes/export-task-process/java/nu/marginalia/extractor/AtagExporter.java @@ -14,7 +14,6 @@ import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.model.FileStorage; import nu.marginalia.storage.model.FileStorageId; import org.apache.commons.lang3.StringUtils; -import org.jsoup.Jsoup; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -52,11 +51,9 @@ public class AtagExporter implements ExporterIf { try (var bw = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(Files.newOutputStream(tmpFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING))))) { - Path crawlerLogFile = inputDir.resolve("crawler.log"); - var tagWriter = new ATagCsvWriter(bw); - for (var item : WorkLog.iterable(crawlerLogFile)) { + for (var item : 
WorkLog.iterable(inputDir.resolve("crawler.log"))) { if (Thread.interrupted()) { throw new InterruptedException(); } @@ -89,15 +86,19 @@ public class AtagExporter implements ExporterIf { while (stream.hasNext()) { if (!(stream.next() instanceof CrawledDocument doc)) continue; - if (null == doc.documentBody) + if (!doc.hasBody()) continue; if (!doc.contentType.toLowerCase().startsWith("text/html")) continue; var baseUrl = new EdgeUrl(doc.url); - var parsed = Jsoup.parse(doc.documentBody); + var parsed = doc.parseBody(); for (var atag : parsed.getElementsByTag("a")) { + if (!atag.hasAttr("href")) { + continue; + } + String linkText = atag.text(); if (!linkFilter.isLinkTextEligible(linkText)) { diff --git a/code/processes/export-task-process/java/nu/marginalia/extractor/FeedExporter.java b/code/processes/export-task-process/java/nu/marginalia/extractor/FeedExporter.java index abe5c708..2f5b1d90 100644 --- a/code/processes/export-task-process/java/nu/marginalia/extractor/FeedExporter.java +++ b/code/processes/export-task-process/java/nu/marginalia/extractor/FeedExporter.java @@ -12,7 +12,6 @@ import nu.marginalia.process.log.WorkLog; import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.model.FileStorage; import nu.marginalia.storage.model.FileStorageId; -import org.jsoup.Jsoup; import java.io.BufferedWriter; import java.io.IOException; @@ -81,13 +80,13 @@ public class FeedExporter implements ExporterIf { while (stream.hasNext()) { if (!(stream.next() instanceof CrawledDocument doc)) continue; - if (null == doc.documentBody) + if (!doc.hasBody()) continue; if (!doc.contentType.toLowerCase().startsWith("text/html")) continue; var baseUrl = new EdgeUrl(doc.url); - var parsed = Jsoup.parse(doc.documentBody); + var parsed = doc.parseBody(); List<EdgeUrl> feedUrls = new ArrayList<>(); for (var link : parsed.select("link[rel=alternate]")) { diff --git a/code/processes/export-task-process/java/nu/marginalia/extractor/TermFrequencyExporter.java b/code/processes/export-task-process/java/nu/marginalia/extractor/TermFrequencyExporter.java index 3255edf2..08562251 100644 --- a/code/processes/export-task-process/java/nu/marginalia/extractor/TermFrequencyExporter.java +++ b/code/processes/export-task-process/java/nu/marginalia/extractor/TermFrequencyExporter.java @@ -15,7 +15,6 @@ import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.model.FileStorage; import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.util.SimpleBlockingThreadPool; -import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -110,13 +109,13 @@ public class TermFrequencyExporter implements ExporterIf { return; if (!(stream.next() instanceof CrawledDocument doc)) continue; - if (doc.documentBody == null) continue; + if (!doc.hasBody()) continue; if (!doc.contentType.toLowerCase().startsWith("text/html")) continue; docCount.incrementAndGet(); - Document parsed = Jsoup.parse(doc.documentBody); + Document parsed = doc.parseBody(); parsed.body().filter(new DomPruningFilter(0.5)); DocumentLanguageData dld = se.extractSentences(parsed); diff --git a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlDataSet.java b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlDataSet.java index 4324c254..4e94bb52 100644 --- a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlDataSet.java +++
b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlDataSet.java @@ -119,12 +119,16 @@ public class LiveCrawlDataSet implements AutoCloseable { } } - private String decompress(byte[] data) { + private String decompressStr(byte[] data) { + return new String(decompressBytes(data)); + } + + private byte[] decompressBytes(byte[] data) { // gzip decompression - try (var bis = new ByteArrayInputStream(data); - var gzip = new GZIPInputStream(bis)) + try (ByteArrayInputStream bis = new ByteArrayInputStream(data); + GZIPInputStream gzip = new GZIPInputStream(bis)) { - return new String(gzip.readAllBytes()); + return gzip.readAllBytes(); } catch (IOException ex) { throw new RuntimeException(ex); @@ -177,8 +181,8 @@ public class LiveCrawlDataSet implements AutoCloseable { dataStack = new ArrayList<>(); while (rs.next()) { String url = rs.getString("url"); - String body = decompress(rs.getBytes("body")); - String headers = decompress(rs.getBytes("headers")); + byte[] body = decompressBytes(rs.getBytes("body")); + String headers = decompressStr(rs.getBytes("headers")); dataStack.add(new CrawledDocument( "LIVE", diff --git a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java index f8af9267..9bb5f8bc 100644 --- a/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java +++ b/code/processes/live-crawling-process/java/nu/marginalia/livecrawler/LiveCrawlerMain.java @@ -197,7 +197,7 @@ public class LiveCrawlerMain extends ProcessMainClass { writer.setOrdinalOffset(67_000_000); for (SerializableCrawlDataStream stream : hb.wrap("Processing", dataSet.getDataStreams())) { - writer.write(domainProcessor.sideloadProcessing(stream, 0, Set.of("special:live"))); + writer.write(domainProcessor.simpleProcessing(stream, 0, Set.of("special:live"))); } } diff --git a/code/processes/live-crawling-process/test/nu/marginalia/livecrawler/LiveCrawlDataSetTest.java b/code/processes/live-crawling-process/test/nu/marginalia/livecrawler/LiveCrawlDataSetTest.java index ee1124e9..f45675e1 100644 --- a/code/processes/live-crawling-process/test/nu/marginalia/livecrawler/LiveCrawlDataSetTest.java +++ b/code/processes/live-crawling-process/test/nu/marginalia/livecrawler/LiveCrawlDataSetTest.java @@ -51,7 +51,7 @@ public class LiveCrawlDataSetTest { case CrawledDocument document -> { dataCount++; Assertions.assertEquals("https://www.example.com/", document.url); - Assertions.assertEquals("test", document.documentBody); + Assertions.assertEquals("test", document.documentBody()); } } } diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/ExperimentRunnerMain.java b/code/tools/experiment-runner/java/nu/marginalia/tools/ExperimentRunnerMain.java index 5599d6af..84538679 100644 --- a/code/tools/experiment-runner/java/nu/marginalia/tools/ExperimentRunnerMain.java +++ b/code/tools/experiment-runner/java/nu/marginalia/tools/ExperimentRunnerMain.java @@ -6,7 +6,6 @@ import nu.marginalia.converting.ConverterModule; import nu.marginalia.io.CrawledDomainReader; import nu.marginalia.process.log.WorkLog; import nu.marginalia.service.module.DatabaseModule; -import nu.marginalia.tools.experiments.*; import java.io.IOException; import java.nio.file.Path; @@ -16,13 +15,6 @@ import java.util.Map; public class ExperimentRunnerMain { private static Map<String, Class<? extends Experiment>> experiments = Map.of( - "test", TestExperiment.class, - "adblock", AdblockExperiment.class, - "topic",
TopicExperiment.class, - "sentence-statistics", SentenceStatisticsExperiment.class, - "site-statistics", SiteStatisticsExperiment.class, - "export-atags", ExportExternalLinksExperiment.class, - "debug-converter", DebugConverterExperiment.class ); public static void main(String... args) throws IOException { diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/AdblockExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/AdblockExperiment.java deleted file mode 100644 index dc46f3bd..00000000 --- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/AdblockExperiment.java +++ /dev/null @@ -1,45 +0,0 @@ -package nu.marginalia.tools.experiments; - -import com.google.inject.Inject; -import nu.marginalia.converting.processor.DocumentProcessor; -import nu.marginalia.converting.processor.classifier.adblock.AdblockSimulator; -import nu.marginalia.model.crawldata.CrawledDocument; -import nu.marginalia.model.crawldata.CrawledDomain; -import nu.marginalia.tools.LegacyExperiment; -import org.jsoup.Jsoup; -import org.jsoup.nodes.Document; - -public class AdblockExperiment extends LegacyExperiment { - - private final AdblockSimulator simulator; - - @Inject - public AdblockExperiment(AdblockSimulator simulator) { - this.simulator = simulator; - } - - @Override - public boolean process(CrawledDomain domain) { - if (domain.doc == null) return true; - - for (var doc : domain.doc) { - if (DocumentProcessor.isAcceptedContentType(doc) && "OK".equals(doc.crawlerStatus)) { - processDocument(doc); - } - } - - return true; - } - - private void processDocument(CrawledDocument doc) { - Document parsedDocument = Jsoup.parse(doc.documentBody); - - if (simulator.hasAds(parsedDocument)) { - System.out.println(doc.url); - } - } - - @Override - public void onFinish() { - } -} diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java deleted file mode 100644 index 4a34a31c..00000000 --- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java +++ /dev/null @@ -1,46 +0,0 @@ -package nu.marginalia.tools.experiments; - -import com.google.inject.Inject; -import nu.marginalia.converting.processor.DomainProcessor; -import nu.marginalia.converting.processor.plugin.specialization.BlogSpecialization; -import nu.marginalia.model.crawldata.CrawledDomain; -import nu.marginalia.tools.LegacyExperiment; -import org.jsoup.Jsoup; - -public class DebugConverterExperiment extends LegacyExperiment { - - - private final DomainProcessor domainProcessor; - - @Inject - public DebugConverterExperiment(DomainProcessor domainProcessor) { - this.domainProcessor = domainProcessor; - - } - - @Override - public boolean process(CrawledDomain domain) { - - if (domain.doc == null) return true; - - for (var doc : domain.doc) { - if (doc.documentBody == null) continue; - - var parsed = Jsoup.parse(doc.documentBody); - - var tagExtractor = new BlogSpecialization.BlogTagExtractor(); - parsed.traverse(tagExtractor); - var tags = tagExtractor.getTags(); - if (!tags.isEmpty()) { - System.out.println(tags); - } - - } - - return true; - } - - @Override - public void onFinish() { - } -} diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/ExportExternalLinksExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/ExportExternalLinksExperiment.java deleted file 
mode 100644 index 8a8b092e..00000000 --- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/ExportExternalLinksExperiment.java +++ /dev/null @@ -1,71 +0,0 @@ -package nu.marginalia.tools.experiments; - -import com.google.inject.Inject; -import gnu.trove.set.hash.TLongHashSet; -import nu.marginalia.hash.MurmurHash3_128; -import nu.marginalia.io.SerializableCrawlDataStream; -import nu.marginalia.link_parser.LinkParser; -import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.crawldata.CrawledDocument; -import nu.marginalia.tools.Experiment; -import org.jsoup.Jsoup; - -import java.util.Objects; - -public class ExportExternalLinksExperiment extends Experiment { - - @Inject - public ExportExternalLinksExperiment() { - - } - private static final LinkParser linkParser = new LinkParser(); - MurmurHash3_128 hash = new MurmurHash3_128(); - - @Override - public boolean process(SerializableCrawlDataStream stream) { - TLongHashSet hashes = new TLongHashSet(); - - try { - while (stream.hasNext()) { - if (!(stream.next() instanceof CrawledDocument doc)) - continue; - if (null == doc.documentBody) - continue; - - var baseUrl = new EdgeUrl(doc.url); - var parsed = Jsoup.parse(doc.documentBody); - - for (var atag : parsed.getElementsByTag("a")) { - String linkText = atag.text(); - if (linkText.isBlank()) - continue; - - var linkOpt = linkParser.parseLinkPermissive(baseUrl, atag); - linkOpt - .filter(url -> !Objects.equals(url.domain, baseUrl.domain)) - .filter(url -> hashes.add(hash.hashNearlyASCII(linkText) ^ hash.hashNearlyASCII(url.toString()))) - .ifPresent(url -> - System.out.printf("\"%s\",\"%s\",\"%s\"\n", - csvify(url), - csvify(baseUrl.domain), - csvify(linkText))); - } - } - - } - catch (Exception ex) { - ex.printStackTrace(); - return false; - } - - return true; - } - - private static String csvify(Object field) { - return field.toString().replace("\"", "\"\""); - } - - @Override - public void onFinish() { - } -} diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java deleted file mode 100644 index ba247479..00000000 --- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java +++ /dev/null @@ -1,70 +0,0 @@ -package nu.marginalia.tools.experiments; - -import com.google.inject.Inject; -import nu.marginalia.WmsaHome; -import nu.marginalia.converting.processor.logic.dom.DomPruningFilter; -import nu.marginalia.keyword.DocumentKeywordExtractor; -import nu.marginalia.keyword.LinkTexts; -import nu.marginalia.language.sentence.SentenceExtractor; -import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.crawldata.CrawledDomain; -import nu.marginalia.term_frequency_dict.TermFrequencyDict; -import nu.marginalia.tools.LegacyExperiment; -import org.jsoup.Jsoup; - -import java.io.BufferedOutputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.PrintWriter; -import java.nio.ByteBuffer; -import java.nio.file.Files; -import java.nio.file.Path; - -public class SentenceStatisticsExperiment extends LegacyExperiment { - - SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); - DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels())); - Path filename; - PrintWriter writer; - - @Inject - public SentenceStatisticsExperiment() throws IOException { - 
filename = Files.createTempFile(getClass().getSimpleName(), ".csv"); - System.out.println("Writing to " + filename); - - writer = new PrintWriter(new BufferedOutputStream(new FileOutputStream(filename.toFile()))); - } - - private void logLine(String message) { - System.out.printf("\u001b[2K\r%s", message); - } - - @Override - public boolean process(CrawledDomain domain) { - if (domain.doc == null) return true; - - logLine("Processing: " + domain.domain); - - ByteBuffer workArea = ByteBuffer.allocate(8192); - for (var doc : domain.doc) { - if (doc.documentBody == null) continue; - - var parsed = Jsoup.parse(doc.documentBody); - - parsed.body().filter(new DomPruningFilter(0.5)); - - var dld = se.extractSentences(parsed); - var keywords = documentKeywordExtractor.extractKeywords(dld, new LinkTexts(), EdgeUrl.parse(doc.url).orElseThrow()); - - keywords.build(workArea); - } - - return true; - } - - @Override - public void onFinish() { - logLine("Done!\n"); - writer.close(); - } -} diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java deleted file mode 100644 index b484b50c..00000000 --- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java +++ /dev/null @@ -1,43 +0,0 @@ -package nu.marginalia.tools.experiments; - -import com.google.inject.Inject; -import nu.marginalia.converting.model.ProcessedDocument; -import nu.marginalia.converting.processor.DomainProcessor; -import nu.marginalia.io.SerializableCrawlDataStream; -import nu.marginalia.model.EdgeUrl; -import nu.marginalia.tools.Experiment; - -import java.util.Comparator; - -public class SiteStatisticsExperiment extends Experiment { - - - private final DomainProcessor domainProcessor; - - @Inject - public SiteStatisticsExperiment(DomainProcessor domainProcessor) { - this.domainProcessor = domainProcessor; - - } - - @Override - public boolean process(SerializableCrawlDataStream stream) { - var ret = domainProcessor.fullProcessing(stream); - - ret.documents.stream() - .filter(ProcessedDocument::isProcessedFully) - .sorted(Comparator.comparing(doc -> doc.details.metadata.topology())) - .flatMap(doc -> doc.details.feedLinks.stream()) - .map(EdgeUrl::toString) - .min(Comparator.comparing(String::length)) - .ifPresent(url -> { - System.out.printf("\"%s\",\"%s\"\n", ret.domain, url); - }); - - return true; - } - - @Override - public void onFinish() { - } -} diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TestExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TestExperiment.java deleted file mode 100644 index 436b227d..00000000 --- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TestExperiment.java +++ /dev/null @@ -1,16 +0,0 @@ -package nu.marginalia.tools.experiments; - -import nu.marginalia.model.crawldata.CrawledDomain; -import nu.marginalia.tools.LegacyExperiment; - -public class TestExperiment extends LegacyExperiment { - @Override - public boolean process(CrawledDomain domain) { - return true; - } - - @Override - public void onFinish() { - System.out.println("Tada!"); - } -} diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TopicExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TopicExperiment.java deleted file mode 100644 index b617a3dc..00000000 --- 
a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TopicExperiment.java +++ /dev/null @@ -1,64 +0,0 @@ -package nu.marginalia.tools.experiments; - -import com.google.inject.Inject; -import nu.marginalia.WmsaHome; -import nu.marginalia.converting.processor.classifier.topic.AdHocDetector; -import nu.marginalia.converting.processor.logic.dom.DomPruningFilter; -import nu.marginalia.language.sentence.SentenceExtractor; -import nu.marginalia.model.crawldata.CrawledDomain; -import nu.marginalia.tools.LegacyExperiment; -import org.jsoup.Jsoup; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; - -public class TopicExperiment extends LegacyExperiment { - - AdHocDetector detector; - - SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); - Path filename = null; - - public void args(String... args) { - filename = Path.of(args[0]); - try { - detector = new AdHocDetector(Files.readAllLines(filename)); - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - @Inject - public TopicExperiment() { - } - - @Override - public boolean process(CrawledDomain domain) { - if (domain.doc == null) return true; - - - for (var doc : domain.doc) { - if (doc.documentBody == null) continue; - - var parsed = Jsoup.parse(doc.documentBody); - - parsed.body().filter(new DomPruningFilter(0.5)); - var dld = se.extractSentences(parsed); - - if (dld.totalNumWords() < 50) - continue; - - if (detector.testP(dld) > 0.5) { - System.out.println("match\t" + doc.url); - } - - } - - return true; - } - - @Override - public void onFinish() { - } -} diff --git a/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java b/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java index 498daa79..07cc8239 100644 --- a/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java +++ b/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java @@ -134,7 +134,7 @@ public class IntegrationTest {

- """, + """.getBytes(), "", ContentTags.empty() ); diff --git a/settings.gradle b/settings.gradle index ff5b7126..5ff120ca 100644 --- a/settings.gradle +++ b/settings.gradle @@ -230,7 +230,7 @@ dependencyResolutionManagement { library('jetty-util','org.eclipse.jetty','jetty-util').version('9.4.54.v20240208') library('jetty-servlet','org.eclipse.jetty','jetty-servlet').version('9.4.54.v20240208') - library('slop', 'nu.marginalia', 'slop').version('0.0.9-org-3-SNAPSHOT') + library('slop', 'nu.marginalia', 'slop').version('0.0.9-org-4-SNAPSHOT') bundle('jetty', ['jetty-server', 'jetty-util', 'jetty-servlet'])