From ed728b2680e233318ed005e25e8cf8106a84ffd0 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 30 Jan 2023 09:33:04 +0100 Subject: [PATCH] Compressed string component --- .../src/main/java/nu/marginalia/wmsa/client/GsonFactory.java | 3 +++ .../wmsa/edge/converting/LinkKeywordExtractorMain.java | 2 +- .../marginalia/wmsa/edge/crawling/model/CrawledDocument.java | 5 +++-- .../wmsa/edge/crawling/retreival/CrawlerRetreiver.java | 4 ++-- .../marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java | 5 +++-- .../nu/marginalia/wmsa/edge/tools/AdblockTesterTool.java | 2 +- .../marginalia/wmsa/edge/tools/CrawlDataExtractorTool.java | 2 +- 7 files changed, 14 insertions(+), 9 deletions(-) diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/GsonFactory.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/GsonFactory.java index c0af0c12..393b2ea5 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/GsonFactory.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/client/GsonFactory.java @@ -2,6 +2,7 @@ package nu.marginalia.wmsa.client; import com.google.gson.*; import marcono1234.gson.recordadapter.RecordTypeAdapterFactory; +import nu.marginalia.util.bigstring.BigString; import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.id.EdgeId; @@ -24,6 +25,8 @@ public class GsonFactory { .registerTypeAdapter(EdgeDomain.class, (JsonDeserializer) (json, typeOfT, context) -> new EdgeDomain(json.getAsString())) .registerTypeAdapter(EdgeId.class, (JsonDeserializer>) (json, typeOfT, context) -> new EdgeId<>(json.getAsInt())) .registerTypeAdapter(EdgeId.class, (JsonSerializer>) (src, typeOfSrc, context) -> new JsonPrimitive(src.id())) + .registerTypeAdapter(BigString.class, (JsonDeserializer) (json, typeOfT, context) -> BigString.encode(json.getAsString())) + .registerTypeAdapter(BigString.class, (JsonSerializer) (src, typeOfT, context) -> new JsonPrimitive(src.decode())) .serializeSpecialFloatingPointValues() .create(); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java index f8de6c0c..f9557c97 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/converting/LinkKeywordExtractorMain.java @@ -150,7 +150,7 @@ public class LinkKeywordExtractorMain { for (var doc : crawledDomain.doc) { if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) { - anchorTextExtractor.processDocument(doc.url, doc.documentBody); + anchorTextExtractor.processDocument(doc.url, doc.documentBody.decode()); } } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawledDocument.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawledDocument.java index d43315a0..497f323f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawledDocument.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/model/CrawledDocument.java @@ -1,6 +1,8 @@ package nu.marginalia.wmsa.edge.crawling.model; import lombok.Builder; +import nu.marginalia.util.bigstring.BigString; +import nu.marginalia.util.bigstring.CompressedBigString; @Builder public class CrawledDocument implements SerializableCrawlData { @@ -16,8 +18,7 @@ public class CrawledDocument implements SerializableCrawlData { public String crawlerStatusDesc; public String headers; - public String documentBody; - + public BigString documentBody; public String documentBodyHash; public String canonicalUrl; diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java index 5e60ec3a..f950e831 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/CrawlerRetreiver.java @@ -206,7 +206,7 @@ public class CrawlerRetreiver { if (doc.documentBody != null) { - doc.documentBodyHash = createHash(doc.documentBody); + doc.documentBodyHash = createHash(doc.documentBody.decode()); Optional parsedDoc = parseDoc(doc); EdgeUrl url = new EdgeUrl(doc.url); @@ -251,7 +251,7 @@ public class CrawlerRetreiver { private Optional parseDoc(CrawledDocument doc) { if (doc.documentBody == null) return Optional.empty(); - return Optional.of(Jsoup.parse(doc.documentBody)); + return Optional.of(Jsoup.parse(doc.documentBody.decode())); } public boolean isSameDomain(EdgeUrl url) { diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java index d215d66e..4532156f 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/crawling/retreival/HttpFetcher.java @@ -7,6 +7,7 @@ import crawlercommons.robots.SimpleRobotRulesParser; import lombok.AllArgsConstructor; import lombok.SneakyThrows; import lombok.ToString; +import nu.marginalia.util.bigstring.BigString; import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus; import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic; @@ -271,7 +272,7 @@ public class HttpFetcher { .canonicalUrl(canonical) .httpStatus(rsp.code()) .url(responseUrl.toString()) - .documentBody(strData) + .documentBody(BigString.encode(strData)) .build(); } @@ -325,7 +326,7 @@ public class HttpFetcher { private SimpleRobotRules parseRobotsTxt(CrawledDocument doc) { return robotsParser.parseContent(doc.url, - doc.documentBody.getBytes(StandardCharsets.UTF_8), + doc.documentBody.getBytes(), doc.contentType, userAgent); } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/AdblockTesterTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/AdblockTesterTool.java index bd3c0429..b97fc27b 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/AdblockTesterTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/AdblockTesterTool.java @@ -48,7 +48,7 @@ public class AdblockTesterTool { private static void processDocument(CrawledDocument doc) { - Document parsedDocument = Jsoup.parse(doc.documentBody); + Document parsedDocument = Jsoup.parse(doc.documentBody.decode()); if (simulator.hasAds(parsedDocument)) { System.out.println(doc.url); diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/CrawlDataExtractorTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/CrawlDataExtractorTool.java index b0e86d7f..cbe59e60 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/CrawlDataExtractorTool.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/CrawlDataExtractorTool.java @@ -84,7 +84,7 @@ public class CrawlDataExtractorTool { private static void processDocument(CrawledDocument doc) { - Document parsedDocument = Jsoup.parse(doc.documentBody); + Document parsedDocument = Jsoup.parse(doc.documentBody.decode()); if (abs.hasAds(parsedDocument)) { System.out.println(doc.url);