Compressed string component

This commit is contained in:
Viktor Lofgren 2023-01-30 09:33:04 +01:00
parent 728931c135
commit ed728b2680
7 changed files with 14 additions and 9 deletions

View File

@@ -2,6 +2,7 @@ package nu.marginalia.wmsa.client;
import com.google.gson.*; import com.google.gson.*;
import marcono1234.gson.recordadapter.RecordTypeAdapterFactory; import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
import nu.marginalia.util.bigstring.BigString;
import nu.marginalia.wmsa.edge.model.EdgeDomain; import nu.marginalia.wmsa.edge.model.EdgeDomain;
import nu.marginalia.wmsa.edge.model.EdgeUrl; import nu.marginalia.wmsa.edge.model.EdgeUrl;
import nu.marginalia.wmsa.edge.model.id.EdgeId; import nu.marginalia.wmsa.edge.model.id.EdgeId;
@@ -24,6 +25,8 @@ public class GsonFactory {
.registerTypeAdapter(EdgeDomain.class, (JsonDeserializer<EdgeDomain>) (json, typeOfT, context) -> new EdgeDomain(json.getAsString())) .registerTypeAdapter(EdgeDomain.class, (JsonDeserializer<EdgeDomain>) (json, typeOfT, context) -> new EdgeDomain(json.getAsString()))
.registerTypeAdapter(EdgeId.class, (JsonDeserializer<EdgeId<?>>) (json, typeOfT, context) -> new EdgeId<>(json.getAsInt())) .registerTypeAdapter(EdgeId.class, (JsonDeserializer<EdgeId<?>>) (json, typeOfT, context) -> new EdgeId<>(json.getAsInt()))
.registerTypeAdapter(EdgeId.class, (JsonSerializer<EdgeId<?>>) (src, typeOfSrc, context) -> new JsonPrimitive(src.id())) .registerTypeAdapter(EdgeId.class, (JsonSerializer<EdgeId<?>>) (src, typeOfSrc, context) -> new JsonPrimitive(src.id()))
.registerTypeAdapter(BigString.class, (JsonDeserializer<BigString>) (json, typeOfT, context) -> BigString.encode(json.getAsString()))
.registerTypeAdapter(BigString.class, (JsonSerializer<BigString>) (src, typeOfT, context) -> new JsonPrimitive(src.decode()))
.serializeSpecialFloatingPointValues() .serializeSpecialFloatingPointValues()
.create(); .create();
} }

View File

@@ -150,7 +150,7 @@ public class LinkKeywordExtractorMain {
for (var doc : crawledDomain.doc) { for (var doc : crawledDomain.doc) {
if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) { if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) {
anchorTextExtractor.processDocument(doc.url, doc.documentBody); anchorTextExtractor.processDocument(doc.url, doc.documentBody.decode());
} }
} }
} }

View File

@@ -1,6 +1,8 @@
package nu.marginalia.wmsa.edge.crawling.model; package nu.marginalia.wmsa.edge.crawling.model;
import lombok.Builder; import lombok.Builder;
import nu.marginalia.util.bigstring.BigString;
import nu.marginalia.util.bigstring.CompressedBigString;
@Builder @Builder
public class CrawledDocument implements SerializableCrawlData { public class CrawledDocument implements SerializableCrawlData {
@@ -16,8 +18,7 @@ public class CrawledDocument implements SerializableCrawlData {
public String crawlerStatusDesc; public String crawlerStatusDesc;
public String headers; public String headers;
public String documentBody; public BigString documentBody;
public String documentBodyHash; public String documentBodyHash;
public String canonicalUrl; public String canonicalUrl;

View File

@@ -206,7 +206,7 @@ public class CrawlerRetreiver {
if (doc.documentBody != null) { if (doc.documentBody != null) {
doc.documentBodyHash = createHash(doc.documentBody); doc.documentBodyHash = createHash(doc.documentBody.decode());
Optional<Document> parsedDoc = parseDoc(doc); Optional<Document> parsedDoc = parseDoc(doc);
EdgeUrl url = new EdgeUrl(doc.url); EdgeUrl url = new EdgeUrl(doc.url);
@@ -251,7 +251,7 @@ public class CrawlerRetreiver {
private Optional<Document> parseDoc(CrawledDocument doc) { private Optional<Document> parseDoc(CrawledDocument doc) {
if (doc.documentBody == null) if (doc.documentBody == null)
return Optional.empty(); return Optional.empty();
return Optional.of(Jsoup.parse(doc.documentBody)); return Optional.of(Jsoup.parse(doc.documentBody.decode()));
} }
public boolean isSameDomain(EdgeUrl url) { public boolean isSameDomain(EdgeUrl url) {

View File

@@ -7,6 +7,7 @@ import crawlercommons.robots.SimpleRobotRulesParser;
import lombok.AllArgsConstructor; import lombok.AllArgsConstructor;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import lombok.ToString; import lombok.ToString;
import nu.marginalia.util.bigstring.BigString;
import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument; import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus; import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic; import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic;
@@ -271,7 +272,7 @@ public class HttpFetcher {
.canonicalUrl(canonical) .canonicalUrl(canonical)
.httpStatus(rsp.code()) .httpStatus(rsp.code())
.url(responseUrl.toString()) .url(responseUrl.toString())
.documentBody(strData) .documentBody(BigString.encode(strData))
.build(); .build();
} }
@@ -325,7 +326,7 @@ public class HttpFetcher {
private SimpleRobotRules parseRobotsTxt(CrawledDocument doc) { private SimpleRobotRules parseRobotsTxt(CrawledDocument doc) {
return robotsParser.parseContent(doc.url, return robotsParser.parseContent(doc.url,
doc.documentBody.getBytes(StandardCharsets.UTF_8), doc.documentBody.getBytes(),
doc.contentType, doc.contentType,
userAgent); userAgent);
} }

View File

@@ -48,7 +48,7 @@ public class AdblockTesterTool {
private static void processDocument(CrawledDocument doc) { private static void processDocument(CrawledDocument doc) {
Document parsedDocument = Jsoup.parse(doc.documentBody); Document parsedDocument = Jsoup.parse(doc.documentBody.decode());
if (simulator.hasAds(parsedDocument)) { if (simulator.hasAds(parsedDocument)) {
System.out.println(doc.url); System.out.println(doc.url);

View File

@@ -84,7 +84,7 @@ public class CrawlDataExtractorTool {
private static void processDocument(CrawledDocument doc) { private static void processDocument(CrawledDocument doc) {
Document parsedDocument = Jsoup.parse(doc.documentBody); Document parsedDocument = Jsoup.parse(doc.documentBody.decode());
if (abs.hasAds(parsedDocument)) { if (abs.hasAds(parsedDocument)) {
System.out.println(doc.url); System.out.println(doc.url);