Compressed string component

This commit is contained in:
Viktor Lofgren 2023-01-30 09:33:04 +01:00
parent 728931c135
commit ed728b2680
7 changed files with 14 additions and 9 deletions

View File

@@ -2,6 +2,7 @@ package nu.marginalia.wmsa.client;
 import com.google.gson.*;
 import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
+import nu.marginalia.util.bigstring.BigString;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import nu.marginalia.wmsa.edge.model.id.EdgeId;
@@ -24,6 +25,8 @@ public class GsonFactory {
 .registerTypeAdapter(EdgeDomain.class, (JsonDeserializer<EdgeDomain>) (json, typeOfT, context) -> new EdgeDomain(json.getAsString()))
 .registerTypeAdapter(EdgeId.class, (JsonDeserializer<EdgeId<?>>) (json, typeOfT, context) -> new EdgeId<>(json.getAsInt()))
 .registerTypeAdapter(EdgeId.class, (JsonSerializer<EdgeId<?>>) (src, typeOfSrc, context) -> new JsonPrimitive(src.id()))
+.registerTypeAdapter(BigString.class, (JsonDeserializer<BigString>) (json, typeOfT, context) -> BigString.encode(json.getAsString()))
+.registerTypeAdapter(BigString.class, (JsonSerializer<BigString>) (src, typeOfT, context) -> new JsonPrimitive(src.decode()))
 .serializeSpecialFloatingPointValues()
 .create();
 }

View File

@@ -150,7 +150,7 @@ public class LinkKeywordExtractorMain {
 for (var doc : crawledDomain.doc) {
 if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) {
-anchorTextExtractor.processDocument(doc.url, doc.documentBody);
+anchorTextExtractor.processDocument(doc.url, doc.documentBody.decode());
 }
 }
 }

View File

@@ -1,6 +1,8 @@
 package nu.marginalia.wmsa.edge.crawling.model;
 import lombok.Builder;
+import nu.marginalia.util.bigstring.BigString;
+import nu.marginalia.util.bigstring.CompressedBigString;
 @Builder
 public class CrawledDocument implements SerializableCrawlData {
@@ -16,8 +18,7 @@ public class CrawledDocument implements SerializableCrawlData {
 public String crawlerStatusDesc;
 public String headers;
-public String documentBody;
+public BigString documentBody;
 public String documentBodyHash;
 public String canonicalUrl;

View File

@@ -206,7 +206,7 @@ public class CrawlerRetreiver {
 if (doc.documentBody != null) {
-doc.documentBodyHash = createHash(doc.documentBody);
+doc.documentBodyHash = createHash(doc.documentBody.decode());
 Optional<Document> parsedDoc = parseDoc(doc);
 EdgeUrl url = new EdgeUrl(doc.url);
@@ -251,7 +251,7 @@ public class CrawlerRetreiver {
 private Optional<Document> parseDoc(CrawledDocument doc) {
 if (doc.documentBody == null)
 return Optional.empty();
-return Optional.of(Jsoup.parse(doc.documentBody));
+return Optional.of(Jsoup.parse(doc.documentBody.decode()));
 }
 public boolean isSameDomain(EdgeUrl url) {

View File

@@ -7,6 +7,7 @@ import crawlercommons.robots.SimpleRobotRulesParser;
 import lombok.AllArgsConstructor;
 import lombok.SneakyThrows;
 import lombok.ToString;
+import nu.marginalia.util.bigstring.BigString;
 import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
 import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
 import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic;
@@ -271,7 +272,7 @@ public class HttpFetcher {
 .canonicalUrl(canonical)
 .httpStatus(rsp.code())
 .url(responseUrl.toString())
-.documentBody(strData)
+.documentBody(BigString.encode(strData))
 .build();
 }
@@ -325,7 +326,7 @@ public class HttpFetcher {
 private SimpleRobotRules parseRobotsTxt(CrawledDocument doc) {
 return robotsParser.parseContent(doc.url,
-doc.documentBody.getBytes(StandardCharsets.UTF_8),
+doc.documentBody.getBytes(),
 doc.contentType,
 userAgent);
 }

View File

@@ -48,7 +48,7 @@ public class AdblockTesterTool {
 private static void processDocument(CrawledDocument doc) {
-Document parsedDocument = Jsoup.parse(doc.documentBody);
+Document parsedDocument = Jsoup.parse(doc.documentBody.decode());
 if (simulator.hasAds(parsedDocument)) {
 System.out.println(doc.url);

View File

@@ -84,7 +84,7 @@ public class CrawlDataExtractorTool {
 private static void processDocument(CrawledDocument doc) {
-Document parsedDocument = Jsoup.parse(doc.documentBody);
+Document parsedDocument = Jsoup.parse(doc.documentBody.decode());
 if (abs.hasAds(parsedDocument)) {
 System.out.println(doc.url);