Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
Compressed string component
This commit is contained in:
parent 728931c135
commit ed728b2680
@@ -2,6 +2,7 @@ package nu.marginalia.wmsa.client
 import com.google.gson.*;
 import marcono1234.gson.recordadapter.RecordTypeAdapterFactory;
+import nu.marginalia.util.bigstring.BigString;
 import nu.marginalia.wmsa.edge.model.EdgeDomain;
 import nu.marginalia.wmsa.edge.model.EdgeUrl;
 import nu.marginalia.wmsa.edge.model.id.EdgeId;
@@ -24,6 +25,8 @@ public class GsonFactory {
                 .registerTypeAdapter(EdgeDomain.class, (JsonDeserializer<EdgeDomain>) (json, typeOfT, context) -> new EdgeDomain(json.getAsString()))
                 .registerTypeAdapter(EdgeId.class, (JsonDeserializer<EdgeId<?>>) (json, typeOfT, context) -> new EdgeId<>(json.getAsInt()))
                 .registerTypeAdapter(EdgeId.class, (JsonSerializer<EdgeId<?>>) (src, typeOfSrc, context) -> new JsonPrimitive(src.id()))
+                .registerTypeAdapter(BigString.class, (JsonDeserializer<BigString>) (json, typeOfT, context) -> BigString.encode(json.getAsString()))
+                .registerTypeAdapter(BigString.class, (JsonSerializer<BigString>) (src, typeOfT, context) -> new JsonPrimitive(src.decode()))
                 .serializeSpecialFloatingPointValues()
                 .create();
     }
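For orientation, a round-trip through the adapters registered above might look like the sketch below. This is not part of the commit: it assumes GsonFactory exposes a static get() accessor for the configured Gson instance (assumed name) and that BigString.encode(String) and BigString#decode() behave as the lambdas imply.

// Sketch only -- not from this commit.
import com.google.gson.Gson;
import nu.marginalia.util.bigstring.BigString;
import nu.marginalia.wmsa.client.GsonFactory;

class BigStringJsonDemo {
    record Page(String url, BigString body) {}       // records are handled by RecordTypeAdapterFactory

    public static void main(String[] args) {
        Gson gson = GsonFactory.get();                // assumed accessor name
        Page page = new Page("https://example.com/",
                             BigString.encode("<html><body>hello</body></html>"));

        String json = gson.toJson(page);              // body is written as an ordinary JSON string
        Page back = gson.fromJson(json, Page.class);  // ...and re-encoded into a BigString on read

        System.out.println(back.body().decode());
    }
}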
@@ -150,7 +150,7 @@ public class LinkKeywordExtractorMain {

         for (var doc : crawledDomain.doc) {
             if (Objects.equals(doc.crawlerStatus, CrawlerDocumentStatus.OK.name())) {
-                anchorTextExtractor.processDocument(doc.url, doc.documentBody);
+                anchorTextExtractor.processDocument(doc.url, doc.documentBody.decode());
             }
         }
     }
@@ -1,6 +1,8 @@
 package nu.marginalia.wmsa.edge.crawling.model;

 import lombok.Builder;
+import nu.marginalia.util.bigstring.BigString;
+import nu.marginalia.util.bigstring.CompressedBigString;

 @Builder
 public class CrawledDocument implements SerializableCrawlData {
@@ -16,8 +18,7 @@ public class CrawledDocument implements SerializableCrawlData {
     public String crawlerStatusDesc;

     public String headers;
-    public String documentBody;
-
+    public BigString documentBody;
     public String documentBodyHash;

     public String canonicalUrl;
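The BigString and CompressedBigString sources are not part of this diff. From the call sites in this commit (encode(), decode(), getBytes()), their contract is roughly the following; this is an inference, not the actual code.

// Inferred shape only -- the real interface may differ.
public interface BigString {
    static BigString encode(String value) {
        return new CompressedBigString(value);   // assumption: encode() returns the compressing implementation
    }

    String decode();                             // decompress back to the original String
    byte[] getBytes();                           // raw bytes of the decoded content
}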
@@ -206,7 +206,7 @@ public class CrawlerRetreiver {

         if (doc.documentBody != null) {
-            doc.documentBodyHash = createHash(doc.documentBody);
+            doc.documentBodyHash = createHash(doc.documentBody.decode());

             Optional<Document> parsedDoc = parseDoc(doc);
             EdgeUrl url = new EdgeUrl(doc.url);
@@ -251,7 +251,7 @@ public class CrawlerRetreiver {
     private Optional<Document> parseDoc(CrawledDocument doc) {
         if (doc.documentBody == null)
             return Optional.empty();
-        return Optional.of(Jsoup.parse(doc.documentBody));
+        return Optional.of(Jsoup.parse(doc.documentBody.decode()));
     }

     public boolean isSameDomain(EdgeUrl url) {
@@ -7,6 +7,7 @@ import crawlercommons.robots.SimpleRobotRulesParser;
 import lombok.AllArgsConstructor;
 import lombok.SneakyThrows;
 import lombok.ToString;
+import nu.marginalia.util.bigstring.BigString;
 import nu.marginalia.wmsa.edge.crawling.model.CrawledDocument;
 import nu.marginalia.wmsa.edge.crawling.model.CrawlerDocumentStatus;
 import nu.marginalia.wmsa.edge.crawling.retreival.logic.ContentTypeLogic;
@@ -271,7 +272,7 @@ public class HttpFetcher {
                 .canonicalUrl(canonical)
                 .httpStatus(rsp.code())
                 .url(responseUrl.toString())
-                .documentBody(strData)
+                .documentBody(BigString.encode(strData))
                 .build();
     }

@@ -325,7 +326,7 @@ public class HttpFetcher {

     private SimpleRobotRules parseRobotsTxt(CrawledDocument doc) {
         return robotsParser.parseContent(doc.url,
-                doc.documentBody.getBytes(StandardCharsets.UTF_8),
+                doc.documentBody.getBytes(),
                 doc.contentType,
                 userAgent);
     }
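Dropping the explicit charset at this call site is behaviour-preserving only if BigString.getBytes() encodes the decoded text as UTF-8 itself, for example via something like the fragment below on the inferred interface above; this is an assumption, the actual implementation is not in this diff.

// Hypothetical default -- not from the commit.
default byte[] getBytes() {
    return decode().getBytes(java.nio.charset.StandardCharsets.UTF_8);
}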
@@ -48,7 +48,7 @@ public class AdblockTesterTool {


     private static void processDocument(CrawledDocument doc) {
-        Document parsedDocument = Jsoup.parse(doc.documentBody);
+        Document parsedDocument = Jsoup.parse(doc.documentBody.decode());

         if (simulator.hasAds(parsedDocument)) {
             System.out.println(doc.url);
@@ -84,7 +84,7 @@ public class CrawlDataExtractorTool {


     private static void processDocument(CrawledDocument doc) {
-        Document parsedDocument = Jsoup.parse(doc.documentBody);
+        Document parsedDocument = Jsoup.parse(doc.documentBody.decode());

         if (abs.hasAds(parsedDocument)) {
             System.out.println(doc.url);