From 79ce4de2ab0667bcd0808e195049cdcf7b4c00c0 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 20 Nov 2024 15:27:05 +0100 Subject: [PATCH] (model) Remove deprecated fields from CrawledDocument and CrawledDomain --- .../sideload/SideloaderProcessing.java | 4 - .../converting/ConvertingIntegrationTest.java | 4 - .../ParquetSerializableCrawlDataStream.java | 5 +- .../model/crawldata/CrawledDocument.java | 44 +----- .../model/crawldata/CrawledDomain.java | 132 +++--------------- .../crawldata/SerializableCrawlData.java | 2 +- .../livecrawler/LiveCrawlDataSet.java | 4 - 7 files changed, 23 insertions(+), 172 deletions(-) diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java index b7cf244b..95729851 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java @@ -51,10 +51,6 @@ public class SideloaderProcessing { "NP", "", body, - Integer.toHexString(url.hashCode()), - url, - "", - "SIDELOAD", false, null, null diff --git a/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java index 728d57ca..7cc451b4 100644 --- a/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java +++ b/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java @@ -147,10 +147,6 @@ public class ConvertingIntegrationTest { "", "", readClassPathFile(p.toString()), - Double.toString(Math.random()), - "https://memex.marginalia.nu/" + file, - null, - "", false, null, null diff --git a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java index 11c08267..ac22afe2 100644 --- a/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java @@ -144,10 +144,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial "", nextRecord.headers, bodyString, - Long.toHexString(hash.hashNearlyASCII(bodyString)), // this field isn't actually used, maybe we can skip calculating it? - nextRecord.url, - null, - "", + // this field isn't actually used, maybe we can skip calculating it? nextRecord.cookies, lastModified, etag)); diff --git a/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java index 9d01d551..656e4b0f 100644 --- a/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java @@ -4,7 +4,7 @@ import nu.marginalia.model.EdgeUrl; import org.apache.commons.lang3.StringUtils; import org.jetbrains.annotations.Nullable; -public class CrawledDocument implements SerializableCrawlData { +public final class CrawledDocument implements SerializableCrawlData { public String crawlId; public String url; @@ -21,16 +21,6 @@ public class CrawledDocument implements SerializableCrawlData { public String documentBody; - @Deprecated - public String documentBodyHash; - - @Deprecated - public String canonicalUrl; - public String redirectUrl; - - @Deprecated - public String recrawlState; - /** * This is not guaranteed to be set in all versions of the format, * information may come in CrawledDomain instead @@ -40,7 +30,7 @@ public class CrawledDocument implements SerializableCrawlData { public String lastModifiedMaybe; public String etagMaybe; - public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, String documentBody, String documentBodyHash, String canonicalUrl, String redirectUrl, String recrawlState, Boolean hasCookies, String lastModifiedMaybe, String etagMaybe) { + public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, String documentBody, Boolean hasCookies, String lastModifiedMaybe, String etagMaybe) { this.crawlId = crawlId; this.url = url; this.contentType = contentType; @@ -50,10 +40,6 @@ public class CrawledDocument implements SerializableCrawlData { this.crawlerStatusDesc = crawlerStatusDesc; this.headers = headers; this.documentBody = documentBody; - this.documentBodyHash = documentBodyHash; - this.canonicalUrl = canonicalUrl; - this.redirectUrl = redirectUrl; - this.recrawlState = recrawlState; this.hasCookies = hasCookies; this.lastModifiedMaybe = lastModifiedMaybe; this.etagMaybe = etagMaybe; @@ -120,7 +106,7 @@ public class CrawledDocument implements SerializableCrawlData { } public String toString() { - return "CrawledDocument(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBody=" + this.documentBody + ", documentBodyHash=" + this.documentBodyHash + ", canonicalUrl=" + this.canonicalUrl + ", redirectUrl=" + this.redirectUrl + ", recrawlState=" + this.recrawlState + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")"; + return "CrawledDocument(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBody=" + this.documentBody + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")"; } public static class CrawledDocumentBuilder { @@ -133,9 +119,6 @@ public class CrawledDocument implements SerializableCrawlData { private String crawlerStatusDesc; private @Nullable String headers; private String documentBody; - private String documentBodyHash; - private String canonicalUrl; - private String redirectUrl; private String recrawlState; private Boolean hasCookies; private String lastModifiedMaybe; @@ -189,23 +172,6 @@ public class CrawledDocument implements SerializableCrawlData { return this; } - @Deprecated - public CrawledDocumentBuilder documentBodyHash(String documentBodyHash) { - this.documentBodyHash = documentBodyHash; - return this; - } - - @Deprecated - public CrawledDocumentBuilder canonicalUrl(String canonicalUrl) { - this.canonicalUrl = canonicalUrl; - return this; - } - - public CrawledDocumentBuilder redirectUrl(String redirectUrl) { - this.redirectUrl = redirectUrl; - return this; - } - @Deprecated public CrawledDocumentBuilder recrawlState(String recrawlState) { this.recrawlState = recrawlState; @@ -228,11 +194,11 @@ public class CrawledDocument implements SerializableCrawlData { } public CrawledDocument build() { - return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBody, this.documentBodyHash, this.canonicalUrl, this.redirectUrl, this.recrawlState, this.hasCookies, this.lastModifiedMaybe, this.etagMaybe); + return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBody, this.hasCookies, this.lastModifiedMaybe, this.etagMaybe); } public String toString() { - return "CrawledDocument.CrawledDocumentBuilder(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBody=" + this.documentBody + ", documentBodyHash=" + this.documentBodyHash + ", canonicalUrl=" + this.canonicalUrl + ", redirectUrl=" + this.redirectUrl + ", recrawlState=" + this.recrawlState + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")"; + return "CrawledDocument.CrawledDocumentBuilder(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBody=" + this.documentBody + ", recrawlState=" + this.recrawlState + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")"; } } } diff --git a/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDomain.java b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDomain.java index c3005aee..33addc99 100644 --- a/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDomain.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDomain.java @@ -1,8 +1,9 @@ package nu.marginalia.model.crawldata; import java.util.List; +import java.util.Objects; -public class CrawledDomain implements SerializableCrawlData { +public final class CrawledDomain implements SerializableCrawlData { public String domain; public String redirectDomain; @@ -11,6 +12,7 @@ public class CrawledDomain implements SerializableCrawlData { public String crawlerStatusDesc; public String ip; + @Deprecated // This used to be populated, but is no longer public List doc; /** @@ -29,15 +31,6 @@ public class CrawledDomain implements SerializableCrawlData { this.cookies = cookies; } - public static CrawledDomainBuilder builder() { - return new CrawledDomainBuilder(); - } - - public int size() { - if (doc == null) return 0; - return doc.size(); - } - public String getDomain() { return this.domain; } @@ -94,119 +87,26 @@ public class CrawledDomain implements SerializableCrawlData { this.cookies = cookies; } - public boolean equals(final Object o) { - if (o == this) return true; - if (!(o instanceof CrawledDomain)) return false; - final CrawledDomain other = (CrawledDomain) o; - if (!other.canEqual((Object) this)) return false; - final Object this$domain = this.getDomain(); - final Object other$domain = other.getDomain(); - if (this$domain == null ? other$domain != null : !this$domain.equals(other$domain)) return false; - final Object this$redirectDomain = this.getRedirectDomain(); - final Object other$redirectDomain = other.getRedirectDomain(); - if (this$redirectDomain == null ? other$redirectDomain != null : !this$redirectDomain.equals(other$redirectDomain)) - return false; - final Object this$crawlerStatus = this.getCrawlerStatus(); - final Object other$crawlerStatus = other.getCrawlerStatus(); - if (this$crawlerStatus == null ? other$crawlerStatus != null : !this$crawlerStatus.equals(other$crawlerStatus)) - return false; - final Object this$crawlerStatusDesc = this.getCrawlerStatusDesc(); - final Object other$crawlerStatusDesc = other.getCrawlerStatusDesc(); - if (this$crawlerStatusDesc == null ? other$crawlerStatusDesc != null : !this$crawlerStatusDesc.equals(other$crawlerStatusDesc)) - return false; - final Object this$ip = this.getIp(); - final Object other$ip = other.getIp(); - if (this$ip == null ? other$ip != null : !this$ip.equals(other$ip)) return false; - final Object this$doc = this.getDoc(); - final Object other$doc = other.getDoc(); - if (this$doc == null ? other$doc != null : !this$doc.equals(other$doc)) return false; - final Object this$cookies = this.getCookies(); - final Object other$cookies = other.getCookies(); - if (this$cookies == null ? other$cookies != null : !this$cookies.equals(other$cookies)) return false; - return true; - } - - protected boolean canEqual(final Object other) { - return other instanceof CrawledDomain; + @Override + public boolean equals(Object o) { + if (!(o instanceof CrawledDomain that)) return false; + + return Objects.equals(domain, that.domain) && Objects.equals(redirectDomain, that.redirectDomain) && Objects.equals(crawlerStatus, that.crawlerStatus) && Objects.equals(crawlerStatusDesc, that.crawlerStatusDesc) && Objects.equals(ip, that.ip) && Objects.equals(doc, that.doc) && Objects.equals(cookies, that.cookies); } + @Override public int hashCode() { - final int PRIME = 59; - int result = 1; - final Object $domain = this.getDomain(); - result = result * PRIME + ($domain == null ? 43 : $domain.hashCode()); - final Object $redirectDomain = this.getRedirectDomain(); - result = result * PRIME + ($redirectDomain == null ? 43 : $redirectDomain.hashCode()); - final Object $crawlerStatus = this.getCrawlerStatus(); - result = result * PRIME + ($crawlerStatus == null ? 43 : $crawlerStatus.hashCode()); - final Object $crawlerStatusDesc = this.getCrawlerStatusDesc(); - result = result * PRIME + ($crawlerStatusDesc == null ? 43 : $crawlerStatusDesc.hashCode()); - final Object $ip = this.getIp(); - result = result * PRIME + ($ip == null ? 43 : $ip.hashCode()); - final Object $doc = this.getDoc(); - result = result * PRIME + ($doc == null ? 43 : $doc.hashCode()); - final Object $cookies = this.getCookies(); - result = result * PRIME + ($cookies == null ? 43 : $cookies.hashCode()); + int result = Objects.hashCode(domain); + result = 31 * result + Objects.hashCode(redirectDomain); + result = 31 * result + Objects.hashCode(crawlerStatus); + result = 31 * result + Objects.hashCode(crawlerStatusDesc); + result = 31 * result + Objects.hashCode(ip); + result = 31 * result + Objects.hashCode(doc); + result = 31 * result + Objects.hashCode(cookies); return result; } public String toString() { return "CrawledDomain(domain=" + this.getDomain() + ", redirectDomain=" + this.getRedirectDomain() + ", crawlerStatus=" + this.getCrawlerStatus() + ", crawlerStatusDesc=" + this.getCrawlerStatusDesc() + ", ip=" + this.getIp() + ", doc=" + this.getDoc() + ", cookies=" + this.getCookies() + ")"; } - - public static class CrawledDomainBuilder { - private String domain; - private String redirectDomain; - private String crawlerStatus; - private String crawlerStatusDesc; - private String ip; - private List doc; - private List cookies; - - CrawledDomainBuilder() { - } - - public CrawledDomainBuilder domain(String domain) { - this.domain = domain; - return this; - } - - public CrawledDomainBuilder redirectDomain(String redirectDomain) { - this.redirectDomain = redirectDomain; - return this; - } - - public CrawledDomainBuilder crawlerStatus(String crawlerStatus) { - this.crawlerStatus = crawlerStatus; - return this; - } - - public CrawledDomainBuilder crawlerStatusDesc(String crawlerStatusDesc) { - this.crawlerStatusDesc = crawlerStatusDesc; - return this; - } - - public CrawledDomainBuilder ip(String ip) { - this.ip = ip; - return this; - } - - public CrawledDomainBuilder doc(List doc) { - this.doc = doc; - return this; - } - - public CrawledDomainBuilder cookies(List cookies) { - this.cookies = cookies; - return this; - } - - public CrawledDomain build() { - return new CrawledDomain(this.domain, this.redirectDomain, this.crawlerStatus, this.crawlerStatusDesc, this.ip, this.doc, this.cookies); - } - - public String toString() { - return "CrawledDomain.CrawledDomainBuilder(domain=" + this.domain + ", redirectDomain=" + this.redirectDomain + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", ip=" + this.ip + ", doc=" + this.doc + ", cookies=" + this.cookies + ")"; - } - } } diff --git a/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/SerializableCrawlData.java b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/SerializableCrawlData.java index 58d25dea..dfb26597 100644 --- a/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/SerializableCrawlData.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/SerializableCrawlData.java @@ -1,5 +1,5 @@ package nu.marginalia.model.crawldata; -public interface SerializableCrawlData { +public sealed interface SerializableCrawlData permits CrawledDocument, CrawledDomain { String getDomain(); } diff --git a/code/processes/live-crawler/java/nu/marginalia/livecrawler/LiveCrawlDataSet.java b/code/processes/live-crawler/java/nu/marginalia/livecrawler/LiveCrawlDataSet.java index 1fbf561b..7b4f981e 100644 --- a/code/processes/live-crawler/java/nu/marginalia/livecrawler/LiveCrawlDataSet.java +++ b/code/processes/live-crawler/java/nu/marginalia/livecrawler/LiveCrawlDataSet.java @@ -161,10 +161,6 @@ public class LiveCrawlDataSet implements AutoCloseable { "", headers, body, - body, - Integer.toString(body.hashCode()), - null, - "LIVE", false, "", ""