(model) Remove deprecated fields from CrawledDocument and CrawledDomain

This commit is contained in:
Viktor Lofgren 2024-11-20 15:27:05 +01:00
parent d6575dfee4
commit 79ce4de2ab
7 changed files with 23 additions and 172 deletions

View File

@ -51,10 +51,6 @@ public class SideloaderProcessing {
"NP",
"",
body,
Integer.toHexString(url.hashCode()),
url,
"",
"SIDELOAD",
false,
null,
null

View File

@ -147,10 +147,6 @@ public class ConvertingIntegrationTest {
"",
"",
readClassPathFile(p.toString()),
Double.toString(Math.random()),
"https://memex.marginalia.nu/" + file,
null,
"",
false,
null,
null

View File

@ -144,10 +144,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
"",
nextRecord.headers,
bodyString,
Long.toHexString(hash.hashNearlyASCII(bodyString)), // this field isn't actually used, maybe we can skip calculating it?
nextRecord.url,
null,
"",
// (the unused document body hash is no longer calculated; that field was removed from CrawledDocument)
nextRecord.cookies,
lastModified,
etag));

View File

@ -4,7 +4,7 @@ import nu.marginalia.model.EdgeUrl;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.Nullable;
public class CrawledDocument implements SerializableCrawlData {
public final class CrawledDocument implements SerializableCrawlData {
public String crawlId;
public String url;
@ -21,16 +21,6 @@ public class CrawledDocument implements SerializableCrawlData {
public String documentBody;
@Deprecated
public String documentBodyHash;
@Deprecated
public String canonicalUrl;
public String redirectUrl;
@Deprecated
public String recrawlState;
/**
* This is not guaranteed to be set in all versions of the format,
* information may come in CrawledDomain instead
@ -40,7 +30,7 @@ public class CrawledDocument implements SerializableCrawlData {
public String lastModifiedMaybe;
public String etagMaybe;
public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, String documentBody, String documentBodyHash, String canonicalUrl, String redirectUrl, String recrawlState, Boolean hasCookies, String lastModifiedMaybe, String etagMaybe) {
public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, String documentBody, Boolean hasCookies, String lastModifiedMaybe, String etagMaybe) {
this.crawlId = crawlId;
this.url = url;
this.contentType = contentType;
@ -50,10 +40,6 @@ public class CrawledDocument implements SerializableCrawlData {
this.crawlerStatusDesc = crawlerStatusDesc;
this.headers = headers;
this.documentBody = documentBody;
this.documentBodyHash = documentBodyHash;
this.canonicalUrl = canonicalUrl;
this.redirectUrl = redirectUrl;
this.recrawlState = recrawlState;
this.hasCookies = hasCookies;
this.lastModifiedMaybe = lastModifiedMaybe;
this.etagMaybe = etagMaybe;
@ -120,7 +106,7 @@ public class CrawledDocument implements SerializableCrawlData {
}
public String toString() {
return "CrawledDocument(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBody=" + this.documentBody + ", documentBodyHash=" + this.documentBodyHash + ", canonicalUrl=" + this.canonicalUrl + ", redirectUrl=" + this.redirectUrl + ", recrawlState=" + this.recrawlState + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")";
return "CrawledDocument(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBody=" + this.documentBody + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")";
}
public static class CrawledDocumentBuilder {
@ -133,9 +119,6 @@ public class CrawledDocument implements SerializableCrawlData {
private String crawlerStatusDesc;
private @Nullable String headers;
private String documentBody;
private String documentBodyHash;
private String canonicalUrl;
private String redirectUrl;
private String recrawlState;
private Boolean hasCookies;
private String lastModifiedMaybe;
@ -189,23 +172,6 @@ public class CrawledDocument implements SerializableCrawlData {
return this;
}
@Deprecated
public CrawledDocumentBuilder documentBodyHash(String documentBodyHash) {
this.documentBodyHash = documentBodyHash;
return this;
}
@Deprecated
public CrawledDocumentBuilder canonicalUrl(String canonicalUrl) {
this.canonicalUrl = canonicalUrl;
return this;
}
public CrawledDocumentBuilder redirectUrl(String redirectUrl) {
this.redirectUrl = redirectUrl;
return this;
}
@Deprecated
public CrawledDocumentBuilder recrawlState(String recrawlState) {
this.recrawlState = recrawlState;
@ -228,11 +194,11 @@ public class CrawledDocument implements SerializableCrawlData {
}
public CrawledDocument build() {
return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBody, this.documentBodyHash, this.canonicalUrl, this.redirectUrl, this.recrawlState, this.hasCookies, this.lastModifiedMaybe, this.etagMaybe);
return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBody, this.hasCookies, this.lastModifiedMaybe, this.etagMaybe);
}
public String toString() {
return "CrawledDocument.CrawledDocumentBuilder(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBody=" + this.documentBody + ", documentBodyHash=" + this.documentBodyHash + ", canonicalUrl=" + this.canonicalUrl + ", redirectUrl=" + this.redirectUrl + ", recrawlState=" + this.recrawlState + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")";
return "CrawledDocument.CrawledDocumentBuilder(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBody=" + this.documentBody + ", recrawlState=" + this.recrawlState + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")";
}
}
}

View File

@ -1,8 +1,9 @@
package nu.marginalia.model.crawldata;
import java.util.List;
import java.util.Objects;
public class CrawledDomain implements SerializableCrawlData {
public final class CrawledDomain implements SerializableCrawlData {
public String domain;
public String redirectDomain;
@ -11,6 +12,7 @@ public class CrawledDomain implements SerializableCrawlData {
public String crawlerStatusDesc;
public String ip;
@Deprecated // This used to be populated, but is no longer
public List<CrawledDocument> doc;
/**
@ -29,15 +31,6 @@ public class CrawledDomain implements SerializableCrawlData {
this.cookies = cookies;
}
public static CrawledDomainBuilder builder() {
return new CrawledDomainBuilder();
}
public int size() {
if (doc == null) return 0;
return doc.size();
}
public String getDomain() {
return this.domain;
}
@ -94,119 +87,26 @@ public class CrawledDomain implements SerializableCrawlData {
this.cookies = cookies;
}
public boolean equals(final Object o) {
if (o == this) return true;
if (!(o instanceof CrawledDomain)) return false;
final CrawledDomain other = (CrawledDomain) o;
if (!other.canEqual((Object) this)) return false;
final Object this$domain = this.getDomain();
final Object other$domain = other.getDomain();
if (this$domain == null ? other$domain != null : !this$domain.equals(other$domain)) return false;
final Object this$redirectDomain = this.getRedirectDomain();
final Object other$redirectDomain = other.getRedirectDomain();
if (this$redirectDomain == null ? other$redirectDomain != null : !this$redirectDomain.equals(other$redirectDomain))
return false;
final Object this$crawlerStatus = this.getCrawlerStatus();
final Object other$crawlerStatus = other.getCrawlerStatus();
if (this$crawlerStatus == null ? other$crawlerStatus != null : !this$crawlerStatus.equals(other$crawlerStatus))
return false;
final Object this$crawlerStatusDesc = this.getCrawlerStatusDesc();
final Object other$crawlerStatusDesc = other.getCrawlerStatusDesc();
if (this$crawlerStatusDesc == null ? other$crawlerStatusDesc != null : !this$crawlerStatusDesc.equals(other$crawlerStatusDesc))
return false;
final Object this$ip = this.getIp();
final Object other$ip = other.getIp();
if (this$ip == null ? other$ip != null : !this$ip.equals(other$ip)) return false;
final Object this$doc = this.getDoc();
final Object other$doc = other.getDoc();
if (this$doc == null ? other$doc != null : !this$doc.equals(other$doc)) return false;
final Object this$cookies = this.getCookies();
final Object other$cookies = other.getCookies();
if (this$cookies == null ? other$cookies != null : !this$cookies.equals(other$cookies)) return false;
return true;
}
protected boolean canEqual(final Object other) {
return other instanceof CrawledDomain;
@Override
public boolean equals(Object o) {
if (!(o instanceof CrawledDomain that)) return false;
return Objects.equals(domain, that.domain) && Objects.equals(redirectDomain, that.redirectDomain) && Objects.equals(crawlerStatus, that.crawlerStatus) && Objects.equals(crawlerStatusDesc, that.crawlerStatusDesc) && Objects.equals(ip, that.ip) && Objects.equals(doc, that.doc) && Objects.equals(cookies, that.cookies);
}
@Override
public int hashCode() {
final int PRIME = 59;
int result = 1;
final Object $domain = this.getDomain();
result = result * PRIME + ($domain == null ? 43 : $domain.hashCode());
final Object $redirectDomain = this.getRedirectDomain();
result = result * PRIME + ($redirectDomain == null ? 43 : $redirectDomain.hashCode());
final Object $crawlerStatus = this.getCrawlerStatus();
result = result * PRIME + ($crawlerStatus == null ? 43 : $crawlerStatus.hashCode());
final Object $crawlerStatusDesc = this.getCrawlerStatusDesc();
result = result * PRIME + ($crawlerStatusDesc == null ? 43 : $crawlerStatusDesc.hashCode());
final Object $ip = this.getIp();
result = result * PRIME + ($ip == null ? 43 : $ip.hashCode());
final Object $doc = this.getDoc();
result = result * PRIME + ($doc == null ? 43 : $doc.hashCode());
final Object $cookies = this.getCookies();
result = result * PRIME + ($cookies == null ? 43 : $cookies.hashCode());
int result = Objects.hashCode(domain);
result = 31 * result + Objects.hashCode(redirectDomain);
result = 31 * result + Objects.hashCode(crawlerStatus);
result = 31 * result + Objects.hashCode(crawlerStatusDesc);
result = 31 * result + Objects.hashCode(ip);
result = 31 * result + Objects.hashCode(doc);
result = 31 * result + Objects.hashCode(cookies);
return result;
}
public String toString() {
return "CrawledDomain(domain=" + this.getDomain() + ", redirectDomain=" + this.getRedirectDomain() + ", crawlerStatus=" + this.getCrawlerStatus() + ", crawlerStatusDesc=" + this.getCrawlerStatusDesc() + ", ip=" + this.getIp() + ", doc=" + this.getDoc() + ", cookies=" + this.getCookies() + ")";
}
public static class CrawledDomainBuilder {
private String domain;
private String redirectDomain;
private String crawlerStatus;
private String crawlerStatusDesc;
private String ip;
private List<CrawledDocument> doc;
private List<String> cookies;
CrawledDomainBuilder() {
}
public CrawledDomainBuilder domain(String domain) {
this.domain = domain;
return this;
}
public CrawledDomainBuilder redirectDomain(String redirectDomain) {
this.redirectDomain = redirectDomain;
return this;
}
public CrawledDomainBuilder crawlerStatus(String crawlerStatus) {
this.crawlerStatus = crawlerStatus;
return this;
}
public CrawledDomainBuilder crawlerStatusDesc(String crawlerStatusDesc) {
this.crawlerStatusDesc = crawlerStatusDesc;
return this;
}
public CrawledDomainBuilder ip(String ip) {
this.ip = ip;
return this;
}
public CrawledDomainBuilder doc(List<CrawledDocument> doc) {
this.doc = doc;
return this;
}
public CrawledDomainBuilder cookies(List<String> cookies) {
this.cookies = cookies;
return this;
}
public CrawledDomain build() {
return new CrawledDomain(this.domain, this.redirectDomain, this.crawlerStatus, this.crawlerStatusDesc, this.ip, this.doc, this.cookies);
}
public String toString() {
return "CrawledDomain.CrawledDomainBuilder(domain=" + this.domain + ", redirectDomain=" + this.redirectDomain + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", ip=" + this.ip + ", doc=" + this.doc + ", cookies=" + this.cookies + ")";
}
}
}

View File

@ -1,5 +1,5 @@
package nu.marginalia.model.crawldata;
public interface SerializableCrawlData {
public sealed interface SerializableCrawlData permits CrawledDocument, CrawledDomain {
String getDomain();
}

View File

@ -161,10 +161,6 @@ public class LiveCrawlDataSet implements AutoCloseable {
"",
headers,
body,
body,
Integer.toString(body.hashCode()),
null,
"LIVE",
false,
"",
""