(model) Remove deprecated fields from CrawledDocument and CrawledDomain

This commit is contained in:
Viktor Lofgren 2024-11-20 15:27:05 +01:00
parent d6575dfee4
commit 79ce4de2ab
7 changed files with 23 additions and 172 deletions

View File

@ -51,10 +51,6 @@ public class SideloaderProcessing {
"NP", "NP",
"", "",
body, body,
Integer.toHexString(url.hashCode()),
url,
"",
"SIDELOAD",
false, false,
null, null,
null null

View File

@ -147,10 +147,6 @@ public class ConvertingIntegrationTest {
"", "",
"", "",
readClassPathFile(p.toString()), readClassPathFile(p.toString()),
Double.toString(Math.random()),
"https://memex.marginalia.nu/" + file,
null,
"",
false, false,
null, null,
null null

View File

@ -144,10 +144,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
"", "",
nextRecord.headers, nextRecord.headers,
bodyString, bodyString,
Long.toHexString(hash.hashNearlyASCII(bodyString)), // this field isn't actually used, maybe we can skip calculating it? // this field isn't actually used, maybe we can skip calculating it?
nextRecord.url,
null,
"",
nextRecord.cookies, nextRecord.cookies,
lastModified, lastModified,
etag)); etag));

View File

@ -4,7 +4,7 @@ import nu.marginalia.model.EdgeUrl;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.Nullable; import org.jetbrains.annotations.Nullable;
public class CrawledDocument implements SerializableCrawlData { public final class CrawledDocument implements SerializableCrawlData {
public String crawlId; public String crawlId;
public String url; public String url;
@ -21,16 +21,6 @@ public class CrawledDocument implements SerializableCrawlData {
public String documentBody; public String documentBody;
@Deprecated
public String documentBodyHash;
@Deprecated
public String canonicalUrl;
public String redirectUrl;
@Deprecated
public String recrawlState;
/** /**
* This is not guaranteed to be set in all versions of the format, * This is not guaranteed to be set in all versions of the format,
* information may come in CrawledDomain instead * information may come in CrawledDomain instead
@ -40,7 +30,7 @@ public class CrawledDocument implements SerializableCrawlData {
public String lastModifiedMaybe; public String lastModifiedMaybe;
public String etagMaybe; public String etagMaybe;
public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, String documentBody, String documentBodyHash, String canonicalUrl, String redirectUrl, String recrawlState, Boolean hasCookies, String lastModifiedMaybe, String etagMaybe) { public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, String documentBody, Boolean hasCookies, String lastModifiedMaybe, String etagMaybe) {
this.crawlId = crawlId; this.crawlId = crawlId;
this.url = url; this.url = url;
this.contentType = contentType; this.contentType = contentType;
@ -50,10 +40,6 @@ public class CrawledDocument implements SerializableCrawlData {
this.crawlerStatusDesc = crawlerStatusDesc; this.crawlerStatusDesc = crawlerStatusDesc;
this.headers = headers; this.headers = headers;
this.documentBody = documentBody; this.documentBody = documentBody;
this.documentBodyHash = documentBodyHash;
this.canonicalUrl = canonicalUrl;
this.redirectUrl = redirectUrl;
this.recrawlState = recrawlState;
this.hasCookies = hasCookies; this.hasCookies = hasCookies;
this.lastModifiedMaybe = lastModifiedMaybe; this.lastModifiedMaybe = lastModifiedMaybe;
this.etagMaybe = etagMaybe; this.etagMaybe = etagMaybe;
@ -120,7 +106,7 @@ public class CrawledDocument implements SerializableCrawlData {
} }
public String toString() { public String toString() {
return "CrawledDocument(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBody=" + this.documentBody + ", documentBodyHash=" + this.documentBodyHash + ", canonicalUrl=" + this.canonicalUrl + ", redirectUrl=" + this.redirectUrl + ", recrawlState=" + this.recrawlState + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")"; return "CrawledDocument(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBody=" + this.documentBody + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")";
} }
public static class CrawledDocumentBuilder { public static class CrawledDocumentBuilder {
@ -133,9 +119,6 @@ public class CrawledDocument implements SerializableCrawlData {
private String crawlerStatusDesc; private String crawlerStatusDesc;
private @Nullable String headers; private @Nullable String headers;
private String documentBody; private String documentBody;
private String documentBodyHash;
private String canonicalUrl;
private String redirectUrl;
private String recrawlState; private String recrawlState;
private Boolean hasCookies; private Boolean hasCookies;
private String lastModifiedMaybe; private String lastModifiedMaybe;
@ -189,23 +172,6 @@ public class CrawledDocument implements SerializableCrawlData {
return this; return this;
} }
@Deprecated
public CrawledDocumentBuilder documentBodyHash(String documentBodyHash) {
this.documentBodyHash = documentBodyHash;
return this;
}
@Deprecated
public CrawledDocumentBuilder canonicalUrl(String canonicalUrl) {
this.canonicalUrl = canonicalUrl;
return this;
}
public CrawledDocumentBuilder redirectUrl(String redirectUrl) {
this.redirectUrl = redirectUrl;
return this;
}
@Deprecated @Deprecated
public CrawledDocumentBuilder recrawlState(String recrawlState) { public CrawledDocumentBuilder recrawlState(String recrawlState) {
this.recrawlState = recrawlState; this.recrawlState = recrawlState;
@ -228,11 +194,11 @@ public class CrawledDocument implements SerializableCrawlData {
} }
public CrawledDocument build() { public CrawledDocument build() {
return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBody, this.documentBodyHash, this.canonicalUrl, this.redirectUrl, this.recrawlState, this.hasCookies, this.lastModifiedMaybe, this.etagMaybe); return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBody, this.hasCookies, this.lastModifiedMaybe, this.etagMaybe);
} }
public String toString() { public String toString() {
return "CrawledDocument.CrawledDocumentBuilder(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBody=" + this.documentBody + ", documentBodyHash=" + this.documentBodyHash + ", canonicalUrl=" + this.canonicalUrl + ", redirectUrl=" + this.redirectUrl + ", recrawlState=" + this.recrawlState + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")"; return "CrawledDocument.CrawledDocumentBuilder(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBody=" + this.documentBody + ", recrawlState=" + this.recrawlState + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")";
} }
} }
} }

View File

@ -1,8 +1,9 @@
package nu.marginalia.model.crawldata; package nu.marginalia.model.crawldata;
import java.util.List; import java.util.List;
import java.util.Objects;
public class CrawledDomain implements SerializableCrawlData { public final class CrawledDomain implements SerializableCrawlData {
public String domain; public String domain;
public String redirectDomain; public String redirectDomain;
@ -11,6 +12,7 @@ public class CrawledDomain implements SerializableCrawlData {
public String crawlerStatusDesc; public String crawlerStatusDesc;
public String ip; public String ip;
@Deprecated // This used to be populated, but is no longer
public List<CrawledDocument> doc; public List<CrawledDocument> doc;
/** /**
@ -29,15 +31,6 @@ public class CrawledDomain implements SerializableCrawlData {
this.cookies = cookies; this.cookies = cookies;
} }
public static CrawledDomainBuilder builder() {
return new CrawledDomainBuilder();
}
public int size() {
if (doc == null) return 0;
return doc.size();
}
public String getDomain() { public String getDomain() {
return this.domain; return this.domain;
} }
@ -94,119 +87,26 @@ public class CrawledDomain implements SerializableCrawlData {
this.cookies = cookies; this.cookies = cookies;
} }
public boolean equals(final Object o) { @Override
if (o == this) return true; public boolean equals(Object o) {
if (!(o instanceof CrawledDomain)) return false; if (!(o instanceof CrawledDomain that)) return false;
final CrawledDomain other = (CrawledDomain) o;
if (!other.canEqual((Object) this)) return false; return Objects.equals(domain, that.domain) && Objects.equals(redirectDomain, that.redirectDomain) && Objects.equals(crawlerStatus, that.crawlerStatus) && Objects.equals(crawlerStatusDesc, that.crawlerStatusDesc) && Objects.equals(ip, that.ip) && Objects.equals(doc, that.doc) && Objects.equals(cookies, that.cookies);
final Object this$domain = this.getDomain();
final Object other$domain = other.getDomain();
if (this$domain == null ? other$domain != null : !this$domain.equals(other$domain)) return false;
final Object this$redirectDomain = this.getRedirectDomain();
final Object other$redirectDomain = other.getRedirectDomain();
if (this$redirectDomain == null ? other$redirectDomain != null : !this$redirectDomain.equals(other$redirectDomain))
return false;
final Object this$crawlerStatus = this.getCrawlerStatus();
final Object other$crawlerStatus = other.getCrawlerStatus();
if (this$crawlerStatus == null ? other$crawlerStatus != null : !this$crawlerStatus.equals(other$crawlerStatus))
return false;
final Object this$crawlerStatusDesc = this.getCrawlerStatusDesc();
final Object other$crawlerStatusDesc = other.getCrawlerStatusDesc();
if (this$crawlerStatusDesc == null ? other$crawlerStatusDesc != null : !this$crawlerStatusDesc.equals(other$crawlerStatusDesc))
return false;
final Object this$ip = this.getIp();
final Object other$ip = other.getIp();
if (this$ip == null ? other$ip != null : !this$ip.equals(other$ip)) return false;
final Object this$doc = this.getDoc();
final Object other$doc = other.getDoc();
if (this$doc == null ? other$doc != null : !this$doc.equals(other$doc)) return false;
final Object this$cookies = this.getCookies();
final Object other$cookies = other.getCookies();
if (this$cookies == null ? other$cookies != null : !this$cookies.equals(other$cookies)) return false;
return true;
}
protected boolean canEqual(final Object other) {
return other instanceof CrawledDomain;
} }
@Override
public int hashCode() { public int hashCode() {
final int PRIME = 59; int result = Objects.hashCode(domain);
int result = 1; result = 31 * result + Objects.hashCode(redirectDomain);
final Object $domain = this.getDomain(); result = 31 * result + Objects.hashCode(crawlerStatus);
result = result * PRIME + ($domain == null ? 43 : $domain.hashCode()); result = 31 * result + Objects.hashCode(crawlerStatusDesc);
final Object $redirectDomain = this.getRedirectDomain(); result = 31 * result + Objects.hashCode(ip);
result = result * PRIME + ($redirectDomain == null ? 43 : $redirectDomain.hashCode()); result = 31 * result + Objects.hashCode(doc);
final Object $crawlerStatus = this.getCrawlerStatus(); result = 31 * result + Objects.hashCode(cookies);
result = result * PRIME + ($crawlerStatus == null ? 43 : $crawlerStatus.hashCode());
final Object $crawlerStatusDesc = this.getCrawlerStatusDesc();
result = result * PRIME + ($crawlerStatusDesc == null ? 43 : $crawlerStatusDesc.hashCode());
final Object $ip = this.getIp();
result = result * PRIME + ($ip == null ? 43 : $ip.hashCode());
final Object $doc = this.getDoc();
result = result * PRIME + ($doc == null ? 43 : $doc.hashCode());
final Object $cookies = this.getCookies();
result = result * PRIME + ($cookies == null ? 43 : $cookies.hashCode());
return result; return result;
} }
public String toString() { public String toString() {
return "CrawledDomain(domain=" + this.getDomain() + ", redirectDomain=" + this.getRedirectDomain() + ", crawlerStatus=" + this.getCrawlerStatus() + ", crawlerStatusDesc=" + this.getCrawlerStatusDesc() + ", ip=" + this.getIp() + ", doc=" + this.getDoc() + ", cookies=" + this.getCookies() + ")"; return "CrawledDomain(domain=" + this.getDomain() + ", redirectDomain=" + this.getRedirectDomain() + ", crawlerStatus=" + this.getCrawlerStatus() + ", crawlerStatusDesc=" + this.getCrawlerStatusDesc() + ", ip=" + this.getIp() + ", doc=" + this.getDoc() + ", cookies=" + this.getCookies() + ")";
} }
public static class CrawledDomainBuilder {
private String domain;
private String redirectDomain;
private String crawlerStatus;
private String crawlerStatusDesc;
private String ip;
private List<CrawledDocument> doc;
private List<String> cookies;
CrawledDomainBuilder() {
}
public CrawledDomainBuilder domain(String domain) {
this.domain = domain;
return this;
}
public CrawledDomainBuilder redirectDomain(String redirectDomain) {
this.redirectDomain = redirectDomain;
return this;
}
public CrawledDomainBuilder crawlerStatus(String crawlerStatus) {
this.crawlerStatus = crawlerStatus;
return this;
}
public CrawledDomainBuilder crawlerStatusDesc(String crawlerStatusDesc) {
this.crawlerStatusDesc = crawlerStatusDesc;
return this;
}
public CrawledDomainBuilder ip(String ip) {
this.ip = ip;
return this;
}
public CrawledDomainBuilder doc(List<CrawledDocument> doc) {
this.doc = doc;
return this;
}
public CrawledDomainBuilder cookies(List<String> cookies) {
this.cookies = cookies;
return this;
}
public CrawledDomain build() {
return new CrawledDomain(this.domain, this.redirectDomain, this.crawlerStatus, this.crawlerStatusDesc, this.ip, this.doc, this.cookies);
}
public String toString() {
return "CrawledDomain.CrawledDomainBuilder(domain=" + this.domain + ", redirectDomain=" + this.redirectDomain + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", ip=" + this.ip + ", doc=" + this.doc + ", cookies=" + this.cookies + ")";
}
}
} }

View File

@ -1,5 +1,5 @@
package nu.marginalia.model.crawldata; package nu.marginalia.model.crawldata;
public interface SerializableCrawlData { public sealed interface SerializableCrawlData permits CrawledDocument, CrawledDomain {
String getDomain(); String getDomain();
} }

View File

@ -161,10 +161,6 @@ public class LiveCrawlDataSet implements AutoCloseable {
"", "",
headers, headers,
body, body,
body,
Integer.toString(body.hashCode()),
null,
"LIVE",
false, false,
"", "",
"" ""