mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
(model) Remove deprecated fields from CrawledDocument and CrawledDomain
This commit is contained in:
parent
d6575dfee4
commit
79ce4de2ab
@ -51,10 +51,6 @@ public class SideloaderProcessing {
|
|||||||
"NP",
|
"NP",
|
||||||
"",
|
"",
|
||||||
body,
|
body,
|
||||||
Integer.toHexString(url.hashCode()),
|
|
||||||
url,
|
|
||||||
"",
|
|
||||||
"SIDELOAD",
|
|
||||||
false,
|
false,
|
||||||
null,
|
null,
|
||||||
null
|
null
|
||||||
|
@ -147,10 +147,6 @@ public class ConvertingIntegrationTest {
|
|||||||
"",
|
"",
|
||||||
"",
|
"",
|
||||||
readClassPathFile(p.toString()),
|
readClassPathFile(p.toString()),
|
||||||
Double.toString(Math.random()),
|
|
||||||
"https://memex.marginalia.nu/" + file,
|
|
||||||
null,
|
|
||||||
"",
|
|
||||||
false,
|
false,
|
||||||
null,
|
null,
|
||||||
null
|
null
|
||||||
|
@ -144,10 +144,7 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial
|
|||||||
"",
|
"",
|
||||||
nextRecord.headers,
|
nextRecord.headers,
|
||||||
bodyString,
|
bodyString,
|
||||||
Long.toHexString(hash.hashNearlyASCII(bodyString)), // this field isn't actually used, maybe we can skip calculating it?
|
// this field isn't actually used, maybe we can skip calculating it?
|
||||||
nextRecord.url,
|
|
||||||
null,
|
|
||||||
"",
|
|
||||||
nextRecord.cookies,
|
nextRecord.cookies,
|
||||||
lastModified,
|
lastModified,
|
||||||
etag));
|
etag));
|
||||||
|
@ -4,7 +4,7 @@ import nu.marginalia.model.EdgeUrl;
|
|||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.jetbrains.annotations.Nullable;
|
import org.jetbrains.annotations.Nullable;
|
||||||
|
|
||||||
public class CrawledDocument implements SerializableCrawlData {
|
public final class CrawledDocument implements SerializableCrawlData {
|
||||||
public String crawlId;
|
public String crawlId;
|
||||||
|
|
||||||
public String url;
|
public String url;
|
||||||
@ -21,16 +21,6 @@ public class CrawledDocument implements SerializableCrawlData {
|
|||||||
|
|
||||||
public String documentBody;
|
public String documentBody;
|
||||||
|
|
||||||
@Deprecated
|
|
||||||
public String documentBodyHash;
|
|
||||||
|
|
||||||
@Deprecated
|
|
||||||
public String canonicalUrl;
|
|
||||||
public String redirectUrl;
|
|
||||||
|
|
||||||
@Deprecated
|
|
||||||
public String recrawlState;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This is not guaranteed to be set in all versions of the format,
|
* This is not guaranteed to be set in all versions of the format,
|
||||||
* information may come in CrawledDomain instead
|
* information may come in CrawledDomain instead
|
||||||
@ -40,7 +30,7 @@ public class CrawledDocument implements SerializableCrawlData {
|
|||||||
public String lastModifiedMaybe;
|
public String lastModifiedMaybe;
|
||||||
public String etagMaybe;
|
public String etagMaybe;
|
||||||
|
|
||||||
public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, String documentBody, String documentBodyHash, String canonicalUrl, String redirectUrl, String recrawlState, Boolean hasCookies, String lastModifiedMaybe, String etagMaybe) {
|
public CrawledDocument(String crawlId, String url, String contentType, String timestamp, int httpStatus, String crawlerStatus, String crawlerStatusDesc, @Nullable String headers, String documentBody, Boolean hasCookies, String lastModifiedMaybe, String etagMaybe) {
|
||||||
this.crawlId = crawlId;
|
this.crawlId = crawlId;
|
||||||
this.url = url;
|
this.url = url;
|
||||||
this.contentType = contentType;
|
this.contentType = contentType;
|
||||||
@ -50,10 +40,6 @@ public class CrawledDocument implements SerializableCrawlData {
|
|||||||
this.crawlerStatusDesc = crawlerStatusDesc;
|
this.crawlerStatusDesc = crawlerStatusDesc;
|
||||||
this.headers = headers;
|
this.headers = headers;
|
||||||
this.documentBody = documentBody;
|
this.documentBody = documentBody;
|
||||||
this.documentBodyHash = documentBodyHash;
|
|
||||||
this.canonicalUrl = canonicalUrl;
|
|
||||||
this.redirectUrl = redirectUrl;
|
|
||||||
this.recrawlState = recrawlState;
|
|
||||||
this.hasCookies = hasCookies;
|
this.hasCookies = hasCookies;
|
||||||
this.lastModifiedMaybe = lastModifiedMaybe;
|
this.lastModifiedMaybe = lastModifiedMaybe;
|
||||||
this.etagMaybe = etagMaybe;
|
this.etagMaybe = etagMaybe;
|
||||||
@ -120,7 +106,7 @@ public class CrawledDocument implements SerializableCrawlData {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "CrawledDocument(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBody=" + this.documentBody + ", documentBodyHash=" + this.documentBodyHash + ", canonicalUrl=" + this.canonicalUrl + ", redirectUrl=" + this.redirectUrl + ", recrawlState=" + this.recrawlState + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")";
|
return "CrawledDocument(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBody=" + this.documentBody + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")";
|
||||||
}
|
}
|
||||||
|
|
||||||
public static class CrawledDocumentBuilder {
|
public static class CrawledDocumentBuilder {
|
||||||
@ -133,9 +119,6 @@ public class CrawledDocument implements SerializableCrawlData {
|
|||||||
private String crawlerStatusDesc;
|
private String crawlerStatusDesc;
|
||||||
private @Nullable String headers;
|
private @Nullable String headers;
|
||||||
private String documentBody;
|
private String documentBody;
|
||||||
private String documentBodyHash;
|
|
||||||
private String canonicalUrl;
|
|
||||||
private String redirectUrl;
|
|
||||||
private String recrawlState;
|
private String recrawlState;
|
||||||
private Boolean hasCookies;
|
private Boolean hasCookies;
|
||||||
private String lastModifiedMaybe;
|
private String lastModifiedMaybe;
|
||||||
@ -189,23 +172,6 @@ public class CrawledDocument implements SerializableCrawlData {
|
|||||||
return this;
|
return this;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Deprecated
|
|
||||||
public CrawledDocumentBuilder documentBodyHash(String documentBodyHash) {
|
|
||||||
this.documentBodyHash = documentBodyHash;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Deprecated
|
|
||||||
public CrawledDocumentBuilder canonicalUrl(String canonicalUrl) {
|
|
||||||
this.canonicalUrl = canonicalUrl;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public CrawledDocumentBuilder redirectUrl(String redirectUrl) {
|
|
||||||
this.redirectUrl = redirectUrl;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Deprecated
|
@Deprecated
|
||||||
public CrawledDocumentBuilder recrawlState(String recrawlState) {
|
public CrawledDocumentBuilder recrawlState(String recrawlState) {
|
||||||
this.recrawlState = recrawlState;
|
this.recrawlState = recrawlState;
|
||||||
@ -228,11 +194,11 @@ public class CrawledDocument implements SerializableCrawlData {
|
|||||||
}
|
}
|
||||||
|
|
||||||
public CrawledDocument build() {
|
public CrawledDocument build() {
|
||||||
return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBody, this.documentBodyHash, this.canonicalUrl, this.redirectUrl, this.recrawlState, this.hasCookies, this.lastModifiedMaybe, this.etagMaybe);
|
return new CrawledDocument(this.crawlId, this.url, this.contentType, this.timestamp, this.httpStatus, this.crawlerStatus, this.crawlerStatusDesc, this.headers, this.documentBody, this.hasCookies, this.lastModifiedMaybe, this.etagMaybe);
|
||||||
}
|
}
|
||||||
|
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "CrawledDocument.CrawledDocumentBuilder(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBody=" + this.documentBody + ", documentBodyHash=" + this.documentBodyHash + ", canonicalUrl=" + this.canonicalUrl + ", redirectUrl=" + this.redirectUrl + ", recrawlState=" + this.recrawlState + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")";
|
return "CrawledDocument.CrawledDocumentBuilder(crawlId=" + this.crawlId + ", url=" + this.url + ", contentType=" + this.contentType + ", timestamp=" + this.timestamp + ", httpStatus=" + this.httpStatus + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", headers=" + this.headers + ", documentBody=" + this.documentBody + ", recrawlState=" + this.recrawlState + ", hasCookies=" + this.hasCookies + ", lastModifiedMaybe=" + this.lastModifiedMaybe + ", etagMaybe=" + this.etagMaybe + ")";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1,8 +1,9 @@
|
|||||||
package nu.marginalia.model.crawldata;
|
package nu.marginalia.model.crawldata;
|
||||||
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
public class CrawledDomain implements SerializableCrawlData {
|
public final class CrawledDomain implements SerializableCrawlData {
|
||||||
public String domain;
|
public String domain;
|
||||||
|
|
||||||
public String redirectDomain;
|
public String redirectDomain;
|
||||||
@ -11,6 +12,7 @@ public class CrawledDomain implements SerializableCrawlData {
|
|||||||
public String crawlerStatusDesc;
|
public String crawlerStatusDesc;
|
||||||
public String ip;
|
public String ip;
|
||||||
|
|
||||||
|
@Deprecated // This used to be populated, but is no longer
|
||||||
public List<CrawledDocument> doc;
|
public List<CrawledDocument> doc;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -29,15 +31,6 @@ public class CrawledDomain implements SerializableCrawlData {
|
|||||||
this.cookies = cookies;
|
this.cookies = cookies;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static CrawledDomainBuilder builder() {
|
|
||||||
return new CrawledDomainBuilder();
|
|
||||||
}
|
|
||||||
|
|
||||||
public int size() {
|
|
||||||
if (doc == null) return 0;
|
|
||||||
return doc.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
public String getDomain() {
|
public String getDomain() {
|
||||||
return this.domain;
|
return this.domain;
|
||||||
}
|
}
|
||||||
@ -94,119 +87,26 @@ public class CrawledDomain implements SerializableCrawlData {
|
|||||||
this.cookies = cookies;
|
this.cookies = cookies;
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean equals(final Object o) {
|
@Override
|
||||||
if (o == this) return true;
|
public boolean equals(Object o) {
|
||||||
if (!(o instanceof CrawledDomain)) return false;
|
if (!(o instanceof CrawledDomain that)) return false;
|
||||||
final CrawledDomain other = (CrawledDomain) o;
|
|
||||||
if (!other.canEqual((Object) this)) return false;
|
return Objects.equals(domain, that.domain) && Objects.equals(redirectDomain, that.redirectDomain) && Objects.equals(crawlerStatus, that.crawlerStatus) && Objects.equals(crawlerStatusDesc, that.crawlerStatusDesc) && Objects.equals(ip, that.ip) && Objects.equals(doc, that.doc) && Objects.equals(cookies, that.cookies);
|
||||||
final Object this$domain = this.getDomain();
|
|
||||||
final Object other$domain = other.getDomain();
|
|
||||||
if (this$domain == null ? other$domain != null : !this$domain.equals(other$domain)) return false;
|
|
||||||
final Object this$redirectDomain = this.getRedirectDomain();
|
|
||||||
final Object other$redirectDomain = other.getRedirectDomain();
|
|
||||||
if (this$redirectDomain == null ? other$redirectDomain != null : !this$redirectDomain.equals(other$redirectDomain))
|
|
||||||
return false;
|
|
||||||
final Object this$crawlerStatus = this.getCrawlerStatus();
|
|
||||||
final Object other$crawlerStatus = other.getCrawlerStatus();
|
|
||||||
if (this$crawlerStatus == null ? other$crawlerStatus != null : !this$crawlerStatus.equals(other$crawlerStatus))
|
|
||||||
return false;
|
|
||||||
final Object this$crawlerStatusDesc = this.getCrawlerStatusDesc();
|
|
||||||
final Object other$crawlerStatusDesc = other.getCrawlerStatusDesc();
|
|
||||||
if (this$crawlerStatusDesc == null ? other$crawlerStatusDesc != null : !this$crawlerStatusDesc.equals(other$crawlerStatusDesc))
|
|
||||||
return false;
|
|
||||||
final Object this$ip = this.getIp();
|
|
||||||
final Object other$ip = other.getIp();
|
|
||||||
if (this$ip == null ? other$ip != null : !this$ip.equals(other$ip)) return false;
|
|
||||||
final Object this$doc = this.getDoc();
|
|
||||||
final Object other$doc = other.getDoc();
|
|
||||||
if (this$doc == null ? other$doc != null : !this$doc.equals(other$doc)) return false;
|
|
||||||
final Object this$cookies = this.getCookies();
|
|
||||||
final Object other$cookies = other.getCookies();
|
|
||||||
if (this$cookies == null ? other$cookies != null : !this$cookies.equals(other$cookies)) return false;
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
protected boolean canEqual(final Object other) {
|
|
||||||
return other instanceof CrawledDomain;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
public int hashCode() {
|
public int hashCode() {
|
||||||
final int PRIME = 59;
|
int result = Objects.hashCode(domain);
|
||||||
int result = 1;
|
result = 31 * result + Objects.hashCode(redirectDomain);
|
||||||
final Object $domain = this.getDomain();
|
result = 31 * result + Objects.hashCode(crawlerStatus);
|
||||||
result = result * PRIME + ($domain == null ? 43 : $domain.hashCode());
|
result = 31 * result + Objects.hashCode(crawlerStatusDesc);
|
||||||
final Object $redirectDomain = this.getRedirectDomain();
|
result = 31 * result + Objects.hashCode(ip);
|
||||||
result = result * PRIME + ($redirectDomain == null ? 43 : $redirectDomain.hashCode());
|
result = 31 * result + Objects.hashCode(doc);
|
||||||
final Object $crawlerStatus = this.getCrawlerStatus();
|
result = 31 * result + Objects.hashCode(cookies);
|
||||||
result = result * PRIME + ($crawlerStatus == null ? 43 : $crawlerStatus.hashCode());
|
|
||||||
final Object $crawlerStatusDesc = this.getCrawlerStatusDesc();
|
|
||||||
result = result * PRIME + ($crawlerStatusDesc == null ? 43 : $crawlerStatusDesc.hashCode());
|
|
||||||
final Object $ip = this.getIp();
|
|
||||||
result = result * PRIME + ($ip == null ? 43 : $ip.hashCode());
|
|
||||||
final Object $doc = this.getDoc();
|
|
||||||
result = result * PRIME + ($doc == null ? 43 : $doc.hashCode());
|
|
||||||
final Object $cookies = this.getCookies();
|
|
||||||
result = result * PRIME + ($cookies == null ? 43 : $cookies.hashCode());
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "CrawledDomain(domain=" + this.getDomain() + ", redirectDomain=" + this.getRedirectDomain() + ", crawlerStatus=" + this.getCrawlerStatus() + ", crawlerStatusDesc=" + this.getCrawlerStatusDesc() + ", ip=" + this.getIp() + ", doc=" + this.getDoc() + ", cookies=" + this.getCookies() + ")";
|
return "CrawledDomain(domain=" + this.getDomain() + ", redirectDomain=" + this.getRedirectDomain() + ", crawlerStatus=" + this.getCrawlerStatus() + ", crawlerStatusDesc=" + this.getCrawlerStatusDesc() + ", ip=" + this.getIp() + ", doc=" + this.getDoc() + ", cookies=" + this.getCookies() + ")";
|
||||||
}
|
}
|
||||||
|
|
||||||
public static class CrawledDomainBuilder {
|
|
||||||
private String domain;
|
|
||||||
private String redirectDomain;
|
|
||||||
private String crawlerStatus;
|
|
||||||
private String crawlerStatusDesc;
|
|
||||||
private String ip;
|
|
||||||
private List<CrawledDocument> doc;
|
|
||||||
private List<String> cookies;
|
|
||||||
|
|
||||||
CrawledDomainBuilder() {
|
|
||||||
}
|
|
||||||
|
|
||||||
public CrawledDomainBuilder domain(String domain) {
|
|
||||||
this.domain = domain;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public CrawledDomainBuilder redirectDomain(String redirectDomain) {
|
|
||||||
this.redirectDomain = redirectDomain;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public CrawledDomainBuilder crawlerStatus(String crawlerStatus) {
|
|
||||||
this.crawlerStatus = crawlerStatus;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public CrawledDomainBuilder crawlerStatusDesc(String crawlerStatusDesc) {
|
|
||||||
this.crawlerStatusDesc = crawlerStatusDesc;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public CrawledDomainBuilder ip(String ip) {
|
|
||||||
this.ip = ip;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public CrawledDomainBuilder doc(List<CrawledDocument> doc) {
|
|
||||||
this.doc = doc;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public CrawledDomainBuilder cookies(List<String> cookies) {
|
|
||||||
this.cookies = cookies;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
|
|
||||||
public CrawledDomain build() {
|
|
||||||
return new CrawledDomain(this.domain, this.redirectDomain, this.crawlerStatus, this.crawlerStatusDesc, this.ip, this.doc, this.cookies);
|
|
||||||
}
|
|
||||||
|
|
||||||
public String toString() {
|
|
||||||
return "CrawledDomain.CrawledDomainBuilder(domain=" + this.domain + ", redirectDomain=" + this.redirectDomain + ", crawlerStatus=" + this.crawlerStatus + ", crawlerStatusDesc=" + this.crawlerStatusDesc + ", ip=" + this.ip + ", doc=" + this.doc + ", cookies=" + this.cookies + ")";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
package nu.marginalia.model.crawldata;
|
package nu.marginalia.model.crawldata;
|
||||||
|
|
||||||
public interface SerializableCrawlData {
|
public sealed interface SerializableCrawlData permits CrawledDocument, CrawledDomain {
|
||||||
String getDomain();
|
String getDomain();
|
||||||
}
|
}
|
||||||
|
@ -161,10 +161,6 @@ public class LiveCrawlDataSet implements AutoCloseable {
|
|||||||
"",
|
"",
|
||||||
headers,
|
headers,
|
||||||
body,
|
body,
|
||||||
body,
|
|
||||||
Integer.toString(body.hashCode()),
|
|
||||||
null,
|
|
||||||
"LIVE",
|
|
||||||
false,
|
false,
|
||||||
"",
|
"",
|
||||||
""
|
""
|
||||||
|
Loading…
Reference in New Issue
Block a user