mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-22 20:48:59 +00:00
(chore) Remove lombok
There are likely some instances of delombok gore with this commit.
This commit is contained in:
parent
a5b4951f23
commit
9f47ce8d15
@ -1,7 +1,6 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
id("org.jetbrains.gradle.plugin.idea-ext") version "1.0"
|
||||
id "io.freefair.lombok" version "8.3"
|
||||
id "me.champeau.jmh" version "0.6.6"
|
||||
|
||||
// This is a workaround for a bug in the Jib plugin that causes it to stall randomly
|
||||
@ -44,8 +43,8 @@ subprojects.forEach {it ->
|
||||
}
|
||||
|
||||
ext {
|
||||
jvmVersion=22
|
||||
dockerImageBase='container-registry.oracle.com/graalvm/jdk:22'
|
||||
jvmVersion=23
|
||||
dockerImageBase='container-registry.oracle.com/graalvm/jdk:23'
|
||||
dockerImageTag='latest'
|
||||
dockerImageRegistry='marginalia'
|
||||
jibVersion = '3.4.3'
|
||||
|
@ -1,10 +1,7 @@
|
||||
package nu.marginalia;
|
||||
|
||||
import lombok.Builder;
|
||||
|
||||
import java.nio.file.Path;
|
||||
|
||||
@Builder
|
||||
public class LanguageModels {
|
||||
public final Path termFrequencies;
|
||||
|
||||
@ -30,4 +27,64 @@ public class LanguageModels {
|
||||
this.fasttextLanguageModel = fasttextLanguageModel;
|
||||
this.segments = segments;
|
||||
}
|
||||
|
||||
public static LanguageModelsBuilder builder() {
|
||||
return new LanguageModelsBuilder();
|
||||
}
|
||||
|
||||
public static class LanguageModelsBuilder {
|
||||
private Path termFrequencies;
|
||||
private Path openNLPSentenceDetectionData;
|
||||
private Path posRules;
|
||||
private Path posDict;
|
||||
private Path openNLPTokenData;
|
||||
private Path fasttextLanguageModel;
|
||||
private Path segments;
|
||||
|
||||
LanguageModelsBuilder() {
|
||||
}
|
||||
|
||||
public LanguageModelsBuilder termFrequencies(Path termFrequencies) {
|
||||
this.termFrequencies = termFrequencies;
|
||||
return this;
|
||||
}
|
||||
|
||||
public LanguageModelsBuilder openNLPSentenceDetectionData(Path openNLPSentenceDetectionData) {
|
||||
this.openNLPSentenceDetectionData = openNLPSentenceDetectionData;
|
||||
return this;
|
||||
}
|
||||
|
||||
public LanguageModelsBuilder posRules(Path posRules) {
|
||||
this.posRules = posRules;
|
||||
return this;
|
||||
}
|
||||
|
||||
public LanguageModelsBuilder posDict(Path posDict) {
|
||||
this.posDict = posDict;
|
||||
return this;
|
||||
}
|
||||
|
||||
public LanguageModelsBuilder openNLPTokenData(Path openNLPTokenData) {
|
||||
this.openNLPTokenData = openNLPTokenData;
|
||||
return this;
|
||||
}
|
||||
|
||||
public LanguageModelsBuilder fasttextLanguageModel(Path fasttextLanguageModel) {
|
||||
this.fasttextLanguageModel = fasttextLanguageModel;
|
||||
return this;
|
||||
}
|
||||
|
||||
public LanguageModelsBuilder segments(Path segments) {
|
||||
this.segments = segments;
|
||||
return this;
|
||||
}
|
||||
|
||||
public LanguageModels build() {
|
||||
return new LanguageModels(this.termFrequencies, this.openNLPSentenceDetectionData, this.posRules, this.posDict, this.openNLPTokenData, this.fasttextLanguageModel, this.segments);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "LanguageModels.LanguageModelsBuilder(termFrequencies=" + this.termFrequencies + ", openNLPSentenceDetectionData=" + this.openNLPSentenceDetectionData + ", posRules=" + this.posRules + ", posDict=" + this.posDict + ", openNLPTokenData=" + this.openNLPTokenData + ", fasttextLanguageModel=" + this.fasttextLanguageModel + ", segments=" + this.segments + ")";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -7,12 +7,13 @@ import com.google.common.util.concurrent.UncheckedExecutionException;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Optional;
|
||||
import java.util.OptionalInt;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
|
||||
@Singleton
|
||||
public class DbDomainQueries {
|
||||
@ -27,7 +28,6 @@ public class DbDomainQueries {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public Integer getDomainId(EdgeDomain domain) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
@ -42,12 +42,14 @@ public class DbDomainQueries {
|
||||
throw new NoSuchElementException();
|
||||
});
|
||||
}
|
||||
catch (UncheckedExecutionException ex) {
|
||||
throw ex.getCause();
|
||||
catch (ExecutionException ex) {
|
||||
throw new RuntimeException(ex.getCause());
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public OptionalInt tryGetDomainId(EdgeDomain domain) {
|
||||
|
||||
Integer maybeId = domainIdCache.getIfPresent(domain);
|
||||
@ -70,11 +72,13 @@ public class DbDomainQueries {
|
||||
return OptionalInt.empty();
|
||||
}
|
||||
catch (UncheckedExecutionException ex) {
|
||||
return OptionalInt.empty();
|
||||
throw new RuntimeException(ex.getCause());
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public Optional<EdgeDomain> getDomain(int id) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
@ -87,5 +91,11 @@ public class DbDomainQueries {
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
catch (UncheckedExecutionException ex) {
|
||||
throw new RuntimeException(ex.getCause());
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.db;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.With;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -115,23 +114,23 @@ public class DomainRankingSetsService {
|
||||
}
|
||||
}
|
||||
|
||||
/** Defines a domain ranking set, parameters for the ranking algorithms.
|
||||
/**
|
||||
* Defines a domain ranking set, parameters for the ranking algorithms.
|
||||
*
|
||||
* @param name Key and name of the set
|
||||
* @param name Key and name of the set
|
||||
* @param description Human-readable description
|
||||
* @param depth Depth of the algorithm
|
||||
* @param definition Definition of the set, typically a list of domains or globs for domain-names
|
||||
* */
|
||||
@With
|
||||
* @param depth Depth of the algorithm
|
||||
* @param definition Definition of the set, typically a list of domains or globs for domain-names
|
||||
*/
|
||||
public record DomainRankingSet(String name,
|
||||
String description,
|
||||
int depth,
|
||||
String definition)
|
||||
{
|
||||
String definition) {
|
||||
|
||||
public Path fileName(Path base) {
|
||||
return base.resolve(name().toLowerCase() + ".dat");
|
||||
}
|
||||
|
||||
public String[] domains() {
|
||||
return Arrays.stream(definition().split("\n+"))
|
||||
.map(String::trim)
|
||||
@ -144,5 +143,20 @@ public class DomainRankingSetsService {
|
||||
return name().equals("BLOGS") || name().equals("NONE") || name().equals("RANK");
|
||||
}
|
||||
|
||||
public DomainRankingSet withName(String name) {
|
||||
return this.name == name ? this : new DomainRankingSet(name, description, depth, definition);
|
||||
}
|
||||
|
||||
public DomainRankingSet withDescription(String description) {
|
||||
return this.description == description ? this : new DomainRankingSet(name, description, depth, definition);
|
||||
}
|
||||
|
||||
public DomainRankingSet withDepth(int depth) {
|
||||
return this.depth == depth ? this : new DomainRankingSet(name, description, depth, definition);
|
||||
}
|
||||
|
||||
public DomainRankingSet withDefinition(String definition) {
|
||||
return this.definition == definition ? this : new DomainRankingSet(name, description, depth, definition);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,15 +1,11 @@
|
||||
package nu.marginalia.model;
|
||||
|
||||
import lombok.*;
|
||||
|
||||
import javax.annotation.Nonnull;
|
||||
import java.io.Serializable;
|
||||
import java.util.Objects;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@AllArgsConstructor
|
||||
@Getter @Setter @Builder
|
||||
public class EdgeDomain implements Serializable {
|
||||
|
||||
@Nonnull
|
||||
@ -17,7 +13,6 @@ public class EdgeDomain implements Serializable {
|
||||
@Nonnull
|
||||
public final String topDomain;
|
||||
|
||||
@SneakyThrows
|
||||
public EdgeDomain(String host) {
|
||||
Objects.requireNonNull(host, "domain name must not be null");
|
||||
|
||||
@ -34,28 +29,23 @@ public class EdgeDomain implements Serializable {
|
||||
if (dot < 0 || looksLikeAnIp(host)) { // IPV6 >.>
|
||||
subDomain = "";
|
||||
topDomain = host;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
int dot2 = host.substring(0, dot).lastIndexOf('.');
|
||||
if (dot2 < 0) {
|
||||
subDomain = "";
|
||||
topDomain = host;
|
||||
}
|
||||
else {
|
||||
if (looksLikeGovTld(host))
|
||||
{ // Capture .ac.jp, .co.uk
|
||||
} else {
|
||||
if (looksLikeGovTld(host)) { // Capture .ac.jp, .co.uk
|
||||
int dot3 = host.substring(0, dot2).lastIndexOf('.');
|
||||
if (dot3 >= 0) {
|
||||
dot2 = dot3;
|
||||
subDomain = host.substring(0, dot2);
|
||||
topDomain = host.substring(dot2 + 1);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
subDomain = "";
|
||||
topDomain = host;
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
subDomain = host.substring(0, dot2);
|
||||
topDomain = host.substring(dot2 + 1);
|
||||
}
|
||||
@ -64,6 +54,16 @@ public class EdgeDomain implements Serializable {
|
||||
}
|
||||
|
||||
private static final Predicate<String> govListTest = Pattern.compile(".*\\.(id|ac|co|org|gov|edu|com)\\.[a-z]{2}").asMatchPredicate();
|
||||
|
||||
public EdgeDomain(@Nonnull String subDomain, @Nonnull String topDomain) {
|
||||
this.subDomain = subDomain;
|
||||
this.topDomain = topDomain;
|
||||
}
|
||||
|
||||
public static EdgeDomainBuilder builder() {
|
||||
return new EdgeDomainBuilder();
|
||||
}
|
||||
|
||||
private boolean looksLikeGovTld(String host) {
|
||||
if (host.length() < 8)
|
||||
return false;
|
||||
@ -91,11 +91,11 @@ public class EdgeDomain implements Serializable {
|
||||
}
|
||||
|
||||
|
||||
|
||||
public EdgeUrl toRootUrlHttp() {
|
||||
// Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http
|
||||
return new EdgeUrl("http", this, null, "/", null);
|
||||
}
|
||||
|
||||
public EdgeUrl toRootUrlHttps() {
|
||||
return new EdgeUrl("https", this, null, "/", null);
|
||||
}
|
||||
@ -125,8 +125,7 @@ public class EdgeDomain implements Serializable {
|
||||
int cutPoint = topDomain.indexOf('.');
|
||||
if (cutPoint < 0) {
|
||||
ret.append(topDomain);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
ret.append(topDomain, 0, cutPoint);
|
||||
}
|
||||
|
||||
@ -155,16 +154,14 @@ public class EdgeDomain implements Serializable {
|
||||
|
||||
if (govListTest.test(topDomain)) {
|
||||
dot = topDomain.indexOf('.', Math.max(0, length - ".edu.uk".length()));
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
dot = topDomain.lastIndexOf('.');
|
||||
}
|
||||
|
||||
|
||||
if (dot < 0 || dot == topDomain.length() - 1) {
|
||||
return "-";
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
return topDomain.substring(dot + 1);
|
||||
}
|
||||
}
|
||||
@ -174,10 +171,10 @@ public class EdgeDomain implements Serializable {
|
||||
if (!(o instanceof EdgeDomain other)) return false;
|
||||
final String this$subDomain = this.getSubDomain();
|
||||
final String other$subDomain = other.getSubDomain();
|
||||
if (!Objects.equals(this$subDomain,other$subDomain)) return false;
|
||||
if (!Objects.equals(this$subDomain, other$subDomain)) return false;
|
||||
final String this$domain = this.getTopDomain();
|
||||
final String other$domain = other.getTopDomain();
|
||||
if (!Objects.equals(this$domain,other$domain)) return false;
|
||||
if (!Objects.equals(this$domain, other$domain)) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -191,4 +188,39 @@ public class EdgeDomain implements Serializable {
|
||||
return result;
|
||||
}
|
||||
|
||||
@Nonnull
|
||||
public String getSubDomain() {
|
||||
return this.subDomain;
|
||||
}
|
||||
|
||||
@Nonnull
|
||||
public String getTopDomain() {
|
||||
return this.topDomain;
|
||||
}
|
||||
|
||||
public static class EdgeDomainBuilder {
|
||||
private String subDomain;
|
||||
private String topDomain;
|
||||
|
||||
EdgeDomainBuilder() {
|
||||
}
|
||||
|
||||
public EdgeDomainBuilder subDomain(String subDomain) {
|
||||
this.subDomain = subDomain;
|
||||
return this;
|
||||
}
|
||||
|
||||
public EdgeDomainBuilder topDomain(String topDomain) {
|
||||
this.topDomain = topDomain;
|
||||
return this;
|
||||
}
|
||||
|
||||
public EdgeDomain build() {
|
||||
return new EdgeDomain(this.subDomain, this.topDomain);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "EdgeDomain.EdgeDomainBuilder(subDomain=" + this.subDomain + ", topDomain=" + this.topDomain + ")";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,8 +1,5 @@
|
||||
package nu.marginalia.model;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import nu.marginalia.util.QueryParams;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
@ -15,7 +12,6 @@ import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@Getter @Setter @Builder
|
||||
public class EdgeUrl implements Serializable {
|
||||
public final String proto;
|
||||
public final EdgeDomain domain;
|
||||
@ -38,8 +34,7 @@ public class EdgeUrl implements Serializable {
|
||||
private static URI parseURI(String url) throws URISyntaxException {
|
||||
try {
|
||||
return new URI(urlencodeFixer(url));
|
||||
}
|
||||
catch (URISyntaxException ex) {
|
||||
} catch (URISyntaxException ex) {
|
||||
throw new URISyntaxException("Failed to parse URI '" + url + "'", ex.getMessage());
|
||||
}
|
||||
}
|
||||
@ -83,20 +78,17 @@ public class EdgeUrl implements Serializable {
|
||||
for (int i = pathIdx; i < end; i++) {
|
||||
int c = url.charAt(i);
|
||||
|
||||
if (goodChars.indexOf(c) >= 0 || (c >= 'A' && c <='Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
|
||||
if (goodChars.indexOf(c) >= 0 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
|
||||
s.appendCodePoint(c);
|
||||
}
|
||||
else if (c == '%' && i+2<end) {
|
||||
int cn = url.charAt(i+1);
|
||||
int cnn = url.charAt(i+2);
|
||||
} else if (c == '%' && i + 2 < end) {
|
||||
int cn = url.charAt(i + 1);
|
||||
int cnn = url.charAt(i + 2);
|
||||
if (hexChars.indexOf(cn) >= 0 && hexChars.indexOf(cnn) >= 0) {
|
||||
s.appendCodePoint(c);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
s.append("%25");
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
s.append(String.format("%%%02X", c));
|
||||
}
|
||||
}
|
||||
@ -109,7 +101,7 @@ public class EdgeUrl implements Serializable {
|
||||
if (colonIdx < 0 || colonIdx + 2 >= url.length()) {
|
||||
throw new URISyntaxException(url, "Lacking protocol");
|
||||
}
|
||||
return url.indexOf('/', colonIdx+2);
|
||||
return url.indexOf('/', colonIdx + 2);
|
||||
}
|
||||
|
||||
public EdgeUrl(URI URI) {
|
||||
@ -125,8 +117,7 @@ public class EdgeUrl implements Serializable {
|
||||
this.proto = URI.getScheme().toLowerCase();
|
||||
this.port = port(URI.getPort(), proto);
|
||||
this.param = QueryParams.queryParamsSanitizer(this.path, URI.getQuery());
|
||||
}
|
||||
catch (Exception ex) {
|
||||
} catch (Exception ex) {
|
||||
System.err.println("Failed to parse " + URI);
|
||||
throw ex;
|
||||
}
|
||||
@ -145,8 +136,7 @@ public class EdgeUrl implements Serializable {
|
||||
this.proto = URL.getProtocol().toLowerCase();
|
||||
this.port = port(URL.getPort(), proto);
|
||||
this.param = QueryParams.queryParamsSanitizer(this.path, URL.getQuery());
|
||||
}
|
||||
catch (Exception ex) {
|
||||
} catch (Exception ex) {
|
||||
System.err.println("Failed to parse " + URL);
|
||||
throw ex;
|
||||
}
|
||||
@ -158,13 +148,16 @@ public class EdgeUrl implements Serializable {
|
||||
}
|
||||
if (protocol.equals("http") && port == 80) {
|
||||
return null;
|
||||
}
|
||||
else if (protocol.equals("https") && port == 443) {
|
||||
} else if (protocol.equals("https") && port == 443) {
|
||||
return null;
|
||||
}
|
||||
return port;
|
||||
}
|
||||
|
||||
public static EdgeUrlBuilder builder() {
|
||||
return new EdgeUrlBuilder();
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder(256);
|
||||
|
||||
@ -190,12 +183,13 @@ public class EdgeUrl implements Serializable {
|
||||
public String dir() {
|
||||
return path.replaceAll("/[^/]+$", "/");
|
||||
}
|
||||
|
||||
public String fileName() {
|
||||
return path.replaceAll(".*/", "");
|
||||
}
|
||||
|
||||
public int depth() {
|
||||
return (int) path.chars().filter(c -> c=='/').count();
|
||||
return (int) path.chars().filter(c -> c == '/').count();
|
||||
}
|
||||
|
||||
public EdgeUrl withPathAndParam(String path, String param) {
|
||||
@ -207,8 +201,8 @@ public class EdgeUrl implements Serializable {
|
||||
if (other == this) return true;
|
||||
if (other instanceof EdgeUrl e) {
|
||||
return Objects.equals(e.domain, domain)
|
||||
&& Objects.equals(e.path, path)
|
||||
&& Objects.equals(e.param, param);
|
||||
&& Objects.equals(e.path, path)
|
||||
&& Objects.equals(e.param, param);
|
||||
}
|
||||
|
||||
return true;
|
||||
@ -235,8 +229,7 @@ public class EdgeUrl implements Serializable {
|
||||
public URL asURL() throws MalformedURLException {
|
||||
try {
|
||||
return asURI().toURL();
|
||||
}
|
||||
catch (URISyntaxException e) {
|
||||
} catch (URISyntaxException e) {
|
||||
throw new MalformedURLException(e.getMessage());
|
||||
}
|
||||
}
|
||||
@ -248,4 +241,68 @@ public class EdgeUrl implements Serializable {
|
||||
|
||||
return new URI(this.proto, this.domain.toString(), this.path, this.param, null);
|
||||
}
|
||||
|
||||
public String getProto() {
|
||||
return this.proto;
|
||||
}
|
||||
|
||||
public EdgeDomain getDomain() {
|
||||
return this.domain;
|
||||
}
|
||||
|
||||
public Integer getPort() {
|
||||
return this.port;
|
||||
}
|
||||
|
||||
public String getPath() {
|
||||
return this.path;
|
||||
}
|
||||
|
||||
public String getParam() {
|
||||
return this.param;
|
||||
}
|
||||
|
||||
public static class EdgeUrlBuilder {
|
||||
private String proto;
|
||||
private EdgeDomain domain;
|
||||
private Integer port;
|
||||
private String path;
|
||||
private String param;
|
||||
|
||||
EdgeUrlBuilder() {
|
||||
}
|
||||
|
||||
public EdgeUrlBuilder proto(String proto) {
|
||||
this.proto = proto;
|
||||
return this;
|
||||
}
|
||||
|
||||
public EdgeUrlBuilder domain(EdgeDomain domain) {
|
||||
this.domain = domain;
|
||||
return this;
|
||||
}
|
||||
|
||||
public EdgeUrlBuilder port(Integer port) {
|
||||
this.port = port;
|
||||
return this;
|
||||
}
|
||||
|
||||
public EdgeUrlBuilder path(String path) {
|
||||
this.path = path;
|
||||
return this;
|
||||
}
|
||||
|
||||
public EdgeUrlBuilder param(String param) {
|
||||
this.param = param;
|
||||
return this;
|
||||
}
|
||||
|
||||
public EdgeUrl build() {
|
||||
return new EdgeUrl(this.proto, this.domain, this.port, this.path, this.param);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "EdgeUrl.EdgeUrlBuilder(proto=" + this.proto + ", domain=" + this.domain + ", port=" + this.port + ", path=" + this.path + ", param=" + this.param + ")";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,8 +1,8 @@
|
||||
package nu.marginalia.process.log;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Iterator;
|
||||
@ -21,32 +21,36 @@ class WorkLoadIterable<T> implements Iterable<T> {
|
||||
|
||||
@NotNull
|
||||
@Override
|
||||
@SneakyThrows
|
||||
public Iterator<T> iterator() {
|
||||
var stream = Files.lines(logFile);
|
||||
return new Iterator<>() {
|
||||
final Iterator<T> iter = stream
|
||||
.filter(WorkLogEntry::isJobId)
|
||||
.map(WorkLogEntry::parse)
|
||||
.map(mapper)
|
||||
.filter(Optional::isPresent)
|
||||
.map(Optional::get)
|
||||
.iterator();
|
||||
try {
|
||||
var stream = Files.lines(logFile);
|
||||
return new Iterator<>() {
|
||||
final Iterator<T> iter = stream
|
||||
.filter(WorkLogEntry::isJobId)
|
||||
.map(WorkLogEntry::parse)
|
||||
.map(mapper)
|
||||
.filter(Optional::isPresent)
|
||||
.map(Optional::get)
|
||||
.iterator();
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
if (iter.hasNext()) {
|
||||
return true;
|
||||
} else {
|
||||
stream.close();
|
||||
return false;
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
if (iter.hasNext()) {
|
||||
return true;
|
||||
} else {
|
||||
stream.close();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public T next() {
|
||||
return iter.next();
|
||||
}
|
||||
};
|
||||
@Override
|
||||
public T next() {
|
||||
return iter.next();
|
||||
}
|
||||
};
|
||||
}
|
||||
catch (IOException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -4,12 +4,12 @@ import com.github.jknack.handlebars.*;
|
||||
import com.github.jknack.handlebars.helper.ConditionalHelpers;
|
||||
import com.github.jknack.handlebars.io.ClassPathTemplateLoader;
|
||||
import com.github.jknack.handlebars.io.TemplateLoader;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.renderer.config.HandlebarsConfigurator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.*;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
@ -42,22 +42,35 @@ public class MustacheRenderer<T> {
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public String render(T model) {
|
||||
return template.apply(model);
|
||||
try {
|
||||
return template.apply(model);
|
||||
}
|
||||
catch (IOException ex) {
|
||||
throw new RuntimeException("Failed to render template", ex);
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public <T2> String render(T model, String name, List<T2> children) {
|
||||
Context ctx = Context.newBuilder(model).combine(name, children).build();
|
||||
|
||||
return template.apply(ctx);
|
||||
try {
|
||||
return template.apply(ctx);
|
||||
}
|
||||
catch (IOException ex) {
|
||||
throw new RuntimeException("Failed to render template", ex);
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public String render(T model, Map<String, ?> children) {
|
||||
Context ctx = Context.newBuilder(model).combine(children).build();
|
||||
return template.apply(ctx);
|
||||
|
||||
try {
|
||||
return template.apply(ctx);
|
||||
}
|
||||
catch (IOException ex) {
|
||||
throw new RuntimeException("Failed to render template", ex);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.service;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -26,7 +25,6 @@ public class NodeConfigurationWatcher {
|
||||
watcherThread.start();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private void pollConfiguration() {
|
||||
for (;;) {
|
||||
List<Integer> goodNodes = new ArrayList<>();
|
||||
@ -34,7 +32,7 @@ public class NodeConfigurationWatcher {
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT ID FROM NODE_CONFIGURATION
|
||||
WHERE ACCEPT_QUERIES AND NOT DISABLED
|
||||
WHERE ACCEPT_QUERIES AND NOT DISABLED
|
||||
""");
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
@ -47,7 +45,12 @@ public class NodeConfigurationWatcher {
|
||||
|
||||
queryNodes = goodNodes;
|
||||
|
||||
TimeUnit.SECONDS.sleep(10);
|
||||
try {
|
||||
TimeUnit.SECONDS.sleep(10);
|
||||
}
|
||||
catch (InterruptedException ex) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -4,13 +4,13 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import io.grpc.ManagedChannel;
|
||||
import io.grpc.ManagedChannelBuilder;
|
||||
import nu.marginalia.util.NamedExecutorFactory;
|
||||
import nu.marginalia.service.NodeConfigurationWatcher;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.discovery.property.PartitionTraits;
|
||||
import nu.marginalia.service.discovery.property.ServiceEndpoint.InstanceAddress;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import nu.marginalia.util.NamedExecutorFactory;
|
||||
|
||||
import java.util.concurrent.Executor;
|
||||
import java.util.function.Function;
|
||||
@ -48,7 +48,12 @@ public class GrpcChannelPoolFactory {
|
||||
public <STUB> GrpcSingleNodeChannelPool<STUB> createSingle(ServiceKey<? extends PartitionTraits.Unicast> key,
|
||||
Function<ManagedChannel, STUB> stubConstructor)
|
||||
{
|
||||
return new GrpcSingleNodeChannelPool<>(serviceRegistryIf, key, this::createChannel, stubConstructor);
|
||||
try {
|
||||
return new GrpcSingleNodeChannelPool<>(serviceRegistryIf, key, this::createChannel, stubConstructor);
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private ManagedChannel createChannel(InstanceAddress route) {
|
||||
|
@ -1,7 +1,6 @@
|
||||
package nu.marginalia.service.client;
|
||||
|
||||
import io.grpc.ManagedChannel;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.service.NodeConfigurationWatcher;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.discovery.property.PartitionTraits;
|
||||
@ -12,7 +11,10 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.concurrent.*;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.Executor;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.function.BiFunction;
|
||||
import java.util.function.Function;
|
||||
|
||||
@ -29,7 +31,6 @@ public class GrpcMultiNodeChannelPool<STUB> {
|
||||
private final Function<ManagedChannel, STUB> stubConstructor;
|
||||
private final NodeConfigurationWatcher nodeConfigurationWatcher;
|
||||
|
||||
@SneakyThrows
|
||||
public GrpcMultiNodeChannelPool(ServiceRegistryIf serviceRegistryIf,
|
||||
ServiceKey<ServicePartition.Multi> serviceKey,
|
||||
Function<ServiceEndpoint.InstanceAddress, ManagedChannel> channelConstructor,
|
||||
@ -52,11 +53,16 @@ public class GrpcMultiNodeChannelPool<STUB> {
|
||||
}
|
||||
|
||||
private GrpcSingleNodeChannelPool<STUB> newSingleChannelPool(int node) {
|
||||
return new GrpcSingleNodeChannelPool<>(
|
||||
serviceRegistryIf,
|
||||
serviceKey.forPartition(ServicePartition.partition(node)),
|
||||
channelConstructor,
|
||||
stubConstructor);
|
||||
try {
|
||||
return new GrpcSingleNodeChannelPool<>(
|
||||
serviceRegistryIf,
|
||||
serviceKey.forPartition(ServicePartition.partition(node)),
|
||||
channelConstructor,
|
||||
stubConstructor);
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/** Get the list of nodes that are eligible for broadcast-style requests */
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.service.client;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import io.grpc.ManagedChannel;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.discovery.monitor.ServiceChangeMonitor;
|
||||
import nu.marginalia.service.discovery.property.PartitionTraits;
|
||||
@ -34,11 +33,12 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
|
||||
private final Function<ManagedChannel, STUB> stubConstructor;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public GrpcSingleNodeChannelPool(ServiceRegistryIf serviceRegistryIf,
|
||||
ServiceKey<? extends PartitionTraits.Unicast> serviceKey,
|
||||
Function<InstanceAddress, ManagedChannel> channelConstructor,
|
||||
Function<ManagedChannel, STUB> stubConstructor) {
|
||||
Function<ManagedChannel, STUB> stubConstructor)
|
||||
throws Exception
|
||||
{
|
||||
super(serviceKey);
|
||||
|
||||
this.serviceRegistryIf = serviceRegistryIf;
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.service.discovery;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.service.discovery.monitor.ServiceMonitorIf;
|
||||
import nu.marginalia.service.discovery.property.ServiceEndpoint;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
@ -40,18 +39,22 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
|
||||
private final List<String> livenessPaths = new ArrayList<>();
|
||||
|
||||
@Inject
|
||||
@SneakyThrows
|
||||
public ZkServiceRegistry(CuratorFramework curatorFramework) {
|
||||
this.curatorFramework = curatorFramework;
|
||||
try {
|
||||
this.curatorFramework = curatorFramework;
|
||||
|
||||
curatorFramework.start();
|
||||
if (!curatorFramework.blockUntilConnected(30, TimeUnit.SECONDS)) {
|
||||
throw new IllegalStateException("Failed to connect to zookeeper after 30s");
|
||||
curatorFramework.start();
|
||||
if (!curatorFramework.blockUntilConnected(30, TimeUnit.SECONDS)) {
|
||||
throw new IllegalStateException("Failed to connect to zookeeper after 30s");
|
||||
}
|
||||
|
||||
Runtime.getRuntime().addShutdownHook(
|
||||
new Thread(this::shutDown, "ZkServiceRegistry shutdown hook")
|
||||
);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException("Failed to start ZkServiceRegistry", ex);
|
||||
}
|
||||
|
||||
Runtime.getRuntime().addShutdownHook(
|
||||
new Thread(this::shutDown, "ZkServiceRegistry shutdown hook")
|
||||
);
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -75,14 +78,18 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
|
||||
return endpoint;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public void declareFirstBoot() {
|
||||
if (!isFirstBoot()) {
|
||||
curatorFramework.create()
|
||||
.creatingParentsIfNeeded()
|
||||
.withMode(CreateMode.PERSISTENT)
|
||||
.forPath("/first-boot");
|
||||
try {
|
||||
curatorFramework.create()
|
||||
.creatingParentsIfNeeded()
|
||||
.withMode(CreateMode.PERSISTENT)
|
||||
.forPath("/first-boot");
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Failed to declare first-boot", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -5,14 +5,12 @@ import com.google.inject.Provides;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariConfig;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import org.flywaydb.core.Flyway;
|
||||
import org.mariadb.jdbc.Driver;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.sql.DataSource;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
@ -71,14 +69,12 @@ public class DatabaseModule extends AbstractModule {
|
||||
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Singleton
|
||||
@Provides
|
||||
public HikariDataSource provideConnection() {
|
||||
return getMariaDB();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private HikariDataSource getMariaDB() {
|
||||
var connStr = System.getProperty("db.overrideJdbc", dbProperties.getProperty(DB_CONN_KEY));
|
||||
|
||||
|
@ -1,7 +1,6 @@
|
||||
package nu.marginalia.service.server;
|
||||
|
||||
import com.google.inject.Singleton;
|
||||
import lombok.SneakyThrows;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -59,13 +58,17 @@ public class Initialization {
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public boolean waitReady() {
|
||||
synchronized (this) {
|
||||
while (!initialized) {
|
||||
wait();
|
||||
try {
|
||||
synchronized (this) {
|
||||
while (!initialized) {
|
||||
wait();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
catch (InterruptedException ex) {
|
||||
throw new RuntimeException("Interrupted while waiting for initialization", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.service.server;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import io.prometheus.client.exporter.MetricsServlet;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import org.eclipse.jetty.server.Server;
|
||||
import org.eclipse.jetty.servlet.ServletContextHandler;
|
||||
@ -10,9 +9,8 @@ import org.eclipse.jetty.servlet.ServletHolder;
|
||||
|
||||
public class MetricsServer {
|
||||
|
||||
@SneakyThrows
|
||||
@Inject
|
||||
public MetricsServer(ServiceConfiguration configuration) {
|
||||
public MetricsServer(ServiceConfiguration configuration) throws Exception {
|
||||
// If less than zero, we forego setting up a metrics server
|
||||
if (configuration.metricsPort() < 0)
|
||||
return;
|
||||
|
@ -1,8 +1,7 @@
|
||||
package nu.marginalia.service.server;
|
||||
|
||||
import com.google.inject.name.Named;
|
||||
import com.google.inject.Inject;
|
||||
import lombok.SneakyThrows;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.nodecfg.NodeConfigurationService;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
@ -81,10 +80,14 @@ public class NodeStatusWatcher {
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private boolean isConfigured() {
|
||||
var configuration = configurationService.get(nodeId);
|
||||
return configuration != null;
|
||||
try {
|
||||
var configuration = configurationService.get(nodeId);
|
||||
return configuration != null;
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
/** Look for changes in the configuration and kill the service if the corresponding
|
||||
|
@ -1,7 +1,6 @@
|
||||
package nu.marginalia.service.server;
|
||||
|
||||
import io.prometheus.client.Counter;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.mq.inbox.MqInboxIf;
|
||||
import nu.marginalia.service.client.ServiceNotAvailableException;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
@ -44,11 +43,10 @@ public class Service {
|
||||
private final int node;
|
||||
private GrpcServer grpcServer;
|
||||
|
||||
@SneakyThrows
|
||||
public Service(BaseServiceParams params,
|
||||
Runnable configureStaticFiles,
|
||||
ServicePartition partition,
|
||||
List<DiscoverableService> grpcServices) {
|
||||
List<DiscoverableService> grpcServices) throws Exception {
|
||||
|
||||
this.initialization = params.initialization;
|
||||
var config = params.configuration;
|
||||
@ -130,14 +128,14 @@ public class Service {
|
||||
|
||||
public Service(BaseServiceParams params,
|
||||
ServicePartition partition,
|
||||
List<DiscoverableService> grpcServices) {
|
||||
List<DiscoverableService> grpcServices) throws Exception {
|
||||
this(params,
|
||||
Service::defaultSparkConfig,
|
||||
partition,
|
||||
grpcServices);
|
||||
}
|
||||
|
||||
public Service(BaseServiceParams params) {
|
||||
public Service(BaseServiceParams params) throws Exception {
|
||||
this(params,
|
||||
Service::defaultSparkConfig,
|
||||
ServicePartition.any(),
|
||||
|
@ -1,20 +1,18 @@
|
||||
package nu.marginalia.service.server;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
import spark.Spark;
|
||||
import spark.resource.ClassPathResource;
|
||||
import spark.staticfiles.MimeType;
|
||||
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.ZoneOffset;
|
||||
|
||||
public class StaticResources {
|
||||
private final long startTime = LocalDateTime.now().toEpochSecond(ZoneOffset.UTC);
|
||||
|
||||
@SneakyThrows
|
||||
public void serveStatic(String domain, String path, Request req, Response rsp) {
|
||||
try {
|
||||
if (path.startsWith("..") || domain.startsWith("..")) {
|
||||
@ -28,7 +26,7 @@ public class StaticResources {
|
||||
|
||||
resource.getInputStream().transferTo(rsp.raw().getOutputStream());
|
||||
}
|
||||
catch (IllegalArgumentException | FileNotFoundException ex) {
|
||||
catch (IllegalArgumentException | IOException ex) {
|
||||
Spark.halt(404);
|
||||
}
|
||||
}
|
||||
@ -57,7 +55,6 @@ public class StaticResources {
|
||||
return "application/octet-stream";
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private void handleEtagStatic(ClassPathResource resource, Request req, Response rsp) {
|
||||
rsp.header("Cache-Control", "public,max-age=3600");
|
||||
rsp.type(MimeType.fromResource(resource));
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.extractor;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import gnu.trove.set.hash.TLongHashSet;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.hash.MurmurHash3_128;
|
||||
import nu.marginalia.io.CrawledDomainReader;
|
||||
import nu.marginalia.io.SerializableCrawlDataStream;
|
||||
@ -101,10 +100,14 @@ public class AtagExporter implements ExporterIf {
|
||||
continue;
|
||||
}
|
||||
|
||||
var linkOpt = linkParser.parseLinkPermissive(baseUrl, atag);
|
||||
linkOpt
|
||||
.filter(url -> linkFilter.isEligible(url, baseUrl, linkText))
|
||||
.ifPresent(url -> exporter.accept(url, baseUrl.domain, linkText));
|
||||
var linkOpt = linkParser
|
||||
.parseLinkPermissive(baseUrl, atag)
|
||||
.filter(url -> linkFilter.isEligible(url, baseUrl, linkText));
|
||||
|
||||
if (linkOpt.isPresent()) {
|
||||
var url = linkOpt.get();
|
||||
exporter.accept(url, baseUrl.domain, linkText);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -167,8 +170,7 @@ public class AtagExporter implements ExporterIf {
|
||||
this.writer = writer;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void accept(EdgeUrl url, EdgeDomain sourceDomain, String linkText) {
|
||||
public void accept(EdgeUrl url, EdgeDomain sourceDomain, String linkText) throws IOException {
|
||||
final String urlString = urlWithNoSchema(url);
|
||||
|
||||
writer.write(String.format("\"%s\",\"%s\",\"%s\"\n",
|
||||
|
@ -1,7 +1,6 @@
|
||||
package nu.marginalia.extractor;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.io.CrawledDomainReader;
|
||||
import nu.marginalia.io.SerializableCrawlDataStream;
|
||||
import nu.marginalia.link_parser.FeedExtractor;
|
||||
@ -115,12 +114,16 @@ public class FeedExporter implements ExporterIf {
|
||||
this.writer = writer;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void accept(EdgeDomain domain, int size, EdgeUrl path) {
|
||||
writer.write(String.format("\"%s\",\"%s\",\"%s\"\n",
|
||||
csvify(domain),
|
||||
csvify(size),
|
||||
csvify(path)));
|
||||
try {
|
||||
writer.write(String.format("\"%s\",\"%s\",\"%s\"\n",
|
||||
csvify(domain),
|
||||
csvify(size),
|
||||
csvify(path)));
|
||||
}
|
||||
catch (IOException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
private static String csvify(Object field) {
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.actor;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.actor.monitor.FileStorageMonitorActor;
|
||||
import nu.marginalia.actor.proc.*;
|
||||
import nu.marginalia.actor.prototype.ActorPrototype;
|
||||
@ -13,6 +12,8 @@ import nu.marginalia.actor.task.*;
|
||||
import nu.marginalia.mq.MessageQueueFactory;
|
||||
import nu.marginalia.service.control.ServiceEventLog;
|
||||
import nu.marginalia.service.server.BaseServiceParams;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
@ -27,6 +28,8 @@ public class ExecutorActorControlService {
|
||||
public Map<ExecutorActor, ActorPrototype> actorDefinitions = new HashMap<>();
|
||||
private final int node;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@Inject
|
||||
public ExecutorActorControlService(MessageQueueFactory messageQueueFactory,
|
||||
BaseServiceParams baseServiceParams,
|
||||
@ -119,11 +122,15 @@ public class ExecutorActorControlService {
|
||||
stateMachines.startFromJSON(process, state, json);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void stop(ExecutorActor process) {
|
||||
eventLog.logEvent("FSM-STOP", process.id());
|
||||
|
||||
stateMachines.stop(process);
|
||||
try {
|
||||
stateMachines.stop(process);
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Failed to stop FSM", e);
|
||||
}
|
||||
}
|
||||
|
||||
public Map<ExecutorActor, ActorStateInstance> getActorStates() {
|
||||
|
@ -3,9 +3,6 @@ package nu.marginalia.actor.task;
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.With;
|
||||
import nu.marginalia.IndexLocations;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorResumeBehavior;
|
||||
@ -40,7 +37,6 @@ import java.util.List;
|
||||
public class ConvertAndLoadActor extends RecordActorPrototype {
|
||||
|
||||
// STATES
|
||||
|
||||
public static final String RERANK = "RERANK";
|
||||
private final ActorProcessWatcher processWatcher;
|
||||
private final MqOutbox mqConverterOutbox;
|
||||
@ -54,15 +50,6 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
|
||||
private final int nodeId;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
|
||||
@AllArgsConstructor @With @NoArgsConstructor
|
||||
public static class Message {
|
||||
public FileStorageId crawlStorageId = null;
|
||||
public List<FileStorageId> processedStorageId = null;
|
||||
public long converterMsgId = 0L;
|
||||
public long loaderMsgId = 0L;
|
||||
}
|
||||
|
||||
public record Initial(FileStorageId fid) implements ActorStep {}
|
||||
|
||||
@Resume(behavior = ActorResumeBehavior.RETRY)
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.execution;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import io.grpc.stub.StreamObserver;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.actor.ActorApi;
|
||||
import nu.marginalia.actor.ExecutorActor;
|
||||
@ -228,13 +227,17 @@ public class ExecutorGrpcService
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private RpcFileStorageEntry createFileModel(Path path) {
|
||||
return RpcFileStorageEntry.newBuilder()
|
||||
.setName(path.toFile().getName())
|
||||
.setSize(Files.size(path))
|
||||
.setLastModifiedTime(Files.getLastModifiedTime(path).toInstant().toString())
|
||||
.build();
|
||||
try {
|
||||
return RpcFileStorageEntry.newBuilder()
|
||||
.setName(path.toFile().getName())
|
||||
.setSize(Files.size(path))
|
||||
.setLastModifiedTime(Files.getLastModifiedTime(path).toInstant().toString())
|
||||
.build();
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -3,7 +3,6 @@ package nu.marginalia.screenshot;
|
||||
import com.google.common.base.Strings;
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.slf4j.Logger;
|
||||
@ -11,6 +10,7 @@ import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
|
||||
import static java.lang.Integer.parseInt;
|
||||
@ -48,7 +48,6 @@ public class ScreenshotService {
|
||||
return false;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public Object serveScreenshotRequest(Request request, Response response) {
|
||||
if (Strings.isNullOrEmpty(request.params("id"))) {
|
||||
response.redirect("https://search.marginalia.nu/");
|
||||
@ -75,6 +74,9 @@ public class ScreenshotService {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
catch (IOException ex) {
|
||||
logger.warn("IO error", ex);
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.warn("SQL error", ex);
|
||||
}
|
||||
|
@ -1,9 +1,9 @@
|
||||
package nu.marginalia.api.domains;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.api.domains.model.DomainInformation;
|
||||
import nu.marginalia.api.domains.model.SimilarDomain;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.api.domains.model.*;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@ -28,18 +28,22 @@ public class DomainsProtobufCodec {
|
||||
return ret;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private static SimilarDomain convertResponseEntry(RpcSimilarDomain sd) {
|
||||
return new SimilarDomain(
|
||||
new EdgeUrl(sd.getUrl()),
|
||||
sd.getDomainId(),
|
||||
sd.getRelatedness(),
|
||||
sd.getRank(),
|
||||
sd.getIndexed(),
|
||||
sd.getActive(),
|
||||
sd.getScreenshot(),
|
||||
SimilarDomain.LinkType.valueOf(sd.getLinkType().name())
|
||||
);
|
||||
try {
|
||||
return new SimilarDomain(
|
||||
new EdgeUrl(sd.getUrl()),
|
||||
sd.getDomainId(),
|
||||
sd.getRelatedness(),
|
||||
sd.getRank(),
|
||||
sd.getIndexed(),
|
||||
sd.getActive(),
|
||||
sd.getScreenshot(),
|
||||
SimilarDomain.LinkType.valueOf(sd.getLinkType().name())
|
||||
);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,10 +1,7 @@
|
||||
package nu.marginalia.api.domains.model;
|
||||
|
||||
import lombok.*;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
|
||||
@Getter @AllArgsConstructor @NoArgsConstructor @Builder
|
||||
@ToString
|
||||
public class DomainInformation {
|
||||
EdgeDomain domain;
|
||||
|
||||
@ -29,6 +26,34 @@ public class DomainInformation {
|
||||
String ipCountry;
|
||||
String state;
|
||||
|
||||
public DomainInformation(EdgeDomain domain, boolean blacklisted, int pagesKnown, int pagesFetched, int pagesIndexed, int incomingLinks, int outboundLinks, int nodeAffinity, double ranking, boolean suggestForCrawling, boolean inCrawlQueue, boolean unknownDomain, String ip, Integer asn, String asnOrg, String asnCountry, String ipCountry, String state) {
|
||||
this.domain = domain;
|
||||
this.blacklisted = blacklisted;
|
||||
this.pagesKnown = pagesKnown;
|
||||
this.pagesFetched = pagesFetched;
|
||||
this.pagesIndexed = pagesIndexed;
|
||||
this.incomingLinks = incomingLinks;
|
||||
this.outboundLinks = outboundLinks;
|
||||
this.nodeAffinity = nodeAffinity;
|
||||
this.ranking = ranking;
|
||||
this.suggestForCrawling = suggestForCrawling;
|
||||
this.inCrawlQueue = inCrawlQueue;
|
||||
this.unknownDomain = unknownDomain;
|
||||
this.ip = ip;
|
||||
this.asn = asn;
|
||||
this.asnOrg = asnOrg;
|
||||
this.asnCountry = asnCountry;
|
||||
this.ipCountry = ipCountry;
|
||||
this.state = state;
|
||||
}
|
||||
|
||||
public DomainInformation() {
|
||||
}
|
||||
|
||||
public static DomainInformationBuilder builder() {
|
||||
return new DomainInformationBuilder();
|
||||
}
|
||||
|
||||
public String getIpFlag() {
|
||||
if (ipCountry == null || ipCountry.codePointCount(0, ipCountry.length()) != 2) {
|
||||
return "";
|
||||
@ -45,4 +70,202 @@ public class DomainInformation {
|
||||
int secondChar = Character.codePointAt(country, 1) - asciiOffset + offset;
|
||||
return new String(Character.toChars(firstChar)) + new String(Character.toChars(secondChar));
|
||||
}
|
||||
|
||||
public EdgeDomain getDomain() {
|
||||
return this.domain;
|
||||
}
|
||||
|
||||
public boolean isBlacklisted() {
|
||||
return this.blacklisted;
|
||||
}
|
||||
|
||||
public int getPagesKnown() {
|
||||
return this.pagesKnown;
|
||||
}
|
||||
|
||||
public int getPagesFetched() {
|
||||
return this.pagesFetched;
|
||||
}
|
||||
|
||||
public int getPagesIndexed() {
|
||||
return this.pagesIndexed;
|
||||
}
|
||||
|
||||
public int getIncomingLinks() {
|
||||
return this.incomingLinks;
|
||||
}
|
||||
|
||||
public int getOutboundLinks() {
|
||||
return this.outboundLinks;
|
||||
}
|
||||
|
||||
public int getNodeAffinity() {
|
||||
return this.nodeAffinity;
|
||||
}
|
||||
|
||||
public double getRanking() {
|
||||
return this.ranking;
|
||||
}
|
||||
|
||||
public boolean isSuggestForCrawling() {
|
||||
return this.suggestForCrawling;
|
||||
}
|
||||
|
||||
public boolean isInCrawlQueue() {
|
||||
return this.inCrawlQueue;
|
||||
}
|
||||
|
||||
public boolean isUnknownDomain() {
|
||||
return this.unknownDomain;
|
||||
}
|
||||
|
||||
public String getIp() {
|
||||
return this.ip;
|
||||
}
|
||||
|
||||
public Integer getAsn() {
|
||||
return this.asn;
|
||||
}
|
||||
|
||||
public String getAsnOrg() {
|
||||
return this.asnOrg;
|
||||
}
|
||||
|
||||
public String getAsnCountry() {
|
||||
return this.asnCountry;
|
||||
}
|
||||
|
||||
public String getIpCountry() {
|
||||
return this.ipCountry;
|
||||
}
|
||||
|
||||
public String getState() {
|
||||
return this.state;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "DomainInformation(domain=" + this.getDomain() + ", blacklisted=" + this.isBlacklisted() + ", pagesKnown=" + this.getPagesKnown() + ", pagesFetched=" + this.getPagesFetched() + ", pagesIndexed=" + this.getPagesIndexed() + ", incomingLinks=" + this.getIncomingLinks() + ", outboundLinks=" + this.getOutboundLinks() + ", nodeAffinity=" + this.getNodeAffinity() + ", ranking=" + this.getRanking() + ", suggestForCrawling=" + this.isSuggestForCrawling() + ", inCrawlQueue=" + this.isInCrawlQueue() + ", unknownDomain=" + this.isUnknownDomain() + ", ip=" + this.getIp() + ", asn=" + this.getAsn() + ", asnOrg=" + this.getAsnOrg() + ", asnCountry=" + this.getAsnCountry() + ", ipCountry=" + this.getIpCountry() + ", state=" + this.getState() + ")";
|
||||
}
|
||||
|
||||
public static class DomainInformationBuilder {
|
||||
private EdgeDomain domain;
|
||||
private boolean blacklisted;
|
||||
private int pagesKnown;
|
||||
private int pagesFetched;
|
||||
private int pagesIndexed;
|
||||
private int incomingLinks;
|
||||
private int outboundLinks;
|
||||
private int nodeAffinity;
|
||||
private double ranking;
|
||||
private boolean suggestForCrawling;
|
||||
private boolean inCrawlQueue;
|
||||
private boolean unknownDomain;
|
||||
private String ip;
|
||||
private Integer asn;
|
||||
private String asnOrg;
|
||||
private String asnCountry;
|
||||
private String ipCountry;
|
||||
private String state;
|
||||
|
||||
DomainInformationBuilder() {
|
||||
}
|
||||
|
||||
public DomainInformationBuilder domain(EdgeDomain domain) {
|
||||
this.domain = domain;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder blacklisted(boolean blacklisted) {
|
||||
this.blacklisted = blacklisted;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder pagesKnown(int pagesKnown) {
|
||||
this.pagesKnown = pagesKnown;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder pagesFetched(int pagesFetched) {
|
||||
this.pagesFetched = pagesFetched;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder pagesIndexed(int pagesIndexed) {
|
||||
this.pagesIndexed = pagesIndexed;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder incomingLinks(int incomingLinks) {
|
||||
this.incomingLinks = incomingLinks;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder outboundLinks(int outboundLinks) {
|
||||
this.outboundLinks = outboundLinks;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder nodeAffinity(int nodeAffinity) {
|
||||
this.nodeAffinity = nodeAffinity;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder ranking(double ranking) {
|
||||
this.ranking = ranking;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder suggestForCrawling(boolean suggestForCrawling) {
|
||||
this.suggestForCrawling = suggestForCrawling;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder inCrawlQueue(boolean inCrawlQueue) {
|
||||
this.inCrawlQueue = inCrawlQueue;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder unknownDomain(boolean unknownDomain) {
|
||||
this.unknownDomain = unknownDomain;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder ip(String ip) {
|
||||
this.ip = ip;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder asn(Integer asn) {
|
||||
this.asn = asn;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder asnOrg(String asnOrg) {
|
||||
this.asnOrg = asnOrg;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder asnCountry(String asnCountry) {
|
||||
this.asnCountry = asnCountry;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder ipCountry(String ipCountry) {
|
||||
this.ipCountry = ipCountry;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder state(String state) {
|
||||
this.state = state;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformation build() {
|
||||
return new DomainInformation(this.domain, this.blacklisted, this.pagesKnown, this.pagesFetched, this.pagesIndexed, this.incomingLinks, this.outboundLinks, this.nodeAffinity, this.ranking, this.suggestForCrawling, this.inCrawlQueue, this.unknownDomain, this.ip, this.asn, this.asnOrg, this.asnCountry, this.ipCountry, this.state);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "DomainInformation.DomainInformationBuilder(domain=" + this.domain + ", blacklisted=" + this.blacklisted + ", pagesKnown=" + this.pagesKnown + ", pagesFetched=" + this.pagesFetched + ", pagesIndexed=" + this.pagesIndexed + ", incomingLinks=" + this.incomingLinks + ", outboundLinks=" + this.outboundLinks + ", nodeAffinity=" + this.nodeAffinity + ", ranking=" + this.ranking + ", suggestForCrawling=" + this.suggestForCrawling + ", inCrawlQueue=" + this.inCrawlQueue + ", unknownDomain=" + this.unknownDomain + ", ip=" + this.ip + ", asn=" + this.asn + ", asnOrg=" + this.asnOrg + ", asnCountry=" + this.asnCountry + ", ipCountry=" + this.ipCountry + ", state=" + this.state + ")";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,14 +1,29 @@
|
||||
package nu.marginalia.api.math.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.ToString;
|
||||
|
||||
@AllArgsConstructor
|
||||
@Getter
|
||||
@ToString
|
||||
public class DictionaryEntry {
|
||||
public final String type;
|
||||
public final String word;
|
||||
public final String definition;
|
||||
|
||||
public DictionaryEntry(String type, String word, String definition) {
|
||||
this.type = type;
|
||||
this.word = word;
|
||||
this.definition = definition;
|
||||
}
|
||||
|
||||
public String getType() {
|
||||
return this.type;
|
||||
}
|
||||
|
||||
public String getWord() {
|
||||
return this.word;
|
||||
}
|
||||
|
||||
public String getDefinition() {
|
||||
return this.definition;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "DictionaryEntry(type=" + this.getType() + ", word=" + this.getWord() + ", definition=" + this.getDefinition() + ")";
|
||||
}
|
||||
}
|
||||
|
@ -1,14 +1,28 @@
|
||||
package nu.marginalia.api.math.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.ToString;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@ToString @Getter @AllArgsConstructor @NoArgsConstructor
|
||||
public class DictionaryResponse {
|
||||
public String word;
|
||||
public List<DictionaryEntry> entries;
|
||||
|
||||
public DictionaryResponse(String word, List<DictionaryEntry> entries) {
|
||||
this.word = word;
|
||||
this.entries = entries;
|
||||
}
|
||||
|
||||
public DictionaryResponse() {
|
||||
}
|
||||
|
||||
public String getWord() {
|
||||
return this.word;
|
||||
}
|
||||
|
||||
public List<DictionaryEntry> getEntries() {
|
||||
return this.entries;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "DictionaryResponse(word=" + this.getWord() + ", entries=" + this.getEntries() + ")";
|
||||
}
|
||||
}
|
||||
|
@ -1,10 +1,7 @@
|
||||
package nu.marginalia.functions.math.eval;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.ToString;
|
||||
|
||||
import com.google.inject.Singleton;
|
||||
|
||||
import java.math.RoundingMode;
|
||||
import java.text.DecimalFormat;
|
||||
import java.text.NumberFormat;
|
||||
@ -44,7 +41,6 @@ public class MathParser {
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public double eval(String inputExpression) throws ParseException {
|
||||
if (isTrivial.test(inputExpression)) {
|
||||
return Double.parseDouble(inputExpression);
|
||||
@ -243,10 +239,13 @@ public class MathParser {
|
||||
}
|
||||
}
|
||||
|
||||
@AllArgsConstructor @ToString
|
||||
class Token {
|
||||
public final char tokenType;
|
||||
|
||||
public Token(char tokenType) {
|
||||
this.tokenType = tokenType;
|
||||
}
|
||||
|
||||
public double evaluate() {
|
||||
throw new IllegalArgumentException("Can't evaluate" + this);
|
||||
}
|
||||
@ -254,9 +253,12 @@ class Token {
|
||||
public void transform(Function<List<Token>, List<Token>> mapper) {
|
||||
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "Token(tokenType=" + this.tokenType + ")";
|
||||
}
|
||||
}
|
||||
|
||||
@ToString
|
||||
class StringToken extends Token {
|
||||
public final String value;
|
||||
|
||||
@ -274,6 +276,10 @@ class StringToken extends Token {
|
||||
|
||||
return Double.parseDouble(value);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "StringToken(value=" + this.value + ")";
|
||||
}
|
||||
}
|
||||
|
||||
class UniExpression extends Token {
|
||||
@ -302,7 +308,6 @@ class UniExpression extends Token {
|
||||
}
|
||||
}
|
||||
|
||||
@ToString
|
||||
class GroupExpression extends Token {
|
||||
public List<Token> argument;
|
||||
|
||||
@ -323,6 +328,10 @@ class GroupExpression extends Token {
|
||||
public void transform(Function<List<Token>, List<Token>> mapper) {
|
||||
argument = mapper.apply(argument);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "GroupExpression(argument=" + this.argument + ")";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -1,10 +1,9 @@
|
||||
package nu.marginalia.functions.math.eval;
|
||||
|
||||
import com.opencsv.CSVReader;
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.opencsv.CSVReader;
|
||||
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.DecimalFormat;
|
||||
@ -19,7 +18,6 @@ public class Units {
|
||||
private final Map<String, Unit> unitsByName = new HashMap<>();
|
||||
private final MathParser mathParser;
|
||||
|
||||
@SneakyThrows
|
||||
@Inject
|
||||
public Units(MathParser mathParser) {
|
||||
this.mathParser = mathParser;
|
||||
@ -41,6 +39,9 @@ public class Units {
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.api.searchquery;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryResponse;
|
||||
@ -128,24 +127,28 @@ public class QueryProtobufCodec {
|
||||
);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private static DecoratedSearchResultItem convertDecoratedResult(RpcDecoratedResultItem results) {
|
||||
return new DecoratedSearchResultItem(
|
||||
convertRawResult(results.getRawItem()),
|
||||
new EdgeUrl(results.getUrl()),
|
||||
results.getTitle(),
|
||||
results.getDescription(),
|
||||
results.getUrlQuality(),
|
||||
results.getFormat(),
|
||||
results.getFeatures(),
|
||||
results.getPubYear(), // ??,
|
||||
results.getDataHash(),
|
||||
results.getWordsTotal(),
|
||||
results.getBestPositions(),
|
||||
results.getRankingScore(),
|
||||
results.getResultsFromDomain(),
|
||||
convertRankingDetails(results.getRankingDetails())
|
||||
);
|
||||
try {
|
||||
return new DecoratedSearchResultItem(
|
||||
convertRawResult(results.getRawItem()),
|
||||
new EdgeUrl(results.getUrl()),
|
||||
results.getTitle(),
|
||||
results.getDescription(),
|
||||
results.getUrlQuality(),
|
||||
results.getFormat(),
|
||||
results.getFeatures(),
|
||||
results.getPubYear(), // ??,
|
||||
results.getDataHash(),
|
||||
results.getWordsTotal(),
|
||||
results.getBestPositions(),
|
||||
results.getRankingScore(),
|
||||
results.getResultsFromDomain(),
|
||||
convertRankingDetails(results.getRankingDetails())
|
||||
);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException("Failed to convert result", ex);
|
||||
}
|
||||
}
|
||||
|
||||
private static ResultRankingDetails convertRankingDetails(RpcResultRankingDetails rankingDetails) {
|
||||
@ -325,24 +328,28 @@ public class QueryProtobufCodec {
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public static DecoratedSearchResultItem convertQueryResult(RpcDecoratedResultItem rpcDecoratedResultItem) {
|
||||
return new DecoratedSearchResultItem(
|
||||
convertRawResult(rpcDecoratedResultItem.getRawItem()),
|
||||
new EdgeUrl(rpcDecoratedResultItem.getUrl()),
|
||||
rpcDecoratedResultItem.getTitle(),
|
||||
rpcDecoratedResultItem.getDescription(),
|
||||
rpcDecoratedResultItem.getUrlQuality(),
|
||||
rpcDecoratedResultItem.getFormat(),
|
||||
rpcDecoratedResultItem.getFeatures(),
|
||||
rpcDecoratedResultItem.getPubYear(),
|
||||
rpcDecoratedResultItem.getDataHash(),
|
||||
rpcDecoratedResultItem.getWordsTotal(),
|
||||
rpcDecoratedResultItem.getBestPositions(),
|
||||
rpcDecoratedResultItem.getRankingScore(),
|
||||
rpcDecoratedResultItem.getResultsFromDomain(),
|
||||
convertRankingDetails(rpcDecoratedResultItem.getRankingDetails())
|
||||
);
|
||||
try {
|
||||
return new DecoratedSearchResultItem(
|
||||
convertRawResult(rpcDecoratedResultItem.getRawItem()),
|
||||
new EdgeUrl(rpcDecoratedResultItem.getUrl()),
|
||||
rpcDecoratedResultItem.getTitle(),
|
||||
rpcDecoratedResultItem.getDescription(),
|
||||
rpcDecoratedResultItem.getUrlQuality(),
|
||||
rpcDecoratedResultItem.getFormat(),
|
||||
rpcDecoratedResultItem.getFeatures(),
|
||||
rpcDecoratedResultItem.getPubYear(),
|
||||
rpcDecoratedResultItem.getDataHash(),
|
||||
rpcDecoratedResultItem.getWordsTotal(),
|
||||
rpcDecoratedResultItem.getBestPositions(),
|
||||
rpcDecoratedResultItem.getRankingScore(),
|
||||
rpcDecoratedResultItem.getResultsFromDomain(),
|
||||
convertRankingDetails(rpcDecoratedResultItem.getRankingDetails())
|
||||
);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException("Failed to convert result", ex);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,41 +1,53 @@
|
||||
package nu.marginalia.api.searchquery.model.query;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.With;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Getter
|
||||
@AllArgsConstructor
|
||||
@With
|
||||
@EqualsAndHashCode
|
||||
public class SearchQuery {
|
||||
|
||||
/** An infix style expression that encodes the required terms in the query */
|
||||
/**
|
||||
* An infix style expression that encodes the required terms in the query
|
||||
*/
|
||||
public final String compiledQuery;
|
||||
|
||||
/** All terms that appear in {@see compiledQuery} */
|
||||
/**
|
||||
* All terms that appear in {@see compiledQuery}
|
||||
*/
|
||||
public final List<String> searchTermsInclude;
|
||||
|
||||
/** These terms must be absent from the document */
|
||||
/**
|
||||
* These terms must be absent from the document
|
||||
*/
|
||||
public final List<String> searchTermsExclude;
|
||||
|
||||
/** These terms must be present in the document, but are not used in ranking */
|
||||
/**
|
||||
* These terms must be present in the document, but are not used in ranking
|
||||
*/
|
||||
public final List<String> searchTermsAdvice;
|
||||
|
||||
/** If these optional terms are present in the document, rank it highly */
|
||||
/**
|
||||
* If these optional terms are present in the document, rank it highly
|
||||
*/
|
||||
public final List<String> searchTermsPriority;
|
||||
|
||||
/** Terms that we require to be in the same sentence */
|
||||
/**
|
||||
* Terms that we require to be in the same sentence
|
||||
*/
|
||||
public final List<SearchPhraseConstraint> phraseConstraints;
|
||||
|
||||
@Deprecated // why does this exist?
|
||||
private double value = 0;
|
||||
|
||||
public SearchQuery(String compiledQuery, List<String> searchTermsInclude, List<String> searchTermsExclude, List<String> searchTermsAdvice, List<String> searchTermsPriority, List<SearchPhraseConstraint> phraseConstraints, double value) {
|
||||
this.compiledQuery = compiledQuery;
|
||||
this.searchTermsInclude = searchTermsInclude;
|
||||
this.searchTermsExclude = searchTermsExclude;
|
||||
this.searchTermsAdvice = searchTermsAdvice;
|
||||
this.searchTermsPriority = searchTermsPriority;
|
||||
this.phraseConstraints = phraseConstraints;
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
public static SearchQueryBuilder builder() {
|
||||
return new SearchQueryBuilder();
|
||||
}
|
||||
@ -77,14 +89,132 @@ public class SearchQuery {
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
if (!compiledQuery.isEmpty()) sb.append("compiledQuery=").append(compiledQuery).append(", ");
|
||||
if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] ")));
|
||||
if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] ")));
|
||||
if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] ")));
|
||||
if (!phraseConstraints.isEmpty()) sb.append("phraseConstraints=").append(phraseConstraints.stream().map(coh->coh.terms().stream().collect(Collectors.joining(",", "[", "] "))).collect(Collectors.joining(", ")));
|
||||
if (!searchTermsExclude.isEmpty())
|
||||
sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] ")));
|
||||
if (!searchTermsAdvice.isEmpty())
|
||||
sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] ")));
|
||||
if (!searchTermsPriority.isEmpty())
|
||||
sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] ")));
|
||||
if (!phraseConstraints.isEmpty())
|
||||
sb.append("phraseConstraints=").append(phraseConstraints.stream().map(coh -> coh.terms().stream().collect(Collectors.joining(",", "[", "] "))).collect(Collectors.joining(", ")));
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public String getCompiledQuery() {
|
||||
return this.compiledQuery;
|
||||
}
|
||||
|
||||
public List<String> getSearchTermsInclude() {
|
||||
return this.searchTermsInclude;
|
||||
}
|
||||
|
||||
public List<String> getSearchTermsExclude() {
|
||||
return this.searchTermsExclude;
|
||||
}
|
||||
|
||||
public List<String> getSearchTermsAdvice() {
|
||||
return this.searchTermsAdvice;
|
||||
}
|
||||
|
||||
public List<String> getSearchTermsPriority() {
|
||||
return this.searchTermsPriority;
|
||||
}
|
||||
|
||||
public List<SearchPhraseConstraint> getPhraseConstraints() {
|
||||
return this.phraseConstraints;
|
||||
}
|
||||
|
||||
@Deprecated
|
||||
public double getValue() {
|
||||
return this.value;
|
||||
}
|
||||
|
||||
public SearchQuery withCompiledQuery(String compiledQuery) {
|
||||
return this.compiledQuery == compiledQuery ? this : new SearchQuery(compiledQuery, this.searchTermsInclude, this.searchTermsExclude, this.searchTermsAdvice, this.searchTermsPriority, this.phraseConstraints, this.value);
|
||||
}
|
||||
|
||||
public SearchQuery withSearchTermsInclude(List<String> searchTermsInclude) {
|
||||
return this.searchTermsInclude == searchTermsInclude ? this : new SearchQuery(this.compiledQuery, searchTermsInclude, this.searchTermsExclude, this.searchTermsAdvice, this.searchTermsPriority, this.phraseConstraints, this.value);
|
||||
}
|
||||
|
||||
public SearchQuery withSearchTermsExclude(List<String> searchTermsExclude) {
|
||||
return this.searchTermsExclude == searchTermsExclude ? this : new SearchQuery(this.compiledQuery, this.searchTermsInclude, searchTermsExclude, this.searchTermsAdvice, this.searchTermsPriority, this.phraseConstraints, this.value);
|
||||
}
|
||||
|
||||
public SearchQuery withSearchTermsAdvice(List<String> searchTermsAdvice) {
|
||||
return this.searchTermsAdvice == searchTermsAdvice ? this : new SearchQuery(this.compiledQuery, this.searchTermsInclude, this.searchTermsExclude, searchTermsAdvice, this.searchTermsPriority, this.phraseConstraints, this.value);
|
||||
}
|
||||
|
||||
public SearchQuery withSearchTermsPriority(List<String> searchTermsPriority) {
|
||||
return this.searchTermsPriority == searchTermsPriority ? this : new SearchQuery(this.compiledQuery, this.searchTermsInclude, this.searchTermsExclude, this.searchTermsAdvice, searchTermsPriority, this.phraseConstraints, this.value);
|
||||
}
|
||||
|
||||
public SearchQuery withPhraseConstraints(List<SearchPhraseConstraint> phraseConstraints) {
|
||||
return this.phraseConstraints == phraseConstraints ? this : new SearchQuery(this.compiledQuery, this.searchTermsInclude, this.searchTermsExclude, this.searchTermsAdvice, this.searchTermsPriority, phraseConstraints, this.value);
|
||||
}
|
||||
|
||||
public SearchQuery withValue(double value) {
|
||||
return this.value == value ? this : new SearchQuery(this.compiledQuery, this.searchTermsInclude, this.searchTermsExclude, this.searchTermsAdvice, this.searchTermsPriority, this.phraseConstraints, value);
|
||||
}
|
||||
|
||||
public boolean equals(final Object o) {
|
||||
if (o == this) return true;
|
||||
if (!(o instanceof SearchQuery)) return false;
|
||||
final SearchQuery other = (SearchQuery) o;
|
||||
if (!other.canEqual((Object) this)) return false;
|
||||
final Object this$compiledQuery = this.getCompiledQuery();
|
||||
final Object other$compiledQuery = other.getCompiledQuery();
|
||||
if (this$compiledQuery == null ? other$compiledQuery != null : !this$compiledQuery.equals(other$compiledQuery))
|
||||
return false;
|
||||
final Object this$searchTermsInclude = this.getSearchTermsInclude();
|
||||
final Object other$searchTermsInclude = other.getSearchTermsInclude();
|
||||
if (this$searchTermsInclude == null ? other$searchTermsInclude != null : !this$searchTermsInclude.equals(other$searchTermsInclude))
|
||||
return false;
|
||||
final Object this$searchTermsExclude = this.getSearchTermsExclude();
|
||||
final Object other$searchTermsExclude = other.getSearchTermsExclude();
|
||||
if (this$searchTermsExclude == null ? other$searchTermsExclude != null : !this$searchTermsExclude.equals(other$searchTermsExclude))
|
||||
return false;
|
||||
final Object this$searchTermsAdvice = this.getSearchTermsAdvice();
|
||||
final Object other$searchTermsAdvice = other.getSearchTermsAdvice();
|
||||
if (this$searchTermsAdvice == null ? other$searchTermsAdvice != null : !this$searchTermsAdvice.equals(other$searchTermsAdvice))
|
||||
return false;
|
||||
final Object this$searchTermsPriority = this.getSearchTermsPriority();
|
||||
final Object other$searchTermsPriority = other.getSearchTermsPriority();
|
||||
if (this$searchTermsPriority == null ? other$searchTermsPriority != null : !this$searchTermsPriority.equals(other$searchTermsPriority))
|
||||
return false;
|
||||
final Object this$phraseConstraints = this.getPhraseConstraints();
|
||||
final Object other$phraseConstraints = other.getPhraseConstraints();
|
||||
if (this$phraseConstraints == null ? other$phraseConstraints != null : !this$phraseConstraints.equals(other$phraseConstraints))
|
||||
return false;
|
||||
if (Double.compare(this.getValue(), other.getValue()) != 0) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
protected boolean canEqual(final Object other) {
|
||||
return other instanceof SearchQuery;
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
final int PRIME = 59;
|
||||
int result = 1;
|
||||
final Object $compiledQuery = this.getCompiledQuery();
|
||||
result = result * PRIME + ($compiledQuery == null ? 43 : $compiledQuery.hashCode());
|
||||
final Object $searchTermsInclude = this.getSearchTermsInclude();
|
||||
result = result * PRIME + ($searchTermsInclude == null ? 43 : $searchTermsInclude.hashCode());
|
||||
final Object $searchTermsExclude = this.getSearchTermsExclude();
|
||||
result = result * PRIME + ($searchTermsExclude == null ? 43 : $searchTermsExclude.hashCode());
|
||||
final Object $searchTermsAdvice = this.getSearchTermsAdvice();
|
||||
result = result * PRIME + ($searchTermsAdvice == null ? 43 : $searchTermsAdvice.hashCode());
|
||||
final Object $searchTermsPriority = this.getSearchTermsPriority();
|
||||
result = result * PRIME + ($searchTermsPriority == null ? 43 : $searchTermsPriority.hashCode());
|
||||
final Object $phraseConstraints = this.getPhraseConstraints();
|
||||
result = result * PRIME + ($phraseConstraints == null ? 43 : $phraseConstraints.hashCode());
|
||||
final long $value = Double.doubleToLongBits(this.getValue());
|
||||
result = result * PRIME + (int) ($value >>> 32 ^ $value);
|
||||
return result;
|
||||
}
|
||||
|
||||
public static class SearchQueryBuilder {
|
||||
private String compiledQuery;
|
||||
public final List<String> searchTermsInclude = new ArrayList<>();
|
||||
@ -130,7 +260,9 @@ public class SearchQuery {
|
||||
return new SearchQuery(compiledQuery, searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchPhraseConstraints);
|
||||
}
|
||||
|
||||
/** If there are no ranking terms, promote the advice terms to ranking terms */
|
||||
/**
|
||||
* If there are no ranking terms, promote the advice terms to ranking terms
|
||||
*/
|
||||
public void promoteNonRankingTerms() {
|
||||
if (searchTermsInclude.isEmpty() && !searchTermsAdvice.isEmpty()) {
|
||||
searchTermsInclude.addAll(searchTermsAdvice);
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.api.searchquery.model.query;
|
||||
|
||||
import lombok.*;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
@ -8,29 +7,207 @@ import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@ToString @Getter @Builder @With @AllArgsConstructor
|
||||
public class SearchSpecification {
|
||||
public SearchQuery query;
|
||||
|
||||
/** If present and not empty, limit the search to these domain IDs */
|
||||
/**
|
||||
* If present and not empty, limit the search to these domain IDs
|
||||
*/
|
||||
public List<Integer> domains;
|
||||
|
||||
public String searchSetIdentifier;
|
||||
|
||||
public final String humanQuery;
|
||||
|
||||
@Builder.Default
|
||||
public final SpecificationLimit quality = SpecificationLimit.none();
|
||||
@Builder.Default
|
||||
public final SpecificationLimit year = SpecificationLimit.none();
|
||||
@Builder.Default
|
||||
public final SpecificationLimit size = SpecificationLimit.none();
|
||||
@Builder.Default
|
||||
public final SpecificationLimit rank = SpecificationLimit.none();
|
||||
public SpecificationLimit quality;
|
||||
public SpecificationLimit year;
|
||||
public SpecificationLimit size;
|
||||
public SpecificationLimit rank;
|
||||
|
||||
public final QueryLimits queryLimits;
|
||||
|
||||
public final QueryStrategy queryStrategy;
|
||||
|
||||
public final ResultRankingParameters rankingParams;
|
||||
|
||||
public SearchSpecification(SearchQuery query,
|
||||
List<Integer> domains,
|
||||
String searchSetIdentifier,
|
||||
String humanQuery,
|
||||
SpecificationLimit quality,
|
||||
SpecificationLimit year,
|
||||
SpecificationLimit size,
|
||||
SpecificationLimit rank,
|
||||
QueryLimits queryLimits,
|
||||
QueryStrategy queryStrategy,
|
||||
ResultRankingParameters rankingParams)
|
||||
{
|
||||
this.query = query;
|
||||
this.domains = domains;
|
||||
this.searchSetIdentifier = searchSetIdentifier;
|
||||
this.humanQuery = humanQuery;
|
||||
this.quality = quality;
|
||||
this.year = year;
|
||||
this.size = size;
|
||||
this.rank = rank;
|
||||
this.queryLimits = queryLimits;
|
||||
this.queryStrategy = queryStrategy;
|
||||
this.rankingParams = rankingParams;
|
||||
}
|
||||
|
||||
public static SearchSpecificationBuilder builder() {
|
||||
return new SearchSpecificationBuilder();
|
||||
}
|
||||
|
||||
public SearchQuery getQuery() {
|
||||
return this.query;
|
||||
}
|
||||
|
||||
public List<Integer> getDomains() {
|
||||
return this.domains;
|
||||
}
|
||||
|
||||
public String getSearchSetIdentifier() {
|
||||
return this.searchSetIdentifier;
|
||||
}
|
||||
|
||||
public String getHumanQuery() {
|
||||
return this.humanQuery;
|
||||
}
|
||||
|
||||
public SpecificationLimit getQuality() {
|
||||
return this.quality;
|
||||
}
|
||||
|
||||
public SpecificationLimit getYear() {
|
||||
return this.year;
|
||||
}
|
||||
|
||||
public SpecificationLimit getSize() {
|
||||
return this.size;
|
||||
}
|
||||
|
||||
public SpecificationLimit getRank() {
|
||||
return this.rank;
|
||||
}
|
||||
|
||||
public QueryLimits getQueryLimits() {
|
||||
return this.queryLimits;
|
||||
}
|
||||
|
||||
public QueryStrategy getQueryStrategy() {
|
||||
return this.queryStrategy;
|
||||
}
|
||||
|
||||
public ResultRankingParameters getRankingParams() {
|
||||
return this.rankingParams;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "SearchSpecification(query=" + this.getQuery() + ", domains=" + this.getDomains() + ", searchSetIdentifier=" + this.getSearchSetIdentifier() + ", humanQuery=" + this.getHumanQuery() + ", quality=" + this.getQuality() + ", year=" + this.getYear() + ", size=" + this.getSize() + ", rank=" + this.getRank() + ", queryLimits=" + this.getQueryLimits() + ", queryStrategy=" + this.getQueryStrategy() + ", rankingParams=" + this.getRankingParams() + ")";
|
||||
}
|
||||
|
||||
public static class SearchSpecificationBuilder {
|
||||
private SearchQuery query;
|
||||
private List<Integer> domains;
|
||||
private String searchSetIdentifier;
|
||||
private String humanQuery;
|
||||
private SpecificationLimit quality$value;
|
||||
private boolean quality$set;
|
||||
private SpecificationLimit year$value;
|
||||
private boolean year$set;
|
||||
private SpecificationLimit size$value;
|
||||
private boolean size$set;
|
||||
private SpecificationLimit rank$value;
|
||||
private boolean rank$set;
|
||||
private QueryLimits queryLimits;
|
||||
private QueryStrategy queryStrategy;
|
||||
private ResultRankingParameters rankingParams;
|
||||
|
||||
SearchSpecificationBuilder() {
|
||||
}
|
||||
|
||||
public SearchSpecificationBuilder query(SearchQuery query) {
|
||||
this.query = query;
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchSpecificationBuilder domains(List<Integer> domains) {
|
||||
this.domains = domains;
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchSpecificationBuilder searchSetIdentifier(String searchSetIdentifier) {
|
||||
this.searchSetIdentifier = searchSetIdentifier;
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchSpecificationBuilder humanQuery(String humanQuery) {
|
||||
this.humanQuery = humanQuery;
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchSpecificationBuilder quality(SpecificationLimit quality) {
|
||||
this.quality$value = quality;
|
||||
this.quality$set = true;
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchSpecificationBuilder year(SpecificationLimit year) {
|
||||
this.year$value = year;
|
||||
this.year$set = true;
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchSpecificationBuilder size(SpecificationLimit size) {
|
||||
this.size$value = size;
|
||||
this.size$set = true;
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchSpecificationBuilder rank(SpecificationLimit rank) {
|
||||
this.rank$value = rank;
|
||||
this.rank$set = true;
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchSpecificationBuilder queryLimits(QueryLimits queryLimits) {
|
||||
this.queryLimits = queryLimits;
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchSpecificationBuilder queryStrategy(QueryStrategy queryStrategy) {
|
||||
this.queryStrategy = queryStrategy;
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchSpecificationBuilder rankingParams(ResultRankingParameters rankingParams) {
|
||||
this.rankingParams = rankingParams;
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchSpecification build() {
|
||||
SpecificationLimit quality$value = this.quality$value;
|
||||
if (!this.quality$set) {
|
||||
quality$value = SpecificationLimit.none();
|
||||
}
|
||||
SpecificationLimit year$value = this.year$value;
|
||||
if (!this.year$set) {
|
||||
year$value = SpecificationLimit.none();
|
||||
}
|
||||
SpecificationLimit size$value = this.size$value;
|
||||
if (!this.size$set) {
|
||||
size$value = SpecificationLimit.none();
|
||||
}
|
||||
SpecificationLimit rank$value = this.rank$value;
|
||||
if (!this.rank$set) {
|
||||
rank$value = SpecificationLimit.none();
|
||||
}
|
||||
return new SearchSpecification(this.query, this.domains, this.searchSetIdentifier, this.humanQuery, quality$value, year$value, size$value, rank$value, this.queryLimits, this.queryStrategy, this.rankingParams);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "SearchSpecification.SearchSpecificationBuilder(query=" + this.query + ", domains=" + this.domains + ", searchSetIdentifier=" + this.searchSetIdentifier + ", humanQuery=" + this.humanQuery + ", quality$value=" + this.quality$value + ", year$value=" + this.year$value + ", size$value=" + this.size$value + ", rank$value=" + this.rank$value + ", queryLimits=" + this.queryLimits + ", queryStrategy=" + this.queryStrategy + ", rankingParams=" + this.rankingParams + ")";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,7 +1,5 @@
|
||||
package nu.marginalia.api.searchquery.model.results;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
@ -9,8 +7,6 @@ import org.jetbrains.annotations.NotNull;
|
||||
import javax.annotation.Nullable;
|
||||
import java.util.List;
|
||||
|
||||
@Getter
|
||||
@ToString
|
||||
public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResultItem> {
|
||||
public final SearchResultItem rawIndexResult;
|
||||
|
||||
@ -24,7 +20,9 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
|
||||
@NotNull
|
||||
public final String format;
|
||||
|
||||
/** Document features bitmask, see HtmlFeature */
|
||||
/**
|
||||
* Document features bitmask, see HtmlFeature
|
||||
*/
|
||||
public final int features;
|
||||
|
||||
@Nullable
|
||||
@ -42,6 +40,7 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
|
||||
public long documentId() {
|
||||
return rawIndexResult.getDocumentId();
|
||||
}
|
||||
|
||||
public int domainId() {
|
||||
return rawIndexResult.getDomainId();
|
||||
}
|
||||
@ -74,8 +73,7 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
|
||||
int resultsFromDomain,
|
||||
@Nullable
|
||||
ResultRankingDetails rankingDetails
|
||||
)
|
||||
{
|
||||
) {
|
||||
this.rawIndexResult = rawIndexResult;
|
||||
this.url = url;
|
||||
this.title = title;
|
||||
@ -94,11 +92,73 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
|
||||
|
||||
@Override
|
||||
public int compareTo(@NotNull DecoratedSearchResultItem o) {
|
||||
int diff = Double.compare(rankingScore, o.rankingScore);
|
||||
int diff = Double.compare(rankingScore, o.rankingScore);
|
||||
|
||||
if (diff == 0)
|
||||
diff = Long.compare(documentId(), o.documentId());
|
||||
|
||||
return diff;
|
||||
}
|
||||
|
||||
public SearchResultItem getRawIndexResult() {
|
||||
return this.rawIndexResult;
|
||||
}
|
||||
|
||||
public @NotNull EdgeUrl getUrl() {
|
||||
return this.url;
|
||||
}
|
||||
|
||||
public @NotNull String getTitle() {
|
||||
return this.title;
|
||||
}
|
||||
|
||||
public @NotNull String getDescription() {
|
||||
return this.description;
|
||||
}
|
||||
|
||||
public double getUrlQuality() {
|
||||
return this.urlQuality;
|
||||
}
|
||||
|
||||
public @NotNull String getFormat() {
|
||||
return this.format;
|
||||
}
|
||||
|
||||
public int getFeatures() {
|
||||
return this.features;
|
||||
}
|
||||
|
||||
@Nullable
|
||||
public Integer getPubYear() {
|
||||
return this.pubYear;
|
||||
}
|
||||
|
||||
public long getDataHash() {
|
||||
return this.dataHash;
|
||||
}
|
||||
|
||||
public int getWordsTotal() {
|
||||
return this.wordsTotal;
|
||||
}
|
||||
|
||||
public long getBestPositions() {
|
||||
return this.bestPositions;
|
||||
}
|
||||
|
||||
public double getRankingScore() {
|
||||
return this.rankingScore;
|
||||
}
|
||||
|
||||
public int getResultsFromDomain() {
|
||||
return this.resultsFromDomain;
|
||||
}
|
||||
|
||||
@Nullable
|
||||
public ResultRankingDetails getRankingDetails() {
|
||||
return this.rankingDetails;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "DecoratedSearchResultItem(rawIndexResult=" + this.getRawIndexResult() + ", url=" + this.getUrl() + ", title=" + this.getTitle() + ", description=" + this.getDescription() + ", urlQuality=" + this.getUrlQuality() + ", format=" + this.getFormat() + ", features=" + this.getFeatures() + ", pubYear=" + this.getPubYear() + ", dataHash=" + this.getDataHash() + ", wordsTotal=" + this.getWordsTotal() + ", bestPositions=" + this.getBestPositions() + ", rankingScore=" + this.getRankingScore() + ", resultsFromDomain=" + this.getResultsFromDomain() + ", rankingDetails=" + this.getRankingDetails() + ")";
|
||||
}
|
||||
}
|
||||
|
@ -1,11 +1,9 @@
|
||||
package nu.marginalia.api.searchquery.model.results;
|
||||
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
|
||||
|
||||
import java.util.BitSet;
|
||||
|
||||
@ToString
|
||||
public class ResultRankingContext {
|
||||
private final int docCount;
|
||||
public final ResultRankingParameters params;
|
||||
@ -43,4 +41,15 @@ public class ResultRankingContext {
|
||||
return docCount;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "ResultRankingContext{" +
|
||||
"docCount=" + docCount +
|
||||
", params=" + params +
|
||||
", regularMask=" + regularMask +
|
||||
", ngramsMask=" + ngramsMask +
|
||||
", fullCounts=" + fullCounts +
|
||||
", priorityCounts=" + priorityCounts +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
|
@ -1,33 +1,38 @@
|
||||
package nu.marginalia.api.searchquery.model.results;
|
||||
|
||||
import lombok.*;
|
||||
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@ToString
|
||||
@EqualsAndHashCode
|
||||
@Getter // getter for the mustache template engine's behalf
|
||||
public class ResultRankingParameters {
|
||||
|
||||
/** Tuning for BM25 when applied to full document matches */
|
||||
/**
|
||||
* Tuning for BM25 when applied to full document matches
|
||||
*/
|
||||
public final Bm25Parameters bm25Params;
|
||||
|
||||
/** Documents below this length are penalized */
|
||||
/**
|
||||
* Documents below this length are penalized
|
||||
*/
|
||||
public int shortDocumentThreshold;
|
||||
|
||||
public double shortDocumentPenalty;
|
||||
|
||||
|
||||
/** Scaling factor associated with domain rank (unscaled rank value is 0-255; high is good) */
|
||||
/**
|
||||
* Scaling factor associated with domain rank (unscaled rank value is 0-255; high is good)
|
||||
*/
|
||||
public double domainRankBonus;
|
||||
|
||||
/** Scaling factor associated with document quality (unscaled rank value is 0-15; high is bad) */
|
||||
/**
|
||||
* Scaling factor associated with document quality (unscaled rank value is 0-15; high is bad)
|
||||
*/
|
||||
public double qualityPenalty;
|
||||
|
||||
/** Average sentence length values below this threshold are penalized, range [0-4), 2 or 3 is probably what you want */
|
||||
/**
|
||||
* Average sentence length values below this threshold are penalized, range [0-4), 2 or 3 is probably what you want
|
||||
*/
|
||||
public int shortSentenceThreshold;
|
||||
|
||||
/** Magnitude of penalty for documents with low average sentence length */
|
||||
/**
|
||||
* Magnitude of penalty for documents with low average sentence length
|
||||
*/
|
||||
public double shortSentencePenalty;
|
||||
|
||||
public double bm25Weight;
|
||||
@ -40,13 +45,30 @@ public class ResultRankingParameters {
|
||||
|
||||
public boolean exportDebugData;
|
||||
|
||||
public ResultRankingParameters(Bm25Parameters bm25Params, int shortDocumentThreshold, double shortDocumentPenalty, double domainRankBonus, double qualityPenalty, int shortSentenceThreshold, double shortSentencePenalty, double bm25Weight, double tcfFirstPosition, double tcfVerbatim, double tcfProximity, TemporalBias temporalBias, double temporalBiasWeight, boolean exportDebugData) {
|
||||
this.bm25Params = bm25Params;
|
||||
this.shortDocumentThreshold = shortDocumentThreshold;
|
||||
this.shortDocumentPenalty = shortDocumentPenalty;
|
||||
this.domainRankBonus = domainRankBonus;
|
||||
this.qualityPenalty = qualityPenalty;
|
||||
this.shortSentenceThreshold = shortSentenceThreshold;
|
||||
this.shortSentencePenalty = shortSentencePenalty;
|
||||
this.bm25Weight = bm25Weight;
|
||||
this.tcfFirstPosition = tcfFirstPosition;
|
||||
this.tcfVerbatim = tcfVerbatim;
|
||||
this.tcfProximity = tcfProximity;
|
||||
this.temporalBias = temporalBias;
|
||||
this.temporalBiasWeight = temporalBiasWeight;
|
||||
this.exportDebugData = exportDebugData;
|
||||
}
|
||||
|
||||
public static ResultRankingParameters sensibleDefaults() {
|
||||
return builder()
|
||||
.bm25Params(new Bm25Parameters(1.2, 0.5))
|
||||
.shortDocumentThreshold(2000)
|
||||
.shortDocumentPenalty(2.)
|
||||
.domainRankBonus(1/25.)
|
||||
.qualityPenalty(1/15.)
|
||||
.domainRankBonus(1 / 25.)
|
||||
.qualityPenalty(1 / 15.)
|
||||
.shortSentenceThreshold(2)
|
||||
.shortSentencePenalty(5)
|
||||
.bm25Weight(1.)
|
||||
@ -59,7 +81,232 @@ public class ResultRankingParameters {
|
||||
.build();
|
||||
}
|
||||
|
||||
public static ResultRankingParametersBuilder builder() {
|
||||
return new ResultRankingParametersBuilder();
|
||||
}
|
||||
|
||||
public Bm25Parameters getBm25Params() {
|
||||
return this.bm25Params;
|
||||
}
|
||||
|
||||
public int getShortDocumentThreshold() {
|
||||
return this.shortDocumentThreshold;
|
||||
}
|
||||
|
||||
public double getShortDocumentPenalty() {
|
||||
return this.shortDocumentPenalty;
|
||||
}
|
||||
|
||||
public double getDomainRankBonus() {
|
||||
return this.domainRankBonus;
|
||||
}
|
||||
|
||||
public double getQualityPenalty() {
|
||||
return this.qualityPenalty;
|
||||
}
|
||||
|
||||
public int getShortSentenceThreshold() {
|
||||
return this.shortSentenceThreshold;
|
||||
}
|
||||
|
||||
public double getShortSentencePenalty() {
|
||||
return this.shortSentencePenalty;
|
||||
}
|
||||
|
||||
public double getBm25Weight() {
|
||||
return this.bm25Weight;
|
||||
}
|
||||
|
||||
public double getTcfFirstPosition() {
|
||||
return this.tcfFirstPosition;
|
||||
}
|
||||
|
||||
public double getTcfVerbatim() {
|
||||
return this.tcfVerbatim;
|
||||
}
|
||||
|
||||
public double getTcfProximity() {
|
||||
return this.tcfProximity;
|
||||
}
|
||||
|
||||
public TemporalBias getTemporalBias() {
|
||||
return this.temporalBias;
|
||||
}
|
||||
|
||||
public double getTemporalBiasWeight() {
|
||||
return this.temporalBiasWeight;
|
||||
}
|
||||
|
||||
public boolean isExportDebugData() {
|
||||
return this.exportDebugData;
|
||||
}
|
||||
|
||||
public boolean equals(final Object o) {
|
||||
if (o == this) return true;
|
||||
if (!(o instanceof ResultRankingParameters)) return false;
|
||||
final ResultRankingParameters other = (ResultRankingParameters) o;
|
||||
if (!other.canEqual((Object) this)) return false;
|
||||
final Object this$bm25Params = this.getBm25Params();
|
||||
final Object other$bm25Params = other.getBm25Params();
|
||||
if (this$bm25Params == null ? other$bm25Params != null : !this$bm25Params.equals(other$bm25Params))
|
||||
return false;
|
||||
if (this.getShortDocumentThreshold() != other.getShortDocumentThreshold()) return false;
|
||||
if (Double.compare(this.getShortDocumentPenalty(), other.getShortDocumentPenalty()) != 0) return false;
|
||||
if (Double.compare(this.getDomainRankBonus(), other.getDomainRankBonus()) != 0) return false;
|
||||
if (Double.compare(this.getQualityPenalty(), other.getQualityPenalty()) != 0) return false;
|
||||
if (this.getShortSentenceThreshold() != other.getShortSentenceThreshold()) return false;
|
||||
if (Double.compare(this.getShortSentencePenalty(), other.getShortSentencePenalty()) != 0) return false;
|
||||
if (Double.compare(this.getBm25Weight(), other.getBm25Weight()) != 0) return false;
|
||||
if (Double.compare(this.getTcfFirstPosition(), other.getTcfFirstPosition()) != 0) return false;
|
||||
if (Double.compare(this.getTcfVerbatim(), other.getTcfVerbatim()) != 0) return false;
|
||||
if (Double.compare(this.getTcfProximity(), other.getTcfProximity()) != 0) return false;
|
||||
final Object this$temporalBias = this.getTemporalBias();
|
||||
final Object other$temporalBias = other.getTemporalBias();
|
||||
if (this$temporalBias == null ? other$temporalBias != null : !this$temporalBias.equals(other$temporalBias))
|
||||
return false;
|
||||
if (Double.compare(this.getTemporalBiasWeight(), other.getTemporalBiasWeight()) != 0) return false;
|
||||
if (this.isExportDebugData() != other.isExportDebugData()) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
protected boolean canEqual(final Object other) {
|
||||
return other instanceof ResultRankingParameters;
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
final int PRIME = 59;
|
||||
int result = 1;
|
||||
final Object $bm25Params = this.getBm25Params();
|
||||
result = result * PRIME + ($bm25Params == null ? 43 : $bm25Params.hashCode());
|
||||
result = result * PRIME + this.getShortDocumentThreshold();
|
||||
final long $shortDocumentPenalty = Double.doubleToLongBits(this.getShortDocumentPenalty());
|
||||
result = result * PRIME + (int) ($shortDocumentPenalty >>> 32 ^ $shortDocumentPenalty);
|
||||
final long $domainRankBonus = Double.doubleToLongBits(this.getDomainRankBonus());
|
||||
result = result * PRIME + (int) ($domainRankBonus >>> 32 ^ $domainRankBonus);
|
||||
final long $qualityPenalty = Double.doubleToLongBits(this.getQualityPenalty());
|
||||
result = result * PRIME + (int) ($qualityPenalty >>> 32 ^ $qualityPenalty);
|
||||
result = result * PRIME + this.getShortSentenceThreshold();
|
||||
final long $shortSentencePenalty = Double.doubleToLongBits(this.getShortSentencePenalty());
|
||||
result = result * PRIME + (int) ($shortSentencePenalty >>> 32 ^ $shortSentencePenalty);
|
||||
final long $bm25Weight = Double.doubleToLongBits(this.getBm25Weight());
|
||||
result = result * PRIME + (int) ($bm25Weight >>> 32 ^ $bm25Weight);
|
||||
final long $tcfFirstPosition = Double.doubleToLongBits(this.getTcfFirstPosition());
|
||||
result = result * PRIME + (int) ($tcfFirstPosition >>> 32 ^ $tcfFirstPosition);
|
||||
final long $tcfVerbatim = Double.doubleToLongBits(this.getTcfVerbatim());
|
||||
result = result * PRIME + (int) ($tcfVerbatim >>> 32 ^ $tcfVerbatim);
|
||||
final long $tcfProximity = Double.doubleToLongBits(this.getTcfProximity());
|
||||
result = result * PRIME + (int) ($tcfProximity >>> 32 ^ $tcfProximity);
|
||||
final Object $temporalBias = this.getTemporalBias();
|
||||
result = result * PRIME + ($temporalBias == null ? 43 : $temporalBias.hashCode());
|
||||
final long $temporalBiasWeight = Double.doubleToLongBits(this.getTemporalBiasWeight());
|
||||
result = result * PRIME + (int) ($temporalBiasWeight >>> 32 ^ $temporalBiasWeight);
|
||||
result = result * PRIME + (this.isExportDebugData() ? 79 : 97);
|
||||
return result;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "ResultRankingParameters(bm25Params=" + this.getBm25Params() + ", shortDocumentThreshold=" + this.getShortDocumentThreshold() + ", shortDocumentPenalty=" + this.getShortDocumentPenalty() + ", domainRankBonus=" + this.getDomainRankBonus() + ", qualityPenalty=" + this.getQualityPenalty() + ", shortSentenceThreshold=" + this.getShortSentenceThreshold() + ", shortSentencePenalty=" + this.getShortSentencePenalty() + ", bm25Weight=" + this.getBm25Weight() + ", tcfFirstPosition=" + this.getTcfFirstPosition() + ", tcfVerbatim=" + this.getTcfVerbatim() + ", tcfProximity=" + this.getTcfProximity() + ", temporalBias=" + this.getTemporalBias() + ", temporalBiasWeight=" + this.getTemporalBiasWeight() + ", exportDebugData=" + this.isExportDebugData() + ")";
|
||||
}
|
||||
|
||||
public enum TemporalBias {
|
||||
RECENT, OLD, NONE
|
||||
}
|
||||
|
||||
public static class ResultRankingParametersBuilder {
|
||||
private Bm25Parameters bm25Params;
|
||||
private int shortDocumentThreshold;
|
||||
private double shortDocumentPenalty;
|
||||
private double domainRankBonus;
|
||||
private double qualityPenalty;
|
||||
private int shortSentenceThreshold;
|
||||
private double shortSentencePenalty;
|
||||
private double bm25Weight;
|
||||
private double tcfFirstPosition;
|
||||
private double tcfVerbatim;
|
||||
private double tcfProximity;
|
||||
private TemporalBias temporalBias;
|
||||
private double temporalBiasWeight;
|
||||
private boolean exportDebugData;
|
||||
|
||||
ResultRankingParametersBuilder() {
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder bm25Params(Bm25Parameters bm25Params) {
|
||||
this.bm25Params = bm25Params;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder shortDocumentThreshold(int shortDocumentThreshold) {
|
||||
this.shortDocumentThreshold = shortDocumentThreshold;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder shortDocumentPenalty(double shortDocumentPenalty) {
|
||||
this.shortDocumentPenalty = shortDocumentPenalty;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder domainRankBonus(double domainRankBonus) {
|
||||
this.domainRankBonus = domainRankBonus;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder qualityPenalty(double qualityPenalty) {
|
||||
this.qualityPenalty = qualityPenalty;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder shortSentenceThreshold(int shortSentenceThreshold) {
|
||||
this.shortSentenceThreshold = shortSentenceThreshold;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder shortSentencePenalty(double shortSentencePenalty) {
|
||||
this.shortSentencePenalty = shortSentencePenalty;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder bm25Weight(double bm25Weight) {
|
||||
this.bm25Weight = bm25Weight;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder tcfFirstPosition(double tcfFirstPosition) {
|
||||
this.tcfFirstPosition = tcfFirstPosition;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder tcfVerbatim(double tcfVerbatim) {
|
||||
this.tcfVerbatim = tcfVerbatim;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder tcfProximity(double tcfProximity) {
|
||||
this.tcfProximity = tcfProximity;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder temporalBias(TemporalBias temporalBias) {
|
||||
this.temporalBias = temporalBias;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder temporalBiasWeight(double temporalBiasWeight) {
|
||||
this.temporalBiasWeight = temporalBiasWeight;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder exportDebugData(boolean exportDebugData) {
|
||||
this.exportDebugData = exportDebugData;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParameters build() {
|
||||
return new ResultRankingParameters(this.bm25Params, this.shortDocumentThreshold, this.shortDocumentPenalty, this.domainRankBonus, this.qualityPenalty, this.shortSentenceThreshold, this.shortSentencePenalty, this.bm25Weight, this.tcfFirstPosition, this.tcfVerbatim, this.tcfProximity, this.temporalBias, this.temporalBiasWeight, this.exportDebugData);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "ResultRankingParameters.ResultRankingParametersBuilder(bm25Params=" + this.bm25Params + ", shortDocumentThreshold=" + this.shortDocumentThreshold + ", shortDocumentPenalty=" + this.shortDocumentPenalty + ", domainRankBonus=" + this.domainRankBonus + ", qualityPenalty=" + this.qualityPenalty + ", shortSentenceThreshold=" + this.shortSentenceThreshold + ", shortSentencePenalty=" + this.shortSentencePenalty + ", bm25Weight=" + this.bm25Weight + ", tcfFirstPosition=" + this.tcfFirstPosition + ", tcfVerbatim=" + this.tcfVerbatim + ", tcfProximity=" + this.tcfProximity + ", temporalBias=" + this.temporalBias + ", temporalBiasWeight=" + this.temporalBiasWeight + ", exportDebugData=" + this.exportDebugData + ")";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,7 +1,5 @@
|
||||
package nu.marginalia.api.searchquery.model.results;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
@ -9,21 +7,30 @@ import org.jetbrains.annotations.NotNull;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/** Represents a document matching a search query */
|
||||
@AllArgsConstructor @Getter
|
||||
/**
|
||||
* Represents a document matching a search query
|
||||
*/
|
||||
public class SearchResultItem implements Comparable<SearchResultItem> {
|
||||
/** Encoded ID that contains both the URL id and its ranking. This is
|
||||
* probably not what you want, use getDocumentId() instead */
|
||||
/**
|
||||
* Encoded ID that contains both the URL id and its ranking. This is
|
||||
* probably not what you want, use getDocumentId() instead
|
||||
*/
|
||||
public final long combinedId;
|
||||
|
||||
/** Encoded document metadata */
|
||||
/**
|
||||
* Encoded document metadata
|
||||
*/
|
||||
public final long encodedDocMetadata;
|
||||
|
||||
/** Encoded html features of document */
|
||||
/**
|
||||
* Encoded html features of document
|
||||
*/
|
||||
|
||||
public final int htmlFeatures;
|
||||
|
||||
/** How did the subqueries match against the document ? */
|
||||
/**
|
||||
* How did the subqueries match against the document ?
|
||||
*/
|
||||
public final List<SearchResultKeywordScore> keywordScores;
|
||||
|
||||
public boolean hasPrioTerm;
|
||||
@ -45,6 +52,17 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
|
||||
this.scoreValue = score;
|
||||
}
|
||||
|
||||
public SearchResultItem(long combinedId, long encodedDocMetadata, int htmlFeatures, List<SearchResultKeywordScore> keywordScores, boolean hasPrioTerm, long bestPositions, DebugRankingFactors debugRankingFactors, double scoreValue) {
|
||||
this.combinedId = combinedId;
|
||||
this.encodedDocMetadata = encodedDocMetadata;
|
||||
this.htmlFeatures = htmlFeatures;
|
||||
this.keywordScores = keywordScores;
|
||||
this.hasPrioTerm = hasPrioTerm;
|
||||
this.bestPositions = bestPositions;
|
||||
this.debugRankingFactors = debugRankingFactors;
|
||||
this.scoreValue = scoreValue;
|
||||
}
|
||||
|
||||
|
||||
public long getDocumentId() {
|
||||
return UrlIdCodec.removeRank(combinedId);
|
||||
@ -56,9 +74,11 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
|
||||
|
||||
/* Used for evaluation */
|
||||
private transient double scoreValue = Double.MAX_VALUE;
|
||||
|
||||
public void setScore(double score) {
|
||||
scoreValue = score;
|
||||
}
|
||||
|
||||
public double getScore() {
|
||||
return scoreValue;
|
||||
}
|
||||
@ -81,7 +101,7 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
|
||||
if (other == this)
|
||||
return true;
|
||||
if (other instanceof SearchResultItem o) {
|
||||
return o.getDocumentId() == getDocumentId();
|
||||
return o.getDocumentId() == getDocumentId();
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@ -96,4 +116,35 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
|
||||
}
|
||||
|
||||
|
||||
public long getCombinedId() {
|
||||
return this.combinedId;
|
||||
}
|
||||
|
||||
public long getEncodedDocMetadata() {
|
||||
return this.encodedDocMetadata;
|
||||
}
|
||||
|
||||
public int getHtmlFeatures() {
|
||||
return this.htmlFeatures;
|
||||
}
|
||||
|
||||
public List<SearchResultKeywordScore> getKeywordScores() {
|
||||
return this.keywordScores;
|
||||
}
|
||||
|
||||
public boolean isHasPrioTerm() {
|
||||
return this.hasPrioTerm;
|
||||
}
|
||||
|
||||
public long getBestPositions() {
|
||||
return this.bestPositions;
|
||||
}
|
||||
|
||||
public DebugRankingFactors getDebugRankingFactors() {
|
||||
return this.debugRankingFactors;
|
||||
}
|
||||
|
||||
public double getScoreValue() {
|
||||
return this.scoreValue;
|
||||
}
|
||||
}
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.index.api;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.api.searchquery.IndexApiGrpc;
|
||||
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
|
||||
import nu.marginalia.api.searchquery.RpcIndexQuery;
|
||||
@ -51,7 +50,6 @@ public class IndexClient {
|
||||
) {}
|
||||
|
||||
/** Execute a query on the index partitions and return the combined results. */
|
||||
@SneakyThrows
|
||||
public AggregateQueryResponse executeQueries(RpcIndexQuery indexRequest, Pagination pagination) {
|
||||
List<CompletableFuture<Iterator<RpcDecoratedResultItem>>> futures =
|
||||
channelPool.call(IndexApiGrpc.IndexApiBlockingStub::query)
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.index.forward;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.index.domainrankings.DomainRankings;
|
||||
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
|
||||
import nu.marginalia.index.journal.IndexJournal;
|
||||
@ -40,9 +39,9 @@ class ForwardIndexConverterTest {
|
||||
private Path docsSpanData;
|
||||
|
||||
int workSetSize = 512;
|
||||
|
||||
@BeforeEach
|
||||
@SneakyThrows
|
||||
void setUp() {
|
||||
void setUp() throws Exception {
|
||||
|
||||
workDir = Files.createTempDirectory(getClass().getSimpleName());
|
||||
|
||||
@ -75,7 +74,7 @@ class ForwardIndexConverterTest {
|
||||
return UrlIdCodec.encodeId((int) domain, (int) url);
|
||||
}
|
||||
|
||||
public void createEntry(IndexJournalSlopWriter writer, int id) {
|
||||
public void createEntry(IndexJournalSlopWriter writer, int id) throws IOException {
|
||||
writer.put(
|
||||
createId(id, id/20),
|
||||
new SlopDocumentRecord.KeywordsProjection(
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.index.journal;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.hash.MurmurHash3_128;
|
||||
import nu.marginalia.model.processed.SlopDocumentRecord;
|
||||
import nu.marginalia.sequence.slop.VarintCodedSequenceArrayColumn;
|
||||
@ -53,8 +52,7 @@ public class IndexJournalSlopWriter extends SlopTable {
|
||||
spansWriter = IndexJournalPage.spans.create(this);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void put(long combinedId, SlopDocumentRecord.KeywordsProjection keywordsProjection) {
|
||||
public void put(long combinedId, SlopDocumentRecord.KeywordsProjection keywordsProjection) throws IOException {
|
||||
|
||||
combinedIdWriter.put(combinedId);
|
||||
featuresWriter.put(keywordsProjection.htmlFeatures());
|
||||
|
@ -1,10 +1,9 @@
|
||||
package nu.marginalia.index;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.array.page.LongQueryBuffer;
|
||||
import nu.marginalia.index.query.EntrySource;
|
||||
import nu.marginalia.sequence.io.BitReader;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import nu.marginalia.sequence.io.BitReader;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
@ -62,7 +61,6 @@ public class PrioIndexEntrySource implements EntrySource {
|
||||
}
|
||||
|
||||
@Override
|
||||
@SneakyThrows
|
||||
@SuppressWarnings("preview")
|
||||
public void read(LongQueryBuffer buffer) {
|
||||
var outputBuffer = buffer.asByteBuffer().order(ByteOrder.LITTLE_ENDIAN);
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.index.construction.full;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.index.construction.DocIdRewriter;
|
||||
import nu.marginalia.index.construction.PositionsFileConstructor;
|
||||
import nu.marginalia.index.journal.IndexJournal;
|
||||
@ -78,35 +77,48 @@ public class FullIndexConstructor {
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private FullPreindexReference construct(IndexJournalPage journalInstance, PositionsFileConstructor positionsFileConstructor) {
|
||||
return FullPreindex
|
||||
.constructPreindex(journalInstance, positionsFileConstructor, docIdRewriter, tmpDir)
|
||||
.closeToReference();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private FullPreindexReference merge(FullPreindexReference leftR, FullPreindexReference rightR) {
|
||||
|
||||
var left = leftR.open();
|
||||
var right = rightR.open();
|
||||
|
||||
try {
|
||||
return FullPreindex.merge(tmpDir, left, right).closeToReference();
|
||||
return FullPreindex
|
||||
.constructPreindex(journalInstance, positionsFileConstructor, docIdRewriter, tmpDir)
|
||||
.closeToReference();
|
||||
}
|
||||
finally {
|
||||
left.delete();
|
||||
right.delete();
|
||||
catch (IOException e) {
|
||||
logger.error("Error constructing preindex", e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private FullPreindexReference merge(FullPreindexReference leftR, FullPreindexReference rightR) {
|
||||
try {
|
||||
var left = leftR.open();
|
||||
var right = rightR.open();
|
||||
|
||||
try {
|
||||
return FullPreindex.merge(tmpDir, left, right).closeToReference();
|
||||
} finally {
|
||||
left.delete();
|
||||
right.delete();
|
||||
}
|
||||
}
|
||||
catch (IOException e) {
|
||||
logger.error("Error merging preindex", e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private void finalizeIndex(FullPreindexReference finalPR) {
|
||||
var finalP = finalPR.open();
|
||||
finalP.finalizeIndex(outputFileDocs, outputFileWords);
|
||||
finalP.delete();
|
||||
try {
|
||||
var finalP = finalPR.open();
|
||||
finalP.finalizeIndex(outputFileDocs, outputFileWords);
|
||||
finalP.delete();
|
||||
}
|
||||
catch (IOException e) {
|
||||
logger.error("Error finalizing index", e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.index.construction.full;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import nu.marginalia.index.construction.DocIdRewriter;
|
||||
@ -113,7 +112,6 @@ public class FullPreindexDocuments {
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private static void sortDocsFile(LongArray docsFileMap, FullPreindexWordSegments segments) {
|
||||
|
||||
var iter = segments.iterator(RECORD_SIZE_LONGS);
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.index.construction.prio;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.index.construction.DocIdRewriter;
|
||||
import nu.marginalia.index.journal.IndexJournal;
|
||||
import nu.marginalia.index.journal.IndexJournalPage;
|
||||
@ -73,35 +72,47 @@ public class PrioIndexConstructor {
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private PrioPreindexReference construct(IndexJournalPage journalInstance) {
|
||||
return PrioPreindex
|
||||
.constructPreindex(journalInstance, docIdRewriter, tmpDir)
|
||||
.closeToReference();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private PrioPreindexReference merge(PrioPreindexReference leftR, PrioPreindexReference rightR) {
|
||||
|
||||
var left = leftR.open();
|
||||
var right = rightR.open();
|
||||
|
||||
try {
|
||||
return PrioPreindex.merge(tmpDir, left, right).closeToReference();
|
||||
return PrioPreindex
|
||||
.constructPreindex(journalInstance, docIdRewriter, tmpDir)
|
||||
.closeToReference();
|
||||
}
|
||||
finally {
|
||||
left.delete();
|
||||
right.delete();
|
||||
catch (IOException ex) {
|
||||
logger.error("Failed to construct preindex", ex);
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
private PrioPreindexReference merge(PrioPreindexReference leftR, PrioPreindexReference rightR) {
|
||||
try {
|
||||
var left = leftR.open();
|
||||
var right = rightR.open();
|
||||
|
||||
try {
|
||||
return PrioPreindex.merge(tmpDir, left, right).closeToReference();
|
||||
} finally {
|
||||
left.delete();
|
||||
right.delete();
|
||||
}
|
||||
}
|
||||
catch (IOException ex) {
|
||||
logger.error("Failed to merge preindex", ex);
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private void finalizeIndex(PrioPreindexReference finalPR) {
|
||||
var finalP = finalPR.open();
|
||||
finalP.finalizeIndex(outputFileDocs, outputFileWords);
|
||||
finalP.delete();
|
||||
try {
|
||||
var finalP = finalPR.open();
|
||||
finalP.finalizeIndex(outputFileDocs, outputFileWords);
|
||||
finalP.delete();
|
||||
}
|
||||
catch (IOException ex) {
|
||||
logger.error("Failed to finalize preindex", ex);
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.index.construction.prio;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import nu.marginalia.index.construction.DocIdRewriter;
|
||||
@ -97,7 +96,6 @@ public class PrioPreindexDocuments {
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private static void sortDocsFile(LongArray docsFileMap, PrioPreindexWordSegments segments) {
|
||||
|
||||
var iter = segments.iterator(RECORD_SIZE_LONGS);
|
||||
|
@ -7,7 +7,6 @@ import io.prometheus.client.Counter;
|
||||
import io.prometheus.client.Gauge;
|
||||
import io.prometheus.client.Histogram;
|
||||
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.api.searchquery.IndexApiGrpc;
|
||||
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
|
||||
import nu.marginalia.api.searchquery.RpcIndexQuery;
|
||||
@ -109,7 +108,6 @@ public class IndexGrpcService
|
||||
}
|
||||
|
||||
// GRPC endpoint
|
||||
@SneakyThrows
|
||||
public void query(RpcIndexQuery request,
|
||||
StreamObserver<RpcDecoratedResultItem> responseObserver) {
|
||||
|
||||
@ -157,9 +155,14 @@ public class IndexGrpcService
|
||||
|
||||
|
||||
// exists for test access
|
||||
@SneakyThrows
|
||||
List<RpcDecoratedResultItem> justQuery(SearchSpecification specsSet) {
|
||||
return executeSearch(new SearchParameters(specsSet, getSearchSet(specsSet)));
|
||||
try {
|
||||
return executeSearch(new SearchParameters(specsSet, getSearchSet(specsSet)));
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Error in handling request", ex);
|
||||
return List.of();
|
||||
}
|
||||
}
|
||||
|
||||
private SearchSet getSearchSet(SearchSpecification specsSet) {
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.ranking.domains.data;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||
@ -20,27 +19,32 @@ public class InvertedLinkGraphSource extends AbstractGraphSource {
|
||||
super(dataSource);
|
||||
this.graphClient = graphClient;
|
||||
}
|
||||
@SneakyThrows
|
||||
|
||||
@Override
|
||||
public Graph<Integer, ?> getGraph() {
|
||||
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
|
||||
try {
|
||||
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
|
||||
|
||||
addVertices(graph);
|
||||
addVertices(graph);
|
||||
|
||||
var allLinks = graphClient.getAllDomainLinks();
|
||||
var iter = allLinks.iterator();
|
||||
while (iter.advance()) {
|
||||
if (!graph.containsVertex(iter.dest())) {
|
||||
continue;
|
||||
}
|
||||
if (!graph.containsVertex(iter.source())) {
|
||||
continue;
|
||||
var allLinks = graphClient.getAllDomainLinks();
|
||||
var iter = allLinks.iterator();
|
||||
while (iter.advance()) {
|
||||
if (!graph.containsVertex(iter.dest())) {
|
||||
continue;
|
||||
}
|
||||
if (!graph.containsVertex(iter.source())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Invert the edge
|
||||
graph.addEdge(iter.dest(), iter.source());
|
||||
}
|
||||
|
||||
// Invert the edge
|
||||
graph.addEdge(iter.dest(), iter.source());
|
||||
return graph;
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
return graph;
|
||||
}
|
||||
}
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.ranking.domains.data;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||
@ -18,26 +17,31 @@ public class LinkGraphSource extends AbstractGraphSource {
|
||||
this.graphClient = graphClient;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public Graph<Integer, ?> getGraph() {
|
||||
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
|
||||
try {
|
||||
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
|
||||
|
||||
addVertices(graph);
|
||||
addVertices(graph);
|
||||
|
||||
var allLinks = graphClient.getAllDomainLinks();
|
||||
var iter = allLinks.iterator();
|
||||
while (iter.advance()) {
|
||||
if (!graph.containsVertex(iter.dest())) {
|
||||
continue;
|
||||
}
|
||||
if (!graph.containsVertex(iter.source())) {
|
||||
continue;
|
||||
var allLinks = graphClient.getAllDomainLinks();
|
||||
var iter = allLinks.iterator();
|
||||
while (iter.advance()) {
|
||||
if (!graph.containsVertex(iter.dest())) {
|
||||
continue;
|
||||
}
|
||||
if (!graph.containsVertex(iter.source())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
graph.addEdge(iter.source(), iter.dest());
|
||||
}
|
||||
|
||||
graph.addEdge(iter.source(), iter.dest());
|
||||
return graph;
|
||||
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
return graph;
|
||||
}
|
||||
}
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.ranking.domains.data;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.graph.DefaultUndirectedWeightedGraph;
|
||||
import org.jgrapht.graph.DefaultWeightedEdge;
|
||||
@ -35,14 +34,13 @@ public class SimilarityGraphSource extends AbstractGraphSource {
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public Graph<Integer, ?> getGraph() {
|
||||
Graph<Integer, ?> graph = new DefaultUndirectedWeightedGraph<>(DefaultWeightedEdge.class);
|
||||
|
||||
addVertices(graph);
|
||||
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
addVertices(graph);
|
||||
|
||||
try (var stmt = conn.prepareStatement("""
|
||||
SELECT DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS
|
||||
FROM EC_DOMAIN_NEIGHBORS_2
|
||||
@ -67,6 +65,9 @@ public class SimilarityGraphSource extends AbstractGraphSource {
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
return graph;
|
||||
}
|
||||
|
@ -311,7 +311,9 @@ public class CombinedIndexReaderTest {
|
||||
}
|
||||
|
||||
void load() throws IOException, SQLException, URISyntaxException {
|
||||
allData.forEach((doc, words) -> {
|
||||
for (Map.Entry<Long, List<MockDataKeyword>> entry : allData.entrySet()) {
|
||||
final Long doc = entry.getKey();
|
||||
final List<MockDataKeyword> words = entry.getValue();
|
||||
|
||||
var meta = metaByDoc.get(doc);
|
||||
|
||||
@ -320,7 +322,7 @@ public class CombinedIndexReaderTest {
|
||||
for (int i = 0; i < words.size(); i++) {
|
||||
metadata[i] = words.get(i).termMetadata;
|
||||
}
|
||||
var positions = words.stream().map(w -> w.positions).map(pos -> VarintCodedSequence.generate(pos.toIntArray())).toList();
|
||||
var positions = words.stream().map(w -> w.positions).map(pos -> VarintCodedSequence.generate(pos.toIntArray())).toList();
|
||||
|
||||
indexJournalWriter.put(doc,
|
||||
new SlopDocumentRecord.KeywordsProjection(
|
||||
@ -335,7 +337,7 @@ public class CombinedIndexReaderTest {
|
||||
new byte[0],
|
||||
List.of()
|
||||
));
|
||||
});
|
||||
}
|
||||
|
||||
var linkdbWriter = new DocumentDbWriter(
|
||||
IndexLocations.getLinkdbLivePath(fileStorageService).resolve(DOCDB_FILE_NAME)
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.index;
|
||||
|
||||
import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.IndexLocations;
|
||||
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
|
||||
@ -378,8 +377,7 @@ public class IndexQueryServiceIntegrationSmokeTest {
|
||||
return UrlIdCodec.encodeId((32 - (id % 32)), id);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void loadData(DocumentDbWriter ldbw, int id) {
|
||||
public void loadData(DocumentDbWriter ldbw, int id) throws Exception {
|
||||
int[] factors = IntStream
|
||||
.rangeClosed(1, id)
|
||||
.filter(v -> (id % v) == 0)
|
||||
@ -423,8 +421,7 @@ public class IndexQueryServiceIntegrationSmokeTest {
|
||||
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void loadDataWithDomain(DocumentDbWriter ldbw, int domain, int id) {
|
||||
public void loadDataWithDomain(DocumentDbWriter ldbw, int domain, int id) throws Exception {
|
||||
int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
|
||||
long fullId = UrlIdCodec.encodeId(domain, id);
|
||||
|
||||
|
@ -532,8 +532,9 @@ public class IndexQueryServiceIntegrationTest {
|
||||
}
|
||||
|
||||
void load() throws IOException, SQLException, URISyntaxException {
|
||||
allData.forEach((doc, words) -> {
|
||||
|
||||
for (Map.Entry<Long, List<MockDataKeyword>> entry : allData.entrySet()) {
|
||||
Long doc = entry.getKey();
|
||||
List<MockDataKeyword> words = entry.getValue();
|
||||
var meta = metaByDoc.get(doc);
|
||||
|
||||
List<String> keywords = words.stream().map(w -> w.keyword).toList();
|
||||
@ -561,7 +562,7 @@ public class IndexQueryServiceIntegrationTest {
|
||||
new byte[0],
|
||||
List.of()
|
||||
));
|
||||
});
|
||||
}
|
||||
|
||||
var linkdbWriter = new DocumentDbWriter(
|
||||
IndexLocations.getLinkdbLivePath(fileStorageService).resolve(DOCDB_FILE_NAME)
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.ranking.domains;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import nu.marginalia.ranking.domains.data.GraphSource;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
@ -43,14 +42,14 @@ public class TestGraphSourceForInvertedLinkData implements GraphSource {
|
||||
return idToName.get(id);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public Graph<Integer, ?> getGraph() {
|
||||
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
|
||||
idToName = new HashMap<>();
|
||||
|
||||
try (var stream = Files
|
||||
.lines(domainDataPath)) {
|
||||
.lines(domainDataPath))
|
||||
{
|
||||
|
||||
stream.skip(1)
|
||||
.mapMultiToInt((line, c) -> {
|
||||
@ -65,6 +64,9 @@ public class TestGraphSourceForInvertedLinkData implements GraphSource {
|
||||
})
|
||||
.forEach(graph::addVertex);
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
for (var path : linksDataPaths) {
|
||||
try (var data = LongArrayFactory.mmapForReadingConfined(path)) {
|
||||
@ -80,8 +82,12 @@ public class TestGraphSourceForInvertedLinkData implements GraphSource {
|
||||
}
|
||||
});
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return graph;
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.ranking.domains;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import nu.marginalia.ranking.domains.data.GraphSource;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
@ -44,7 +43,6 @@ public class TestGraphSourceForLinkData implements GraphSource {
|
||||
return idToName.get(id);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public Graph<Integer, ?> getGraph() {
|
||||
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
|
||||
@ -66,6 +64,9 @@ public class TestGraphSourceForLinkData implements GraphSource {
|
||||
})
|
||||
.forEach(graph::addVertex);
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
for (var path : linksDataPaths) {
|
||||
try (var data = LongArrayFactory.mmapForReadingConfined(path)) {
|
||||
@ -81,8 +82,12 @@ public class TestGraphSourceForLinkData implements GraphSource {
|
||||
}
|
||||
});
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return graph;
|
||||
}
|
||||
|
||||
|
@ -1,12 +1,12 @@
|
||||
package nu.marginalia.ranking.domains;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.ranking.domains.data.GraphSource;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.graph.DefaultUndirectedWeightedGraph;
|
||||
import org.jgrapht.graph.DefaultWeightedEdge;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
@ -33,7 +33,6 @@ public class TestGraphSourceForSimilarityData implements GraphSource {
|
||||
return idToName.get(id);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public Graph<Integer, ?> getGraph() {
|
||||
Graph<Integer, ?> graph = new DefaultUndirectedWeightedGraph<>(DefaultWeightedEdge.class);
|
||||
@ -55,6 +54,9 @@ public class TestGraphSourceForSimilarityData implements GraphSource {
|
||||
})
|
||||
.forEach(graph::addVertex);
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
try (var stream = Files
|
||||
.lines(similarityDataPath)) {
|
||||
@ -71,6 +73,9 @@ public class TestGraphSourceForSimilarityData implements GraphSource {
|
||||
}
|
||||
});
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
return graph;
|
||||
}
|
||||
|
@ -1,12 +1,12 @@
|
||||
package nu.marginalia.util;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.concurrent.*;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
@ -23,12 +23,16 @@ public class ProcessingIterator<T> implements Iterator<T> {
|
||||
|
||||
private T next = null;
|
||||
|
||||
@SneakyThrows
|
||||
ProcessingIterator(SimpleBlockingThreadPool pool, int queueSize, ProcessingJob<T> task) {
|
||||
queue = new LinkedBlockingQueue<>(queueSize);
|
||||
this.pool = pool;
|
||||
|
||||
pool.submit(() -> executeJob(task));
|
||||
try {
|
||||
pool.submit(() -> executeJob(task));
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.warn("Exception while processing", e);
|
||||
}
|
||||
}
|
||||
|
||||
public static Factory factory(int queueSize, int parallelism) {
|
||||
@ -45,15 +49,19 @@ public class ProcessingIterator<T> implements Iterator<T> {
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private void executeTask(Task<T> task) {
|
||||
pool.submit(() -> {
|
||||
try {
|
||||
queue.put(task.get());
|
||||
} catch (Exception e) {
|
||||
logger.warn("Exception while processing", e);
|
||||
}
|
||||
});
|
||||
try {
|
||||
pool.submit(() -> {
|
||||
try {
|
||||
queue.put(task.get());
|
||||
} catch (Exception e) {
|
||||
logger.warn("Exception while processing", e);
|
||||
}
|
||||
});
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.warn("Exception while processing", e);
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns true if there are more documents to be processed.
|
||||
@ -63,17 +71,21 @@ public class ProcessingIterator<T> implements Iterator<T> {
|
||||
* (or synchronize between the two)
|
||||
*/
|
||||
@Override
|
||||
@SneakyThrows
|
||||
public boolean hasNext() {
|
||||
if (next != null)
|
||||
return true;
|
||||
|
||||
do {
|
||||
next = queue.poll(50, TimeUnit.MILLISECONDS);
|
||||
if (next != null) {
|
||||
return true;
|
||||
}
|
||||
} while (expectMore());
|
||||
try {
|
||||
do {
|
||||
next = queue.poll(50, TimeUnit.MILLISECONDS);
|
||||
if (next != null) {
|
||||
return true;
|
||||
}
|
||||
} while (expectMore());
|
||||
}
|
||||
catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
@ -96,7 +108,6 @@ public class ProcessingIterator<T> implements Iterator<T> {
|
||||
* <p>
|
||||
* If this is run after hasNext() returns false, a NoSuchElementException is thrown.
|
||||
*/
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public T next() {
|
||||
if (!hasNext()) {
|
||||
|
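ProcessingIterator's hasNext and executeTask show the other common replacement in this commit: where the checked exception is an InterruptedException, it is not rethrown at all; the interrupt flag is restored and the method falls through to an empty result. A small sketch of that idiom, with a hypothetical helper name rather than code from this repository:

    import java.util.concurrent.BlockingQueue;
    import java.util.concurrent.TimeUnit;

    class InterruptHandlingSketch {
        // Poll a queue briefly; on interruption, restore the flag and report "nothing available"
        // instead of propagating the checked exception the way @SneakyThrows used to.
        static <T> T pollQuietly(BlockingQueue<T> queue) {
            try {
                return queue.poll(50, TimeUnit.MILLISECONDS);
            }
            catch (InterruptedException e) {
                Thread.currentThread().interrupt();
                return null;
            }
        }
    }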
@ -2,7 +2,6 @@ package nu.marginalia.language.filter;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.language.encoding.UnicodeRanges;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
@ -45,7 +44,6 @@ public class LanguageFilter {
|
||||
}
|
||||
|
||||
@Inject
|
||||
@SneakyThrows
|
||||
public LanguageFilter(LanguageModels lm) {
|
||||
try {
|
||||
languagePredictionModel1 = new UngaBungaLanguagePredictionModel();
|
||||
|
@ -1,12 +1,9 @@
|
||||
package nu.marginalia.language.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
@AllArgsConstructor @Getter
|
||||
public class WordRep implements Comparable<WordRep> {
|
||||
|
||||
public WordRep(DocumentSentence sent, WordSpan span) {
|
||||
@ -22,6 +19,13 @@ public class WordRep implements Comparable<WordRep> {
|
||||
public final String stemmed;
|
||||
private final int hashCode;
|
||||
|
||||
public WordRep(int length, String word, String stemmed, int hashCode) {
|
||||
this.length = length;
|
||||
this.word = word;
|
||||
this.stemmed = stemmed;
|
||||
this.hashCode = hashCode;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(@NotNull WordRep o) {
|
||||
return word.compareTo(o.word);
|
||||
@ -43,4 +47,20 @@ public class WordRep implements Comparable<WordRep> {
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public int getLength() {
|
||||
return this.length;
|
||||
}
|
||||
|
||||
public String getWord() {
|
||||
return this.word;
|
||||
}
|
||||
|
||||
public String getStemmed() {
|
||||
return this.stemmed;
|
||||
}
|
||||
|
||||
public int getHashCode() {
|
||||
return this.hashCode;
|
||||
}
|
||||
}
|
||||
|
@ -1,17 +1,20 @@
|
||||
package nu.marginalia.language.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
@AllArgsConstructor @EqualsAndHashCode
|
||||
public class WordSpan implements Comparable<WordSpan>{
|
||||
public class WordSpan implements Comparable<WordSpan> {
|
||||
public final int start;
|
||||
public final int end;
|
||||
|
||||
public WordSpan(int start, int end) {
|
||||
this.start = start;
|
||||
this.end = end;
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return end - start;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(@NotNull WordSpan o) {
|
||||
return start - o.start;
|
||||
@ -30,8 +33,7 @@ public class WordSpan implements Comparable<WordSpan>{
|
||||
}
|
||||
if (start < other.start) {
|
||||
return end - other.start;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
return other.end - start;
|
||||
}
|
||||
|
||||
@ -40,4 +42,26 @@ public class WordSpan implements Comparable<WordSpan>{
|
||||
public String toString() {
|
||||
return String.format("WordSpan[%s,%s]", start, end);
|
||||
}
|
||||
|
||||
public boolean equals(final Object o) {
|
||||
if (o == this) return true;
|
||||
if (!(o instanceof WordSpan)) return false;
|
||||
final WordSpan other = (WordSpan) o;
|
||||
if (!other.canEqual((Object) this)) return false;
|
||||
if (this.start != other.start) return false;
|
||||
if (this.end != other.end) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
protected boolean canEqual(final Object other) {
|
||||
return other instanceof WordSpan;
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
final int PRIME = 59;
|
||||
int result = 1;
|
||||
result = result * PRIME + this.start;
|
||||
result = result * PRIME + this.end;
|
||||
return result;
|
||||
}
|
||||
}
|
@ -2,7 +2,6 @@ package nu.marginalia.language.sentence;
|
||||
|
||||
import com.github.datquocnguyen.RDRPOSTagger;
|
||||
import com.google.inject.Inject;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.model.DocumentSentence;
|
||||
@ -46,7 +45,7 @@ public class SentenceExtractor {
|
||||
static final int MAX_SENTENCE_LENGTH = 250;
|
||||
static final int MAX_SENTENCE_COUNT = 1000;
|
||||
|
||||
@SneakyThrows @Inject
|
||||
@Inject
|
||||
public SentenceExtractor(LanguageModels models)
|
||||
{
|
||||
try (InputStream modelIn = new FileInputStream(models.openNLPSentenceDetectionData.toFile())) {
|
||||
|
@ -1,6 +1,9 @@
|
||||
package nu.marginalia.actor;
|
||||
|
||||
import nu.marginalia.actor.prototype.ActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorResumeBehavior;
|
||||
import nu.marginalia.actor.state.ActorStateInstance;
|
||||
import nu.marginalia.actor.state.ActorStateTransition;
|
||||
import nu.marginalia.mq.MessageQueueFactory;
|
||||
import nu.marginalia.mq.MqMessage;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
@ -8,7 +11,6 @@ import nu.marginalia.mq.inbox.MqInboxResponse;
|
||||
import nu.marginalia.mq.inbox.MqSubscription;
|
||||
import nu.marginalia.mq.inbox.MqSynchronousInbox;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.actor.state.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -305,33 +307,38 @@ public class ActorStateMachine {
|
||||
return state;
|
||||
}
|
||||
|
||||
public void abortExecution() throws Exception {
|
||||
// Create a fake message to abort the execution
|
||||
// This helps make sense of the queue when debugging
|
||||
// and also permits the real termination message to have a
|
||||
// unique expected ID
|
||||
public void abortExecution() {
|
||||
try {
|
||||
// Create a fake message to abort the execution
|
||||
// This helps make sense of the queue when debugging
|
||||
// and also permits the real termination message to have a
|
||||
// unique expected ID
|
||||
|
||||
long abortMsgId = smOutbox.sendNotice(expectedMessage.id, "ABORT", "Aborting execution");
|
||||
long abortMsgId = smOutbox.sendNotice(expectedMessage.id, "ABORT", "Aborting execution");
|
||||
|
||||
// Set it as dead to clean up the queue from mystery ACK messages
|
||||
smOutbox.flagAsDead(abortMsgId);
|
||||
// Set it as dead to clean up the queue from mystery ACK messages
|
||||
smOutbox.flagAsDead(abortMsgId);
|
||||
|
||||
// Set the expected message to the abort message,
|
||||
// technically there's a slight chance of a race condition here,
|
||||
// which will cause this message to be ERR'd and the process to
|
||||
// continue, but it's very unlikely and the worst that can happen
|
||||
// is you have to abort twice.
|
||||
// Set the expected message to the abort message,
|
||||
// technically there's a slight chance of a race condition here,
|
||||
// which will cause this message to be ERR'd and the process to
|
||||
// continue, but it's very unlikely and the worst that can happen
|
||||
// is you have to abort twice.
|
||||
|
||||
expectedMessage = ExpectedMessage.expectId(abortMsgId);
|
||||
expectedMessage = ExpectedMessage.expectId(abortMsgId);
|
||||
|
||||
// Add a state transition to the monitor state, causing it to reset the state machine to the initial state
|
||||
// (or if no monitor state is defined, set it to the final state)
|
||||
smOutbox.sendNotice(abortMsgId, finalState.name(), "");
|
||||
// Add a state transition to the monitor state, causing it to reset the state machine to the initial state
|
||||
// (or if no monitor state is defined, set it to the final state)
|
||||
smOutbox.sendNotice(abortMsgId, finalState.name(), "");
|
||||
|
||||
// Dislodge the current task with an interrupt.
|
||||
// It's actually fine if we accidentally interrupt the wrong thread
|
||||
// (i.e. the abort task), since it shouldn't be doing anything interruptible
|
||||
smInbox.abortCurrentTask();
|
||||
// Dislodge the current task with an interrupt.
|
||||
// It's actually fine if we accidentally interrupt the wrong thread
|
||||
// (i.e. the abort task), since it shouldn't be doing anything interruptible
|
||||
smInbox.abortCurrentTask();
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Failed to abort execution", e);
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns true if there is an INITIAL state that requires no parameters */
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.mq.inbox;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.mq.MqMessage;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.persistence.MqMessageHandlerRegistry;
|
||||
@ -67,8 +66,7 @@ public class MqSingleShotInbox {
|
||||
* @param predicate A predicate that must be true for the message to be stolen
|
||||
* @return The stolen message, or empty if no message was stolen
|
||||
*/
|
||||
@SneakyThrows
|
||||
public Optional<MqMessage> stealMessage(Predicate<MqMessage> predicate) {
|
||||
public Optional<MqMessage> stealMessage(Predicate<MqMessage> predicate) throws SQLException {
|
||||
for (var message : persistence.eavesdrop(inboxName, 5)) {
|
||||
if (predicate.test(message)) {
|
||||
persistence.changeOwner(message.msgId(), instanceUUID, -1);
|
||||
|
@ -1,8 +1,9 @@
|
||||
package nu.marginalia.term_frequency_dict;
|
||||
|
||||
import ca.rmen.porterstemmer.PorterStemmer;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
@ -11,9 +12,7 @@ import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import java.io.*;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
@ -29,13 +28,11 @@ public class TermFrequencyDict {
|
||||
public static final long DOC_COUNT_KEY = ~0L;
|
||||
|
||||
@Inject
|
||||
public TermFrequencyDict(@NotNull LanguageModels models) {
|
||||
public TermFrequencyDict(@NotNull LanguageModels models) throws IOException {
|
||||
this(models.termFrequencies);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public TermFrequencyDict(Path file) {
|
||||
|
||||
public TermFrequencyDict(Path file) throws IOException {
|
||||
wordRates = load(file);
|
||||
logger.info("Read {} N-grams frequencies", wordRates.size());
|
||||
}
|
||||
|
@ -31,8 +31,13 @@ public class DocumentKeywordExtractor {
|
||||
|
||||
// for tests
|
||||
public DocumentKeywordExtractor() {
|
||||
this.dict = new TermFrequencyDict(WmsaHome.getLanguageModels());
|
||||
this.keywordExtractor = new KeywordExtractor();
|
||||
try {
|
||||
this.dict = new TermFrequencyDict(WmsaHome.getLanguageModels());
|
||||
this.keywordExtractor = new KeywordExtractor();
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.keyword;
|
||||
|
||||
import lombok.Builder;
|
||||
import nu.marginalia.keyword.extractors.NameLikeKeywords;
|
||||
import nu.marginalia.keyword.extractors.SubjectLikeKeywords;
|
||||
import nu.marginalia.keyword.extractors.TitleKeywords;
|
||||
@ -14,19 +13,21 @@ class KeywordMetadata {
|
||||
private final SubjectLikeKeywords subjectLikeKeywords;
|
||||
private final UrlKeywords urlKeywords;
|
||||
|
||||
@Builder
|
||||
public KeywordMetadata(
|
||||
TitleKeywords titleKeywords,
|
||||
NameLikeKeywords nameLikeKeywords,
|
||||
SubjectLikeKeywords subjectLikeKeywords,
|
||||
UrlKeywords urlKeywords)
|
||||
{
|
||||
UrlKeywords urlKeywords) {
|
||||
this.titleKeywords = titleKeywords;
|
||||
this.nameLikeKeywords = nameLikeKeywords;
|
||||
this.subjectLikeKeywords = subjectLikeKeywords;
|
||||
this.urlKeywords = urlKeywords;
|
||||
}
|
||||
|
||||
public static KeywordMetadataBuilder builder() {
|
||||
return new KeywordMetadataBuilder();
|
||||
}
|
||||
|
||||
public byte getMetadataForWord(String stemmed) {
|
||||
|
||||
byte flags = 0;
|
||||
@ -54,4 +55,41 @@ class KeywordMetadata {
|
||||
return flags;
|
||||
}
|
||||
|
||||
public static class KeywordMetadataBuilder {
|
||||
private TitleKeywords titleKeywords;
|
||||
private NameLikeKeywords nameLikeKeywords;
|
||||
private SubjectLikeKeywords subjectLikeKeywords;
|
||||
private UrlKeywords urlKeywords;
|
||||
|
||||
KeywordMetadataBuilder() {
|
||||
}
|
||||
|
||||
public KeywordMetadataBuilder titleKeywords(TitleKeywords titleKeywords) {
|
||||
this.titleKeywords = titleKeywords;
|
||||
return this;
|
||||
}
|
||||
|
||||
public KeywordMetadataBuilder nameLikeKeywords(NameLikeKeywords nameLikeKeywords) {
|
||||
this.nameLikeKeywords = nameLikeKeywords;
|
||||
return this;
|
||||
}
|
||||
|
||||
public KeywordMetadataBuilder subjectLikeKeywords(SubjectLikeKeywords subjectLikeKeywords) {
|
||||
this.subjectLikeKeywords = subjectLikeKeywords;
|
||||
return this;
|
||||
}
|
||||
|
||||
public KeywordMetadataBuilder urlKeywords(UrlKeywords urlKeywords) {
|
||||
this.urlKeywords = urlKeywords;
|
||||
return this;
|
||||
}
|
||||
|
||||
public KeywordMetadata build() {
|
||||
return new KeywordMetadata(this.titleKeywords, this.nameLikeKeywords, this.subjectLikeKeywords, this.urlKeywords);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "KeywordMetadata.KeywordMetadataBuilder(titleKeywords=" + this.titleKeywords + ", nameLikeKeywords=" + this.nameLikeKeywords + ", subjectLikeKeywords=" + this.subjectLikeKeywords + ", urlKeywords=" + this.urlKeywords + ")";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
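The hand-written KeywordMetadataBuilder is meant to be call-compatible with the old Lombok-generated @Builder. A usage sketch, assuming it is invoked from the same package with the extractor types imported as above (the parameter names are placeholders):

    class KeywordMetadataUsageSketch {
        static KeywordMetadata assemble(TitleKeywords titleKeywords,
                                        NameLikeKeywords nameLikeKeywords,
                                        SubjectLikeKeywords subjectLikeKeywords,
                                        UrlKeywords urlKeywords) {
            // Same fluent call shape as before delombok.
            return KeywordMetadata.builder()
                    .titleKeywords(titleKeywords)
                    .nameLikeKeywords(nameLikeKeywords)
                    .subjectLikeKeywords(subjectLikeKeywords)
                    .urlKeywords(urlKeywords)
                    .build();
        }
    }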
@ -4,7 +4,6 @@ import gnu.trove.list.array.TByteArrayList;
|
||||
import it.unimi.dsi.fastutil.ints.IntArrayList;
|
||||
import it.unimi.dsi.fastutil.ints.IntList;
|
||||
import it.unimi.dsi.fastutil.objects.Object2ByteOpenHashMap;
|
||||
import lombok.Getter;
|
||||
import nu.marginalia.language.sentence.tag.HtmlTag;
|
||||
import nu.marginalia.model.idx.CodedWordSpan;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
@ -15,13 +14,14 @@ import org.slf4j.LoggerFactory;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.*;
|
||||
|
||||
@Getter
|
||||
public class DocumentKeywordsBuilder {
|
||||
public final Object2ByteOpenHashMap<String> wordToMeta;
|
||||
public final HashMap<String, IntList> wordToPos;
|
||||
public final Map<Character, List<DocumentWordSpan>> wordSpans = new HashMap<>();
|
||||
|
||||
/** These were keywords that had signals of high relevance */
|
||||
/**
|
||||
* These were keywords that had signals of high relevance
|
||||
*/
|
||||
public final Set<String> importantWords = new HashSet<>();
|
||||
|
||||
// |------64 letters is this long-------------------------------|
|
||||
@ -64,7 +64,7 @@ public class DocumentKeywordsBuilder {
|
||||
wordSpans.forEach((tag, spansForTag) -> {
|
||||
spansForTag.sort(Comparator.comparingInt(DocumentWordSpan::start));
|
||||
|
||||
var positionsForTag = new IntArrayList(spansForTag.size()*2);
|
||||
var positionsForTag = new IntArrayList(spansForTag.size() * 2);
|
||||
for (var span : spansForTag) {
|
||||
positionsForTag.add(span.start());
|
||||
positionsForTag.add(span.end());
|
||||
@ -77,7 +77,7 @@ public class DocumentKeywordsBuilder {
|
||||
}
|
||||
|
||||
public DocumentKeywordsBuilder(int capacity) {
|
||||
wordToMeta = new Object2ByteOpenHashMap<>(capacity);
|
||||
wordToMeta = new Object2ByteOpenHashMap<>(capacity);
|
||||
wordToPos = new HashMap<>(capacity);
|
||||
}
|
||||
|
||||
@ -101,7 +101,7 @@ public class DocumentKeywordsBuilder {
|
||||
|
||||
public void setFlagOnMetadataForWords(WordFlags flag, Collection<String> flagWords) {
|
||||
flagWords.forEach(word ->
|
||||
wordToMeta.mergeByte(word, flag.asBit(), (a, b) -> (byte)(a|b))
|
||||
wordToMeta.mergeByte(word, flag.asBit(), (a, b) -> (byte) (a | b))
|
||||
);
|
||||
}
|
||||
|
||||
@ -116,7 +116,7 @@ public class DocumentKeywordsBuilder {
|
||||
public List<String> getWordsWithAnyFlag(long flags) {
|
||||
List<String> ret = new ArrayList<>();
|
||||
|
||||
for (var iter = wordToMeta.object2ByteEntrySet().fastIterator(); iter.hasNext();) {
|
||||
for (var iter = wordToMeta.object2ByteEntrySet().fastIterator(); iter.hasNext(); ) {
|
||||
var entry = iter.next();
|
||||
if ((flags & entry.getByteValue()) != 0) {
|
||||
ret.add(entry.getKey());
|
||||
@ -159,6 +159,30 @@ public class DocumentKeywordsBuilder {
|
||||
return sb.append(']').toString();
|
||||
}
|
||||
|
||||
public Object2ByteOpenHashMap<String> getWordToMeta() {
|
||||
return this.wordToMeta;
|
||||
}
|
||||
|
||||
public HashMap<String, IntList> getWordToPos() {
|
||||
return this.wordToPos;
|
||||
}
|
||||
|
||||
public Map<Character, List<DocumentWordSpan>> getWordSpans() {
|
||||
return this.wordSpans;
|
||||
}
|
||||
|
||||
public Set<String> getImportantWords() {
|
||||
return this.importantWords;
|
||||
}
|
||||
|
||||
public int getMAX_WORD_LENGTH() {
|
||||
return this.MAX_WORD_LENGTH;
|
||||
}
|
||||
|
||||
public int getMAX_POSITIONS_PER_WORD() {
|
||||
return this.MAX_POSITIONS_PER_WORD;
|
||||
}
|
||||
|
||||
public record DocumentWordSpan(HtmlTag tag, int start, int end) {
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.keyword;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
@ -13,6 +12,7 @@ import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Files;
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
@ -25,8 +25,7 @@ class SentenceExtractorTest {
|
||||
|
||||
static SentenceExtractor se = new SentenceExtractor(lm);
|
||||
|
||||
@SneakyThrows
|
||||
public static void main(String... args) throws IOException {
|
||||
public static void main(String... args) throws IOException, URISyntaxException {
|
||||
final LanguageModels lm = TestLanguageModels.getLanguageModels();
|
||||
|
||||
var data = WmsaHome.getHomePath().resolve("test-data/");
|
||||
|
@ -1,7 +1,6 @@
|
||||
package nu.marginalia.keyword.extractors;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
|
||||
import nu.marginalia.keyword.KeywordExtractor;
|
||||
@ -10,13 +9,14 @@ import nu.marginalia.test.util.TestLanguageModels;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.Collections;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
class NameLikeKeywordsTest {
|
||||
String text = """
|
||||
@ -58,8 +58,7 @@ class NameLikeKeywordsTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testWikiArticle() {
|
||||
public void testWikiArticle() throws IOException {
|
||||
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/java.html"),
|
||||
"Could not load word frequency table");
|
||||
String html = new String(resource.readAllBytes(), Charset.defaultCharset());
|
||||
@ -75,7 +74,6 @@ class NameLikeKeywordsTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testWikiArticleP1() {
|
||||
String html = """
|
||||
<p><b>Java</b> is a high-level, class-based, object-oriented programming language that is designed to have as few implementation dependencies as possible. It is a general-purpose programming language intended to let programmers <i>write once, run anywhere</i> (WORA), meaning that compiled Java code can run on all platforms that support Java without the need to recompile. Java applications are typically compiled to bytecode that can run on any Java virtual machine (JVM) regardless of the underlying computer architecture. The syntax of Java is similar to C and C++, but has fewer low-level facilities than either of them. The Java runtime provides dynamic capabilities (such as reflection and runtime code modification) that are typically not available in traditional compiled languages. As of 2019 , Java was one of the most popular programming languages in use according to GitHub, particularly for client–server web applications, with a reported 9 million developers.</p>
|
||||
|
@ -7,6 +7,7 @@ import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
import nu.marginalia.test.util.TestLanguageModels;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
@ -41,7 +42,7 @@ class SubjectLikeKeywordsTest {
|
||||
""";
|
||||
|
||||
@Test
|
||||
public void test() {
|
||||
public void test() throws IOException {
|
||||
var lm = TestLanguageModels.getLanguageModels();
|
||||
var dict = new TermFrequencyDict(lm);
|
||||
|
||||
|
@ -1,15 +1,12 @@
|
||||
package nu.marginalia.converting.model;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
import nu.marginalia.model.crawl.UrlIndexingState;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.UrlIndexingState;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.util.OptionalDouble;
|
||||
|
||||
@ToString @Getter
|
||||
public class ProcessedDocument {
|
||||
public EdgeUrl url;
|
||||
|
||||
@ -41,4 +38,30 @@ public class ProcessedDocument {
|
||||
}
|
||||
return OptionalDouble.empty();
|
||||
}
|
||||
|
||||
public EdgeUrl getUrl() {
|
||||
return this.url;
|
||||
}
|
||||
|
||||
@Nullable
|
||||
public ProcessedDocumentDetails getDetails() {
|
||||
return this.details;
|
||||
}
|
||||
|
||||
@Nullable
|
||||
public DocumentKeywordsBuilder getWords() {
|
||||
return this.words;
|
||||
}
|
||||
|
||||
public UrlIndexingState getState() {
|
||||
return this.state;
|
||||
}
|
||||
|
||||
public String getStateReason() {
|
||||
return this.stateReason;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "ProcessedDocument(url=" + this.getUrl() + ", details=" + this.getDetails() + ", words=" + this.getWords() + ", state=" + this.getState() + ", stateReason=" + this.getStateReason() + ")";
|
||||
}
|
||||
}
|
||||
|
@ -1,16 +1,14 @@
|
||||
package nu.marginalia.converting.model;
|
||||
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.model.idx.DocumentMetadata;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
@ToString
|
||||
public class ProcessedDocumentDetails {
|
||||
public String title;
|
||||
public String description;
|
||||
@ -31,4 +29,8 @@ public class ProcessedDocumentDetails {
|
||||
|
||||
public DocumentMetadata metadata;
|
||||
public GeneratorType generator;
|
||||
|
||||
public String toString() {
|
||||
return "ProcessedDocumentDetails(title=" + this.title + ", description=" + this.description + ", pubYear=" + this.pubYear + ", length=" + this.length + ", quality=" + this.quality + ", hashCode=" + this.hashCode + ", features=" + this.features + ", standard=" + this.standard + ", linksInternal=" + this.linksInternal + ", linksExternal=" + this.linksExternal + ", feedLinks=" + this.feedLinks + ", metadata=" + this.metadata + ", generator=" + this.generator + ")";
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.converting.model;
|
||||
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.converting.writer.ConverterBatchWritableIf;
|
||||
import nu.marginalia.converting.writer.ConverterBatchWriter;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
@ -11,8 +10,7 @@ import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
@ToString
|
||||
public class ProcessedDomain implements ConverterBatchWritableIf {
|
||||
public class ProcessedDomain implements ConverterBatchWritableIf {
|
||||
public EdgeDomain domain;
|
||||
|
||||
public List<ProcessedDocument> documents;
|
||||
@ -21,8 +19,10 @@ public class ProcessedDomain implements ConverterBatchWritableIf {
|
||||
public String ip;
|
||||
|
||||
|
||||
/** Used by the sideloader to give advice on how many documents are crawled
|
||||
* without actually having to count (which would take forever) */
|
||||
/**
|
||||
* Used by the sideloader to give advice on how many documents are crawled
|
||||
* without actually having to count (which would take forever)
|
||||
*/
|
||||
@Nullable
|
||||
public Integer sizeloadSizeAdvice;
|
||||
|
||||
@ -41,5 +41,10 @@ public class ProcessedDomain implements ConverterBatchWritableIf {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {}
|
||||
public void close() {
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "ProcessedDomain(domain=" + this.domain + ", documents=" + this.documents + ", state=" + this.state + ", redirect=" + this.redirect + ", ip=" + this.ip + ", sizeloadSizeAdvice=" + this.sizeloadSizeAdvice + ")";
|
||||
}
|
||||
}
|
||||
|
@ -1,7 +1,6 @@
|
||||
package nu.marginalia.converting.processor;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.atags.AnchorTextKeywords;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.atags.source.AnchorTagsSource;
|
||||
@ -164,57 +163,62 @@ public class DomainProcessor {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Nullable
|
||||
public ProcessedDomain fullProcessing(SerializableCrawlDataStream dataStream) {
|
||||
if (!dataStream.hasNext()) {
|
||||
return null;
|
||||
}
|
||||
try {
|
||||
if (!dataStream.hasNext()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
List<ProcessedDocument> docs = new ArrayList<>();
|
||||
Set<String> processedUrls = new HashSet<>();
|
||||
List<ProcessedDocument> docs = new ArrayList<>();
|
||||
Set<String> processedUrls = new HashSet<>();
|
||||
|
||||
if (!(dataStream.next() instanceof CrawledDomain crawledDomain)) {
|
||||
throw new IllegalStateException("First record must be a domain, was " + dataStream.next().getClass().getSimpleName());
|
||||
}
|
||||
if (!(dataStream.next() instanceof CrawledDomain crawledDomain)) {
|
||||
throw new IllegalStateException("First record must be a domain, was " + dataStream.next().getClass().getSimpleName());
|
||||
}
|
||||
|
||||
DomainLinks externalDomainLinks = anchorTagsSource.getAnchorTags(crawledDomain.getDomain());
|
||||
DocumentDecorator documentDecorator = new DocumentDecorator();
|
||||
DomainLinks externalDomainLinks = anchorTagsSource.getAnchorTags(crawledDomain.getDomain());
|
||||
DocumentDecorator documentDecorator = new DocumentDecorator();
|
||||
|
||||
// Process Domain Record
|
||||
// Process Domain Record
|
||||
|
||||
ProcessedDomain ret = new ProcessedDomain();
|
||||
processDomain(crawledDomain, ret, documentDecorator);
|
||||
ret.documents = docs;
|
||||
ProcessedDomain ret = new ProcessedDomain();
|
||||
processDomain(crawledDomain, ret, documentDecorator);
|
||||
ret.documents = docs;
|
||||
|
||||
// Process Documents
|
||||
// Process Documents
|
||||
|
||||
try (var deduplicator = new LshDocumentDeduplicator()) {
|
||||
while (dataStream.hasNext()) {
|
||||
if (!(dataStream.next() instanceof CrawledDocument doc))
|
||||
continue;
|
||||
if (doc.url == null)
|
||||
continue;
|
||||
if (doc.documentBody.isBlank())
|
||||
continue;
|
||||
if (!processedUrls.add(doc.url))
|
||||
continue;
|
||||
try (var deduplicator = new LshDocumentDeduplicator()) {
|
||||
while (dataStream.hasNext()) {
|
||||
if (!(dataStream.next() instanceof CrawledDocument doc))
|
||||
continue;
|
||||
if (doc.url == null)
|
||||
continue;
|
||||
if (doc.documentBody.isBlank())
|
||||
continue;
|
||||
if (!processedUrls.add(doc.url))
|
||||
continue;
|
||||
|
||||
try {
|
||||
var processedDoc = documentProcessor.process(doc, ret.domain, externalDomainLinks, documentDecorator);
|
||||
deduplicator.markIfDuplicate(processedDoc);
|
||||
docs.add(processedDoc);
|
||||
} catch (Exception ex) {
|
||||
logger.warn("Failed to process " + doc.url, ex);
|
||||
try {
|
||||
var processedDoc = documentProcessor.process(doc, ret.domain, externalDomainLinks, documentDecorator);
|
||||
deduplicator.markIfDuplicate(processedDoc);
|
||||
docs.add(processedDoc);
|
||||
} catch (Exception ex) {
|
||||
logger.warn("Failed to process " + doc.url, ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Add late keywords and features from domain-level information
|
||||
|
||||
calculateStatistics(ret, externalDomainLinks);
|
||||
|
||||
return ret;
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.warn("Failed to process domain", ex);
|
||||
return null;
|
||||
}
|
||||
|
||||
// Add late keywords and features from domain-level information
|
||||
|
||||
calculateStatistics(ret, externalDomainLinks);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
private void processDomain(CrawledDomain crawledDomain,
|
||||
|
@ -1,13 +1,11 @@
|
||||
package nu.marginalia.converting.processor.logic.links;
|
||||
|
||||
import lombok.Getter;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.function.BiConsumer;
|
||||
|
||||
@Getter
|
||||
public class LinkGraph {
|
||||
private final Map<EdgeUrl, Set<EdgeUrl>> graph = new HashMap<>(1000);
|
||||
|
||||
|
@ -4,7 +4,6 @@ import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
import com.google.gson.JsonSyntaxException;
|
||||
import com.google.gson.annotations.SerializedName;
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.converting.model.DocumentHeaders;
|
||||
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
||||
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
||||
@ -77,7 +76,6 @@ class JsonModel {
|
||||
List<JsonModelGraphItem> graph;
|
||||
}
|
||||
|
||||
@ToString
|
||||
class JsonModelGraphItem {
|
||||
@SerializedName("@type")
|
||||
public String type;
|
||||
@ -88,5 +86,9 @@ class JsonModelGraphItem {
|
||||
return "NewsArticle".equalsIgnoreCase(type)
|
||||
|| "Article".equalsIgnoreCase(type);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "JsonModelGraphItem(type=" + this.type + ", datePublished=" + this.datePublished + ")";
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,20 +1,62 @@
|
||||
package nu.marginalia.converting.sideload.dirtree;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.Setter;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@Setter
|
||||
@Getter
|
||||
class DirtreeSideloadSpec {
|
||||
public String name;
|
||||
public String domainName;
|
||||
public String dir;
|
||||
public String baseUrl;
|
||||
public List<String> keywords;
|
||||
|
||||
public DirtreeSideloadSpec(String name, String domainName, String dir, String baseUrl, List<String> keywords) {
|
||||
this.name = name;
|
||||
this.domainName = domainName;
|
||||
this.dir = dir;
|
||||
this.baseUrl = baseUrl;
|
||||
this.keywords = keywords;
|
||||
}
|
||||
|
||||
public DirtreeSideloadSpec() {
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return this.name;
|
||||
}
|
||||
|
||||
public String getDomainName() {
|
||||
return this.domainName;
|
||||
}
|
||||
|
||||
public String getDir() {
|
||||
return this.dir;
|
||||
}
|
||||
|
||||
public String getBaseUrl() {
|
||||
return this.baseUrl;
|
||||
}
|
||||
|
||||
public List<String> getKeywords() {
|
||||
return this.keywords;
|
||||
}
|
||||
|
||||
public void setName(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public void setDomainName(String domainName) {
|
||||
this.domainName = domainName;
|
||||
}
|
||||
|
||||
public void setDir(String dir) {
|
||||
this.dir = dir;
|
||||
}
|
||||
|
||||
public void setBaseUrl(String baseUrl) {
|
||||
this.baseUrl = baseUrl;
|
||||
}
|
||||
|
||||
public void setKeywords(List<String> keywords) {
|
||||
this.keywords = keywords;
|
||||
}
|
||||
}
|
||||
|
@ -1,14 +1,22 @@
|
||||
package nu.marginalia.converting.sideload.dirtree;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.Setter;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@AllArgsConstructor @NoArgsConstructor
|
||||
@Setter @Getter
|
||||
class DirtreeSideloadSpecList {
|
||||
public List<DirtreeSideloadSpec> sources;
|
||||
|
||||
public DirtreeSideloadSpecList(List<DirtreeSideloadSpec> sources) {
|
||||
this.sources = sources;
|
||||
}
|
||||
|
||||
public DirtreeSideloadSpecList() {
|
||||
}
|
||||
|
||||
public List<DirtreeSideloadSpec> getSources() {
|
||||
return this.sources;
|
||||
}
|
||||
|
||||
public void setSources(List<DirtreeSideloadSpec> sources) {
|
||||
this.sources = sources;
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.converting.sideload.dirtree;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.converting.model.GeneratorType;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
@ -13,6 +12,7 @@ import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.time.LocalDate;
|
||||
@ -72,24 +72,28 @@ public class DirtreeSideloader implements SideloadSource, AutoCloseable {
|
||||
return name.endsWith(".html") || name.endsWith(".htm");
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private ProcessedDocument process(Path path) {
|
||||
String body = Files.readString(path);
|
||||
String url = urlBase + dirBase.relativize(path);
|
||||
try {
|
||||
String body = Files.readString(path);
|
||||
String url = urlBase + dirBase.relativize(path);

// We trim "/index.html"-suffixes from the index if they are present,
// since this is typically an artifact from document retrieval
if (url.endsWith("/index.html")) {
url = url.substring(0, url.length() - "index.html".length());
// We trim "/index.html"-suffixes from the index if they are present,
// since this is typically an artifact from document retrieval
if (url.endsWith("/index.html")) {
url = url.substring(0, url.length() - "index.html".length());
}

return sideloaderProcessing
.processDocument(url, body, extraKeywords, new DomainLinks(),
GeneratorType.DOCS,
DocumentClass.NORMAL,
new LinkTexts(),
LocalDate.now().getYear(),
10_000);
}
catch (IOException | URISyntaxException e) {
throw new RuntimeException(e);
}

return sideloaderProcessing
.processDocument(url, body, extraKeywords, new DomainLinks(),
GeneratorType.DOCS,
DocumentClass.NORMAL,
new LinkTexts(),
LocalDate.now().getYear(),
10_000);
}

@Override

@ -3,7 +3,6 @@ package nu.marginalia.converting.sideload.encyclopedia;
import com.github.luben.zstd.ZstdInputStream;
import com.google.common.base.Charsets;
import com.google.gson.Gson;
import lombok.SneakyThrows;
import nu.marginalia.atags.AnchorTextKeywords;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
@ -78,7 +77,6 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
return ret;
}

@SneakyThrows
@Override
public Iterator<ProcessedDocument> getDocumentsStream() {
// This leaks a thread pool, but it doesn't matter since this is a one-off process

@ -1,6 +1,5 @@
package nu.marginalia.converting.sideload.stackexchange;

import lombok.SneakyThrows;
import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDocumentDetails;
@ -39,15 +38,19 @@ public class StackexchangeSideloader implements SideloadSource {

private final Path dbFile;

@SneakyThrows
public StackexchangeSideloader(Path pathToDbFile,
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
DocumentKeywordExtractor keywordExtractor
) {
this.dbFile = pathToDbFile;
this.domainName = StackExchangePostsDb.getDomainName(pathToDbFile);
this.sentenceExtractorProvider = sentenceExtractorProvider;
this.keywordExtractor = keywordExtractor;
try {
this.dbFile = pathToDbFile;
this.domainName = StackExchangePostsDb.getDomainName(pathToDbFile);
this.sentenceExtractorProvider = sentenceExtractorProvider;
this.keywordExtractor = keywordExtractor;
}
catch (Exception e) {
throw new RuntimeException(e);
}
}

@Override
@ -80,12 +83,16 @@ public class StackexchangeSideloader implements SideloadSource {

ProcessedDocument nextModel = null;

@SneakyThrows
@Override
public boolean hasNext() {
if (nextModel != null)
return true;
nextModel = postsReader.next();
try {
nextModel = postsReader.next();
}
catch (InterruptedException e) {
Thread.currentThread().interrupt();
}

return nextModel != null;
}
@ -103,7 +110,6 @@ public class StackexchangeSideloader implements SideloadSource {
};
}

@SneakyThrows
private ProcessedDocument convert(StackExchangePostsDb.CombinedPostModel post) {
String fullUrl = "https://" + domainName + "/questions/" + post.threadId();

@ -163,7 +169,7 @@ public class StackexchangeSideloader implements SideloadSource {
ret.stateReason = "SIDELOAD";
}
catch (Exception e) {
ret.url = new EdgeUrl(fullUrl);
ret.url = EdgeUrl.parse(fullUrl).orElseThrow();
ret.state = UrlIndexingState.DISQUALIFIED;
ret.stateReason = "SIDELOAD";
}
@ -186,9 +192,14 @@ public class StackexchangeSideloader implements SideloadSource {
}
}

@SneakyThrows
private boolean enqueue(StackExchangePostsDb.CombinedPostModel model) {
pool.submit(() -> results.put(convert(model)));
try {
pool.submit(() -> results.put(convert(model)));
}
catch (InterruptedException e) {
Thread.currentThread().interrupt();
return false;
}

return true;
}

@ -1,6 +1,5 @@
package nu.marginalia.converting.sideload.warc;

import lombok.SneakyThrows;
import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.contenttype.ContentTypeParser;
import nu.marginalia.contenttype.DocumentBodyToString;
@ -38,17 +37,20 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
private final EdgeDomain domain;


@SneakyThrows
public WarcSideloader(Path warcFile,
SideloaderProcessing sideloaderProcessing)
{
this.sideloaderProcessing = sideloaderProcessing;
this.reader = new WarcReader(warcFile);
this.domain = sniffDomainFromWarc()
.orElseThrow(() -> new IOException("Could not identify domain from warc file"));
try {
this.sideloaderProcessing = sideloaderProcessing;
this.reader = new WarcReader(warcFile);
this.domain = sniffDomainFromWarc()
.orElseThrow(() -> new IOException("Could not identify domain from warc file"));
}
catch (IOException e) {
throw new RuntimeException(e);
}
}

@SneakyThrows
@Override
public ProcessedDomain getDomain() {
var ret = new ProcessedDomain();
@ -81,7 +83,6 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
return Optional.empty();
}

@SneakyThrows
@Override
public Iterator<ProcessedDocument> getDocumentsStream() {
return reader.records()
@ -111,13 +112,12 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {

return true;
} catch (Exception e) {
e.printStackTrace();
logger.warn("Failed to process response", e);
}

return false;
}

@SneakyThrows
private Optional<ProcessedDocument> process(WarcResponse response) {
Optional<String> body = getBody(response);
String url = response.target();
@ -132,33 +132,46 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
return Optional.empty();
}

return Optional.of(sideloaderProcessing
.processDocument(url,
body.get(),
List.of(),
new DomainLinks(),
GeneratorType.DOCS,
DocumentClass.SIDELOAD,
new LinkTexts(),
LocalDate.now().getYear(), // TODO: This should be the actual year of the document
10_000));
try {
return Optional.of(sideloaderProcessing
.processDocument(url,
body.get(),
List.of(),
new DomainLinks(),
GeneratorType.DOCS,
DocumentClass.SIDELOAD,
new LinkTexts(),
LocalDate.now().getYear(), // TODO: This should be the actual year of the document
10_000));
}
catch (Exception e) {
logger.warn("Failed to process document", e);
return Optional.empty();
}
}

@SneakyThrows
private Optional<String> getBody(WarcResponse response) {
var http = response.http();

// TODO: We should support additional encodings here
try (var body = http.body()) {
String contentType = http.headers().first("Content-Type").orElse(null);
byte[] bytes = body.stream().readAllBytes();
try {
var http = response.http();

var ct = ContentTypeParser.parseContentType(contentType, bytes);
return Optional.of(DocumentBodyToString.getStringData(ct, bytes));

// TODO: We should support additional encodings here
try (var body = http.body()) {
String contentType = http.headers().first("Content-Type").orElse(null);
byte[] bytes = body.stream().readAllBytes();

var ct = ContentTypeParser.parseContentType(contentType, bytes);
return Optional.of(DocumentBodyToString.getStringData(ct, bytes));
}
catch (Exception ex) {
logger.info("Failed to parse body", ex);
}
}
catch (Exception ex) {
logger.info("Failed to parse body", ex);
catch (Exception e) {
logger.warn("Failed to process response", e);
}

return Optional.empty();
}


@ -1,6 +1,5 @@
package nu.marginalia.converting.writer;

import lombok.SneakyThrows;
import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.sideload.SideloadSource;
@ -61,7 +60,6 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
}

@Override
@SneakyThrows
public void writeProcessedDomain(ProcessedDomain domain) {
try {
if (domain.documents != null) {

@ -1,6 +1,5 @@
package nu.marginalia.converting.writer;

import lombok.SneakyThrows;
import nu.marginalia.worklog.BatchingWorkLog;
import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger;
@ -40,48 +39,55 @@ public class ConverterWriter implements AutoCloseable {
workerThread.start();
}

@SneakyThrows
public void accept(@Nullable ConverterBatchWritableIf domain) {
if (null == domain)
return;

domainData.put(domain);
}

@SneakyThrows
private void writerThread() {
IntervalAction switcher = new IntervalAction(this::switchBatch, switchInterval);

currentWriter = new ConverterBatchWriter(basePath, workLog.getBatchNumber());

while (running || !domainData.isEmpty()) {
// poll with a timeout so we have an
// opportunity to check the running condition
// ... we could interrupt the thread as well, but
// as we enter third party code it's difficult to guarantee it will deal
// well with being interrupted
var data = domainData.poll(1, TimeUnit.SECONDS);

if (data == null)
continue;

String id = data.id();

if (workLog.isItemCommitted(id) || workLog.isItemInCurrentBatch(id)) {
logger.warn("Skipping already logged item {}", id);
data.close();
continue;
}

currentWriter.write(data);

workLog.logItem(id);

switcher.tick();
try {
domainData.put(domain);
}
catch (InterruptedException e) {
throw new RuntimeException(e);
}
}

private void writerThread() {
try {
IntervalAction switcher = new IntervalAction(this::switchBatch, switchInterval);

currentWriter = new ConverterBatchWriter(basePath, workLog.getBatchNumber());

while (running || !domainData.isEmpty()) {
// poll with a timeout so we have an
// opportunity to check the running condition
// ... we could interrupt the thread as well, but
// as we enter third party code it's difficult to guarantee it will deal
// well with being interrupted
var data = domainData.poll(1, TimeUnit.SECONDS);

if (data == null)
continue;

String id = data.id();

if (workLog.isItemCommitted(id) || workLog.isItemInCurrentBatch(id)) {
logger.warn("Skipping already logged item {}", id);
data.close();
continue;
}

currentWriter.write(data);

workLog.logItem(id);

switcher.tick();
}
}
catch (Exception ex) {
logger.error("Writer thread failed", ex);
}
}

@SneakyThrows
public boolean switchBatch() {
if (workLog.isCurrentBatchEmpty()) {
// Nothing to commit
@ -89,13 +95,18 @@ public class ConverterWriter implements AutoCloseable {
}


// order matters here
currentWriter.close();
workLog.logFinishedBatch();
logger.info("Switching to batch {}", workLog.getBatchNumber());
currentWriter = new ConverterBatchWriter(basePath, workLog.getBatchNumber());
try {
// order matters here
currentWriter.close();
workLog.logFinishedBatch();
logger.info("Switching to batch {}", workLog.getBatchNumber());
currentWriter = new ConverterBatchWriter(basePath, workLog.getBatchNumber());

return true;
return true;
}
catch (Exception ex) {
throw new RuntimeException(ex);
}
}

@Override

@ -1,7 +1,6 @@
package nu.marginalia.integration.reddit.db;

import com.google.common.base.Strings;
import lombok.SneakyThrows;
import nu.marginalia.integration.reddit.RedditEntryReader;
import nu.marginalia.integration.reddit.model.ProcessableRedditComment;
import nu.marginalia.integration.reddit.model.ProcessableRedditSubmission;
@ -175,28 +174,35 @@ public class RedditDb {
stmt.close();
}

@SneakyThrows
@Override
public boolean hasNext() {
if (hasNext != null)
return hasNext;

hasNext = resultSet.next();
try {
hasNext = resultSet.next();
}
catch (SQLException ex) {
throw new RuntimeException(ex);
}

return hasNext;
}

abstract T nextFromResultSet(ResultSet resultSet) throws SQLException;

@SneakyThrows
@Override
public T next() {
if (!hasNext())
throw new IllegalStateException();
else hasNext = null;

return nextFromResultSet(resultSet);

try {
return nextFromResultSet(resultSet);
}
catch (SQLException ex) {
throw new RuntimeException(ex);
}
}
}


@ -1,12 +1,9 @@
package nu.marginalia.integration.reddit.model;

import lombok.AllArgsConstructor;
import lombok.ToString;

/** A projection of a Reddit comment joined with its top level submission
* that is ready for processing. */
@AllArgsConstructor
@ToString
/**
* A projection of a Reddit comment joined with its top level submission
* that is ready for processing.
*/
public class ProcessableRedditComment {
public String subreddit;
public String name;
@ -16,4 +13,19 @@ public class ProcessableRedditComment {
public int created_utc;
public String permalink;
public int score;

public ProcessableRedditComment(String subreddit, String name, String author, String title, String body, int created_utc, String permalink, int score) {
this.subreddit = subreddit;
this.name = name;
this.author = author;
this.title = title;
this.body = body;
this.created_utc = created_utc;
this.permalink = permalink;
this.score = score;
}

public String toString() {
return "ProcessableRedditComment(subreddit=" + this.subreddit + ", name=" + this.name + ", author=" + this.author + ", title=" + this.title + ", body=" + this.body + ", created_utc=" + this.created_utc + ", permalink=" + this.permalink + ", score=" + this.score + ")";
}
}

@ -1,10 +1,8 @@
package nu.marginalia.integration.reddit.model;

import lombok.AllArgsConstructor;
import lombok.ToString;

/** A projection of a Reddit top level submission that is appropriate for processing. */
@AllArgsConstructor @ToString
/**
* A projection of a Reddit top level submission that is appropriate for processing.
*/
public class ProcessableRedditSubmission {
public String subreddit;
public String name;
@ -14,4 +12,19 @@ public class ProcessableRedditSubmission {
public int created_utc;
public String permalink;
public int score;

public ProcessableRedditSubmission(String subreddit, String name, String author, String title, String selftext, int created_utc, String permalink, int score) {
this.subreddit = subreddit;
this.name = name;
this.author = author;
this.title = title;
this.selftext = selftext;
this.created_utc = created_utc;
this.permalink = permalink;
this.score = score;
}

public String toString() {
return "ProcessableRedditSubmission(subreddit=" + this.subreddit + ", name=" + this.name + ", author=" + this.author + ", title=" + this.title + ", selftext=" + this.selftext + ", created_utc=" + this.created_utc + ", permalink=" + this.permalink + ", score=" + this.score + ")";
}
}

@ -1,14 +1,9 @@
package nu.marginalia.integration.reddit.model;


import lombok.AllArgsConstructor;
import lombok.ToString;
import lombok.With;

/** Corresponds directly to the pushshift.io Reddit comment JSON format. */
@AllArgsConstructor
@ToString
@With
/**
* Corresponds directly to the pushshift.io Reddit comment JSON format.
*/
public class RawRedditComment {
public String parent_id;
public String link_id;
@ -17,4 +12,46 @@ public class RawRedditComment {
public String body;
public String subreddit;
public int score;

public RawRedditComment(String parent_id, String link_id, String id, String author, String body, String subreddit, int score) {
this.parent_id = parent_id;
this.link_id = link_id;
this.id = id;
this.author = author;
this.body = body;
this.subreddit = subreddit;
this.score = score;
}

public RawRedditComment withParent_id(String parent_id) {
return this.parent_id == parent_id ? this : new RawRedditComment(parent_id, this.link_id, this.id, this.author, this.body, this.subreddit, this.score);
}

public RawRedditComment withLink_id(String link_id) {
return this.link_id == link_id ? this : new RawRedditComment(this.parent_id, link_id, this.id, this.author, this.body, this.subreddit, this.score);
}

public RawRedditComment withId(String id) {
return this.id == id ? this : new RawRedditComment(this.parent_id, this.link_id, id, this.author, this.body, this.subreddit, this.score);
}

public RawRedditComment withAuthor(String author) {
return this.author == author ? this : new RawRedditComment(this.parent_id, this.link_id, this.id, author, this.body, this.subreddit, this.score);
}

public RawRedditComment withBody(String body) {
return this.body == body ? this : new RawRedditComment(this.parent_id, this.link_id, this.id, this.author, body, this.subreddit, this.score);
}

public RawRedditComment withSubreddit(String subreddit) {
return this.subreddit == subreddit ? this : new RawRedditComment(this.parent_id, this.link_id, this.id, this.author, this.body, subreddit, this.score);
}

public RawRedditComment withScore(int score) {
return this.score == score ? this : new RawRedditComment(this.parent_id, this.link_id, this.id, this.author, this.body, this.subreddit, score);
}

public String toString() {
return "RawRedditComment(parent_id=" + this.parent_id + ", link_id=" + this.link_id + ", id=" + this.id + ", author=" + this.author + ", body=" + this.body + ", subreddit=" + this.subreddit + ", score=" + this.score + ")";
}
}

@ -1,14 +1,9 @@
package nu.marginalia.integration.reddit.model;


import lombok.AllArgsConstructor;
import lombok.ToString;
import lombok.With;

/** Corresponds directly to the pushshift.io Reddit submission JSON format. */
@AllArgsConstructor
@With
@ToString
/**
* Corresponds directly to the pushshift.io Reddit submission JSON format.
*/
public class RawRedditSubmission {
public int score;
public String subreddit;
@ -19,4 +14,56 @@ public class RawRedditSubmission {
public int num_comments;
public int created_utc;
public String permalink;

public RawRedditSubmission(int score, String subreddit, String name, String author, String title, String selftext, int num_comments, int created_utc, String permalink) {
this.score = score;
this.subreddit = subreddit;
this.name = name;
this.author = author;
this.title = title;
this.selftext = selftext;
this.num_comments = num_comments;
this.created_utc = created_utc;
this.permalink = permalink;
}

public RawRedditSubmission withScore(int score) {
return this.score == score ? this : new RawRedditSubmission(score, this.subreddit, this.name, this.author, this.title, this.selftext, this.num_comments, this.created_utc, this.permalink);
}

public RawRedditSubmission withSubreddit(String subreddit) {
return this.subreddit == subreddit ? this : new RawRedditSubmission(this.score, subreddit, this.name, this.author, this.title, this.selftext, this.num_comments, this.created_utc, this.permalink);
}

public RawRedditSubmission withName(String name) {
return this.name == name ? this : new RawRedditSubmission(this.score, this.subreddit, name, this.author, this.title, this.selftext, this.num_comments, this.created_utc, this.permalink);
}

public RawRedditSubmission withAuthor(String author) {
return this.author == author ? this : new RawRedditSubmission(this.score, this.subreddit, this.name, author, this.title, this.selftext, this.num_comments, this.created_utc, this.permalink);
}

public RawRedditSubmission withTitle(String title) {
return this.title == title ? this : new RawRedditSubmission(this.score, this.subreddit, this.name, this.author, title, this.selftext, this.num_comments, this.created_utc, this.permalink);
}

public RawRedditSubmission withSelftext(String selftext) {
return this.selftext == selftext ? this : new RawRedditSubmission(this.score, this.subreddit, this.name, this.author, this.title, selftext, this.num_comments, this.created_utc, this.permalink);
}

public RawRedditSubmission withNum_comments(int num_comments) {
return this.num_comments == num_comments ? this : new RawRedditSubmission(this.score, this.subreddit, this.name, this.author, this.title, this.selftext, num_comments, this.created_utc, this.permalink);
}

public RawRedditSubmission withCreated_utc(int created_utc) {
return this.created_utc == created_utc ? this : new RawRedditSubmission(this.score, this.subreddit, this.name, this.author, this.title, this.selftext, this.num_comments, created_utc, this.permalink);
}

public RawRedditSubmission withPermalink(String permalink) {
return this.permalink == permalink ? this : new RawRedditSubmission(this.score, this.subreddit, this.name, this.author, this.title, this.selftext, this.num_comments, this.created_utc, permalink);
}

public String toString() {
return "RawRedditSubmission(score=" + this.score + ", subreddit=" + this.subreddit + ", name=" + this.name + ", author=" + this.author + ", title=" + this.title + ", selftext=" + this.selftext + ", num_comments=" + this.num_comments + ", created_utc=" + this.created_utc + ", permalink=" + this.permalink + ")";
}
}

@ -3,7 +3,6 @@ package nu.marginalia.integration.stackexchange.sqlite;
import com.github.luben.zstd.Zstd;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;
import lombok.SneakyThrows;
import nu.marginalia.integration.stackexchange.xml.StackExchangeXmlPostReader;

import javax.xml.stream.XMLStreamException;
@ -15,6 +14,7 @@ import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.Future;
import java.util.function.Predicate;
@ -32,10 +32,9 @@ import java.util.function.Predicate;
public class StackExchangePostsDb {

/** Construct a SQLIte file containing the Posts in the stack exchange-style 7z file */
@SneakyThrows
public static void create(String domain,
Path sqliteFile,
Path stackExchange7zFile) {
Path stackExchange7zFile) throws IOException {
Files.deleteIfExists(sqliteFile);

String connStr = "jdbc:sqlite:" + sqliteFile;
@ -115,7 +114,6 @@ public class StackExchangePostsDb {
* necessary as stackexchange's entry count exceeds the ~67 million entries that UrlIdCodec can encode
* for a single domain, despite having less than 67 million 'threads'.
* */
@SneakyThrows
public static void forEachPost(
Path sqliteFile,
Predicate<CombinedPostModel> consumer) {
@ -189,8 +187,8 @@ public class StackExchangePostsDb {
}

}
catch (SQLException ex) {
ex.printStackTrace();
catch (SQLException | InterruptedException | ExecutionException ex) {
throw new RuntimeException(ex);
}

}

@ -1,7 +1,5 @@
package nu.marginalia.integration.stackexchange.xml;

import lombok.SneakyThrows;

import javax.xml.stream.XMLEventReader;
import javax.xml.stream.events.XMLEvent;
import java.util.Iterator;
@ -22,25 +20,29 @@ class StackExchangeXmlIterator<T> implements Iterator<T> {
this.parser = parser;
}

@SneakyThrows
@Override
public boolean hasNext() {
if (next != null)
return true;

while (xmlReader.hasNext()) {
XMLEvent event = xmlReader.nextEvent();
try {
while (xmlReader.hasNext()) {
XMLEvent event = xmlReader.nextEvent();

if (!event.isStartElement())
continue;
if (!event.isStartElement())
continue;

next = parser.apply(event);
next = parser.apply(event);

if (next != null)
return true;
if (next != null)
return true;
}

readerSource.close();
}
catch (Exception ex) {
throw new RuntimeException(ex);
}

readerSource.close();

return false;
}

@ -1,6 +1,5 @@
package nu.marginalia.model.processed;

import lombok.Builder;
import nu.marginalia.sequence.VarintCodedSequence;
import nu.marginalia.sequence.slop.VarintCodedSequenceArrayColumn;
import nu.marginalia.slop.SlopTable;
@ -52,7 +51,6 @@ public record SlopDocumentRecord(
throw new IllegalArgumentException("Metas, words and positions must have the same length");
}

@Builder
public record KeywordsProjection(
String domain,
int ordinal,
@ -63,8 +61,11 @@ public record SlopDocumentRecord(
byte[] metas,
List<VarintCodedSequence> positions,
byte[] spanCodes,
List<VarintCodedSequence> spans)
{
List<VarintCodedSequence> spans) {
public static KeywordsProjectionBuilder builder() {
return new KeywordsProjectionBuilder();
}

// Override the equals method since records don't generate default equals that deal with array fields properly
@Override
public boolean equals(Object o) {
@ -88,6 +89,80 @@ public record SlopDocumentRecord(
result = 31 * result + Objects.hashCode(spans);
return result;
}

public static class KeywordsProjectionBuilder {
private String domain;
private int ordinal;
private int htmlFeatures;
private long documentMetadata;
private int length;
private List<String> words;
private byte[] metas;
private List<VarintCodedSequence> positions;
private byte[] spanCodes;
private List<VarintCodedSequence> spans;

KeywordsProjectionBuilder() {
}

public KeywordsProjectionBuilder domain(String domain) {
this.domain = domain;
return this;
}

public KeywordsProjectionBuilder ordinal(int ordinal) {
this.ordinal = ordinal;
return this;
}

public KeywordsProjectionBuilder htmlFeatures(int htmlFeatures) {
this.htmlFeatures = htmlFeatures;
return this;
}

public KeywordsProjectionBuilder documentMetadata(long documentMetadata) {
this.documentMetadata = documentMetadata;
return this;
}

public KeywordsProjectionBuilder length(int length) {
this.length = length;
return this;
}

public KeywordsProjectionBuilder words(List<String> words) {
this.words = words;
return this;
}

public KeywordsProjectionBuilder metas(byte[] metas) {
this.metas = metas;
return this;
}

public KeywordsProjectionBuilder positions(List<VarintCodedSequence> positions) {
this.positions = positions;
return this;
}

public KeywordsProjectionBuilder spanCodes(byte[] spanCodes) {
this.spanCodes = spanCodes;
return this;
}

public KeywordsProjectionBuilder spans(List<VarintCodedSequence> spans) {
this.spans = spans;
return this;
}

public KeywordsProjection build() {
return new KeywordsProjection(this.domain, this.ordinal, this.htmlFeatures, this.documentMetadata, this.length, this.words, this.metas, this.positions, this.spanCodes, this.spans);
}

public String toString() {
return "SlopDocumentRecord.KeywordsProjection.KeywordsProjectionBuilder(domain=" + this.domain + ", ordinal=" + this.ordinal + ", htmlFeatures=" + this.htmlFeatures + ", documentMetadata=" + this.documentMetadata + ", length=" + this.length + ", words=" + this.words + ", metas=" + Arrays.toString(this.metas) + ", positions=" + this.positions + ", spanCodes=" + Arrays.toString(this.spanCodes) + ", spans=" + this.spans + ")";
}
}
}

public record MetadataProjection(

@ -2,7 +2,6 @@ package nu.marginalia.converting;

import com.google.inject.Guice;
import com.google.inject.Injector;
import lombok.SneakyThrows;
import nu.marginalia.UserAgent;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.model.ProcessedDomain;
@ -48,16 +47,14 @@ public class CrawlingThenConvertingIntegrationTest {
private Path fileName;
private Path fileName2;

@SneakyThrows
@BeforeAll
public static void setUpAll() {
// this must be done to avoid java inserting its own user agent for the sitemap requests
System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());
}

@SneakyThrows
@BeforeEach
public void setUp() {
public void setUp() throws IOException {
Injector injector = Guice.createInjector(
new ConvertingIntegrationTestModule()
);

@ -1,6 +1,5 @@
package nu.marginalia.converting.processor.summary;

import lombok.SneakyThrows;
import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.summary.heuristic.*;
import nu.marginalia.keyword.DocumentKeywordExtractor;
@ -36,8 +35,7 @@ class SummaryExtractorTest {
new FallbackHeuristic());
}

@SneakyThrows
Set<String> getImportantWords(Document doc) {
Set<String> getImportantWords(Document doc) throws URISyntaxException {
var dld = setenceExtractor.extractSentences(doc);
var keywords = keywordExtractor.extractKeywords(dld, new LinkTexts(), new EdgeUrl(
"https://www.marginalia.nu/"
@ -92,7 +90,7 @@ class SummaryExtractorTest {
}

@Test
void extractSurrey() throws IOException {
void extractSurrey() throws IOException, URISyntaxException {
String html = readClassPathFile("html/summarization/surrey.html");
var doc = Jsoup.parse(html);
String summary = summaryExtractor.extractSummary(doc, getImportantWords(doc));
@ -104,7 +102,7 @@ class SummaryExtractorTest {
}

@Test
void extractSurrey1() throws IOException {
void extractSurrey1() throws IOException, URISyntaxException {
String html = readClassPathFile("html/summarization/surrey.html.1");
var doc = Jsoup.parse(html);
String summary = summaryExtractor.extractSummary(doc, getImportantWords(doc));
@ -115,7 +113,7 @@ class SummaryExtractorTest {
}

@Test
void extract187() throws IOException {
void extract187() throws IOException, URISyntaxException {
String html = readClassPathFile("html/summarization/187.shtml");
var doc = Jsoup.parse(html);
String summary = summaryExtractor.extractSummary(doc, getImportantWords(doc));
@ -126,7 +124,7 @@ class SummaryExtractorTest {
}

@Test
void extractMonadnock() throws IOException {
void extractMonadnock() throws IOException, URISyntaxException {
String html = readClassPathFile("html/monadnock.html");

var doc = Jsoup.parse(html);
@ -138,13 +136,16 @@ class SummaryExtractorTest {
}

@Test
public void testWorkSet() throws IOException {
public void testWorkSet() throws IOException, URISyntaxException {
var workSet = readWorkSet();
workSet.forEach((path, str) -> {
for (Map.Entry<Path, String> entry : workSet.entrySet()) {
final Path path = entry.getKey();
final String str = entry.getValue();

var doc = Jsoup.parse(str);
String summary = summaryExtractor.extractSummary(doc, getImportantWords(doc));
System.out.println(path + ": " + summary);
});
}
}
private String readClassPathFile(String s) throws IOException {
return new String(Objects.requireNonNull(ClassLoader.getSystemResourceAsStream(s)).readAllBytes());

@ -2,7 +2,6 @@ package nu.marginalia.link_parser;

import com.google.common.base.CharMatcher;
import com.google.common.base.Strings;
import lombok.SneakyThrows;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.util.QueryParams;
import org.jetbrains.annotations.Contract;
@ -122,14 +121,19 @@ public class LinkParser {
return Optional.ofNullable(matcher.group(1));
}

@SneakyThrows
private URI renormalize(URI uri) {
if (uri.getPath() == null) {
return renormalize(new URI(uri.getScheme(), uri.getHost(), "/", uri.getQuery(), uri.getFragment()));
try {
if (uri.getPath() == null) {
return renormalize(new URI(uri.getScheme(), uri.getHost(), "/", uri.getQuery(), uri.getFragment()));
}
if (uri.getPath().startsWith("/../")) {
return renormalize(new URI(uri.getScheme(), uri.getHost(), uri.getPath().substring(3), uri.getQuery(), uri.getFragment()));
}
}
if (uri.getPath().startsWith("/../")) {
return renormalize(new URI(uri.getScheme(), uri.getHost(), uri.getPath().substring(3), uri.getQuery(), uri.getFragment()));
catch (URISyntaxException e) {
logger.warn("Bad URI {}", uri);
}

return uri;
}

@ -146,7 +150,6 @@ public class LinkParser {
private static final Pattern spaceRegex = Pattern.compile(" ");
private static final Pattern paramSeparatorPattern = Pattern.compile("\\?");

@SneakyThrows
private String resolveRelativeUrl(EdgeUrl baseUrl, String s) {

// url looks like http://www.marginalia.nu/

@ -5,7 +5,6 @@ import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import com.zaxxer.hikari.HikariDataSource;
import lombok.Builder;
import nu.marginalia.ProcessConfiguration;
import nu.marginalia.ProcessConfigurationModule;
import nu.marginalia.UserAgent;
@ -475,7 +474,6 @@ public class CrawlerMain extends ProcessMainClass {
}
}

@Builder
public record CrawlSpecRecord(@NotNull String domain, int crawlDepth, @NotNull List<String> urls) {

public CrawlSpecRecord(String domain, int crawlDepth) {
@ -487,13 +485,48 @@ public class CrawlerMain extends ProcessMainClass {
// already fetched, and a growth factor that gets a bonus for small domains
return new CrawlSpecRecord(domain,
(int) Math.clamp(
(visitedUrls * (visitedUrls < MID_URLS_PER_DOMAIN
? Math.max(2.5, URL_GROWTH_FACTOR)
: URL_GROWTH_FACTOR)
),
MIN_URLS_PER_DOMAIN,
MAX_URLS_PER_DOMAIN));
(visitedUrls * (visitedUrls < MID_URLS_PER_DOMAIN
? Math.max(2.5, URL_GROWTH_FACTOR)
: URL_GROWTH_FACTOR)
),
MIN_URLS_PER_DOMAIN,
MAX_URLS_PER_DOMAIN));
}

public static CrawlSpecRecordBuilder builder() {
return new CrawlSpecRecordBuilder();
}

public static class CrawlSpecRecordBuilder {
private @NotNull String domain;
private int crawlDepth;
private @NotNull List<String> urls;

CrawlSpecRecordBuilder() {
}

public CrawlSpecRecordBuilder domain(@NotNull String domain) {
this.domain = domain;
return this;
}

public CrawlSpecRecordBuilder crawlDepth(int crawlDepth) {
this.crawlDepth = crawlDepth;
return this;
}

public CrawlSpecRecordBuilder urls(@NotNull List<String> urls) {
this.urls = urls;
return this;
}

public CrawlSpecRecord build() {
return new CrawlSpecRecord(this.domain, this.crawlDepth, this.urls);
}

public String toString() {
return "CrawlerMain.CrawlSpecRecord.CrawlSpecRecordBuilder(domain=" + this.domain + ", crawlDepth=" + this.crawlDepth + ", urls=" + this.urls + ")";
}
}
}
}

@ -2,13 +2,12 @@ package nu.marginalia.crawl;

import com.google.gson.Gson;
import com.google.inject.AbstractModule;
import lombok.SneakyThrows;
import nu.marginalia.UserAgent;
import nu.marginalia.WmsaHome;
import nu.marginalia.model.gson.GsonFactory;

public class CrawlerModule extends AbstractModule {
@SneakyThrows

public void configure() {
bind(Gson.class).toInstance(createGson());
bind(UserAgent.class).toInstance(WmsaHome.getUserAgent());

@ -27,7 +27,7 @@ public interface HttpFetcher {
HttpFetchResult fetchContent(EdgeUrl url,
WarcRecorder recorder,
ContentTags tags,
ProbeType probeType) throws HttpFetcherImpl.RateLimitException;
ProbeType probeType) throws HttpFetcherImpl.RateLimitException, Exception;

SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder);


@ -3,7 +3,6 @@ package nu.marginalia.crawl.fetcher;
import com.google.inject.Inject;
import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;
import lombok.SneakyThrows;
import nu.marginalia.UserAgent;
import nu.marginalia.crawl.fetcher.socket.FastTerminatingSocketFactory;
import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
@ -50,7 +49,6 @@ public class HttpFetcherImpl implements HttpFetcher {

private static final FastTerminatingSocketFactory ftSocketFactory = new FastTerminatingSocketFactory();

@SneakyThrows
private OkHttpClient createClient(Dispatcher dispatcher, ConnectionPool pool) {
var builder = new OkHttpClient.Builder();
if (dispatcher != null) {
@ -111,7 +109,6 @@ public class HttpFetcherImpl implements HttpFetcher {
* @return The result of the probe, indicating the state and the URL.
*/
@Override
@SneakyThrows
public DomainProbeResult probeDomain(EdgeUrl url) {
var head = new Request.Builder().head().addHeader("User-agent", userAgentString)
.url(url.toString())
@ -207,11 +204,11 @@ public class HttpFetcherImpl implements HttpFetcher {
* the outcome of the fetch.
*/
@Override
@SneakyThrows
public HttpFetchResult fetchContent(EdgeUrl url,
WarcRecorder warcRecorder,
ContentTags contentTags,
ProbeType probeType)
throws Exception
{
var getBuilder = new Request.Builder().get();


@ -1,7 +1,5 @@
package nu.marginalia.crawl.fetcher.socket;

import lombok.SneakyThrows;

import javax.net.ssl.*;
import java.security.cert.X509Certificate;

@ -29,20 +27,24 @@ public class NoSecuritySSL {
}
};

@SneakyThrows
public static SSLSocketFactory buildSocketFactory() {
// Install the all-trusting trust manager
final SSLContext sslContext = SSLContext.getInstance("TLS");
sslContext.init(null, trustAllCerts, new java.security.SecureRandom());
try {
// Install the all-trusting trust manager
final SSLContext sslContext = SSLContext.getInstance("TLS");
sslContext.init(null, trustAllCerts, new java.security.SecureRandom());

var clientSessionContext = sslContext.getClientSessionContext();
var clientSessionContext = sslContext.getClientSessionContext();

// The default value for this is very high and will use a crapload of memory
// since the crawler will be making a lot of requests to various hosts
clientSessionContext.setSessionCacheSize(2048);
// The default value for this is very high and will use a crapload of memory
// since the crawler will be making a lot of requests to various hosts
clientSessionContext.setSessionCacheSize(2048);

// Create a ssl socket factory with our all-trusting manager
return sslContext.getSocketFactory();
// Create a ssl socket factory with our all-trusting manager
return sslContext.getSocketFactory();
}
catch (Exception e) {
throw new RuntimeException(e);
}
}

public static HostnameVerifier buildHostnameVerifyer() {
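A note on the recurring pattern in the hunks above: wherever Lombok's @SneakyThrows was dropped, the checked exception is now handled explicitly, either by wrapping it in a RuntimeException or, for InterruptedException, by restoring the interrupt flag. The sketch below only illustrates that pattern; FileSlurper and its methods are invented for the example and are not part of this commit.

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.BlockingQueue;

// Hypothetical example class, not part of the MarginaliaSearch codebase.
class FileSlurper {
    // Before delombok, "@SneakyThrows String read(Path p) { return Files.readString(p); }"
    // let the checked IOException escape without a throws clause.
    // After delombok, the exception is handled explicitly:
    String read(Path p) {
        try {
            return Files.readString(p);
        }
        catch (IOException e) {
            // same wrapping strategy as the constructors and iterators in the diff above
            throw new RuntimeException(e);
        }
    }

    // For InterruptedException the diff restores the interrupt flag instead of wrapping,
    // mirroring the change to StackexchangeSideloader.enqueue():
    boolean enqueue(BlockingQueue<String> queue, String item) {
        try {
            queue.put(item);
            return true;
        }
        catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
    }
}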
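Where @Builder was removed (LanguageModels, SlopDocumentRecord.KeywordsProjection, CrawlerMain.CrawlSpecRecord), the commit substitutes a hand-written static builder with the same fluent API, so call sites keep compiling unchanged. A minimal sketch of that shape, using an invented Point record rather than anything from the codebase:

// Hypothetical example, not part of the commit.
record Point(int x, int y) {

    public static PointBuilder builder() {
        return new PointBuilder();
    }

    public static class PointBuilder {
        private int x;
        private int y;

        PointBuilder() {
        }

        // each setter stores the value and returns the builder for chaining
        public PointBuilder x(int x) {
            this.x = x;
            return this;
        }

        public PointBuilder y(int y) {
            this.y = y;
            return this;
        }

        public Point build() {
            return new Point(x, y);
        }
    }
}

class PointBuilderDemo {
    public static void main(String[] args) {
        // usage is identical to what Lombok's @Builder generated
        Point p = Point.builder().x(1).y(2).build();
        System.out.println(p);
    }
}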
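Similarly, the @With annotations on RawRedditComment and RawRedditSubmission become hand-written "wither" methods that return the same instance when the new value is reference-equal to the old one, and otherwise a copy with that single field swapped. Sketched below on an invented Tag class; the == comparison mirrors what delombok generates and is not a deep equality check.

// Hypothetical example, not part of the commit.
final class Tag {
    public final String name;
    public final int score;

    Tag(String name, int score) {
        this.name = name;
        this.score = score;
    }

    // reference comparison, exactly as in the generated withers shown in the diff above
    Tag withName(String name) {
        return this.name == name ? this : new Tag(name, this.score);
    }

    Tag withScore(int score) {
        return this.score == score ? this : new Tag(this.name, score);
    }
}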