(chore) Remove lombok

There are likely some instances of delombok gore with this commit.
This commit is contained in:
Viktor Lofgren 2024-11-11 21:14:38 +01:00
parent a5b4951f23
commit 9f47ce8d15
162 changed files with 3820 additions and 1316 deletions

View File

@ -1,7 +1,6 @@
plugins { plugins {
id 'java' id 'java'
id("org.jetbrains.gradle.plugin.idea-ext") version "1.0" id("org.jetbrains.gradle.plugin.idea-ext") version "1.0"
id "io.freefair.lombok" version "8.3"
id "me.champeau.jmh" version "0.6.6" id "me.champeau.jmh" version "0.6.6"
// This is a workaround for a bug in the Jib plugin that causes it to stall randomly // This is a workaround for a bug in the Jib plugin that causes it to stall randomly
@ -44,8 +43,8 @@ subprojects.forEach {it ->
} }
ext { ext {
jvmVersion=22 jvmVersion=23
dockerImageBase='container-registry.oracle.com/graalvm/jdk:22' dockerImageBase='container-registry.oracle.com/graalvm/jdk:23'
dockerImageTag='latest' dockerImageTag='latest'
dockerImageRegistry='marginalia' dockerImageRegistry='marginalia'
jibVersion = '3.4.3' jibVersion = '3.4.3'

View File

@ -1,10 +1,7 @@
package nu.marginalia; package nu.marginalia;
import lombok.Builder;
import java.nio.file.Path; import java.nio.file.Path;
@Builder
public class LanguageModels { public class LanguageModels {
public final Path termFrequencies; public final Path termFrequencies;
@ -30,4 +27,64 @@ public class LanguageModels {
this.fasttextLanguageModel = fasttextLanguageModel; this.fasttextLanguageModel = fasttextLanguageModel;
this.segments = segments; this.segments = segments;
} }
/** Creates a new builder with no paths set, used to assemble a {@link LanguageModels} instance. */
public static LanguageModelsBuilder builder() {
return new LanguageModelsBuilder();
}
/**
 * Fluent builder for {@link LanguageModels}.
 *
 * <p>Each setter stores the given path and returns this builder so calls can be chained.
 * {@link #build()} hands the stored values to the {@code LanguageModels} constructor
 * unchanged; the builder itself performs no validation, so any path that was never set
 * is passed on as {@code null}.
 */
public static class LanguageModelsBuilder {
    private Path termFrequencies;
    private Path openNLPSentenceDetectionData;
    private Path posRules;
    private Path posDict;
    private Path openNLPTokenData;
    private Path fasttextLanguageModel;
    private Path segments;

    /** Package-private: instances are obtained through {@code LanguageModels.builder()}. */
    LanguageModelsBuilder() {
    }

    /** Sets the term frequencies path. */
    public LanguageModelsBuilder termFrequencies(Path termFrequencies) {
        this.termFrequencies = termFrequencies;
        return this;
    }

    /** Sets the OpenNLP sentence detection data path. */
    public LanguageModelsBuilder openNLPSentenceDetectionData(Path openNLPSentenceDetectionData) {
        this.openNLPSentenceDetectionData = openNLPSentenceDetectionData;
        return this;
    }

    /** Sets the POS rules path. */
    public LanguageModelsBuilder posRules(Path posRules) {
        this.posRules = posRules;
        return this;
    }

    /** Sets the POS dictionary path. */
    public LanguageModelsBuilder posDict(Path posDict) {
        this.posDict = posDict;
        return this;
    }

    /** Sets the OpenNLP token data path. */
    public LanguageModelsBuilder openNLPTokenData(Path openNLPTokenData) {
        this.openNLPTokenData = openNLPTokenData;
        return this;
    }

    /** Sets the fastText language model path. */
    public LanguageModelsBuilder fasttextLanguageModel(Path fasttextLanguageModel) {
        this.fasttextLanguageModel = fasttextLanguageModel;
        return this;
    }

    /** Sets the segments path. */
    public LanguageModelsBuilder segments(Path segments) {
        this.segments = segments;
        return this;
    }

    /**
     * Builds the {@link LanguageModels} from the values collected so far.
     *
     * @return a new instance created from the seven stored paths, in constructor order
     */
    public LanguageModels build() {
        return new LanguageModels(
                termFrequencies,
                openNLPSentenceDetectionData,
                posRules,
                posDict,
                openNLPTokenData,
                fasttextLanguageModel,
                segments);
    }

    /** Returns a debug representation listing every field currently held by the builder. */
    @Override
    public String toString() {
        return "LanguageModels.LanguageModelsBuilder(termFrequencies=" + termFrequencies
                + ", openNLPSentenceDetectionData=" + openNLPSentenceDetectionData
                + ", posRules=" + posRules
                + ", posDict=" + posDict
                + ", openNLPTokenData=" + openNLPTokenData
                + ", fasttextLanguageModel=" + fasttextLanguageModel
                + ", segments=" + segments + ")";
    }
}
} }

View File

@ -7,12 +7,13 @@ import com.google.common.util.concurrent.UncheckedExecutionException;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import java.sql.SQLException;
import java.util.NoSuchElementException; import java.util.NoSuchElementException;
import java.util.Optional; import java.util.Optional;
import java.util.OptionalInt; import java.util.OptionalInt;
import java.util.concurrent.ExecutionException;
@Singleton @Singleton
public class DbDomainQueries { public class DbDomainQueries {
@ -27,7 +28,6 @@ public class DbDomainQueries {
} }
@SneakyThrows
public Integer getDomainId(EdgeDomain domain) { public Integer getDomainId(EdgeDomain domain) {
try (var connection = dataSource.getConnection()) { try (var connection = dataSource.getConnection()) {
@ -42,12 +42,14 @@ public class DbDomainQueries {
throw new NoSuchElementException(); throw new NoSuchElementException();
}); });
} }
catch (UncheckedExecutionException ex) { catch (ExecutionException ex) {
throw ex.getCause(); throw new RuntimeException(ex.getCause());
}
catch (SQLException ex) {
throw new RuntimeException(ex);
} }
} }
@SneakyThrows
public OptionalInt tryGetDomainId(EdgeDomain domain) { public OptionalInt tryGetDomainId(EdgeDomain domain) {
Integer maybeId = domainIdCache.getIfPresent(domain); Integer maybeId = domainIdCache.getIfPresent(domain);
@ -70,11 +72,13 @@ public class DbDomainQueries {
return OptionalInt.empty(); return OptionalInt.empty();
} }
catch (UncheckedExecutionException ex) { catch (UncheckedExecutionException ex) {
return OptionalInt.empty(); throw new RuntimeException(ex.getCause());
}
catch (SQLException ex) {
throw new RuntimeException(ex);
} }
} }
@SneakyThrows
public Optional<EdgeDomain> getDomain(int id) { public Optional<EdgeDomain> getDomain(int id) {
try (var connection = dataSource.getConnection()) { try (var connection = dataSource.getConnection()) {
@ -87,5 +91,11 @@ public class DbDomainQueries {
return Optional.empty(); return Optional.empty();
} }
} }
catch (UncheckedExecutionException ex) {
throw new RuntimeException(ex.getCause());
}
catch (SQLException ex) {
throw new RuntimeException(ex);
}
} }
} }

View File

@ -2,7 +2,6 @@ package nu.marginalia.db;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import lombok.With;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -115,23 +114,23 @@ public class DomainRankingSetsService {
} }
} }
/** Defines a domain ranking set, parameters for the ranking algorithms. /**
* Defines a domain ranking set, parameters for the ranking algorithms.
* *
* @param name Key and name of the set * @param name Key and name of the set
* @param description Human-readable description * @param description Human-readable description
* @param depth Depth of the algorithm * @param depth Depth of the algorithm
* @param definition Definition of the set, typically a list of domains or globs for domain-names * @param definition Definition of the set, typically a list of domains or globs for domain-names
* */ */
@With
public record DomainRankingSet(String name, public record DomainRankingSet(String name,
String description, String description,
int depth, int depth,
String definition) String definition) {
{
public Path fileName(Path base) { public Path fileName(Path base) {
return base.resolve(name().toLowerCase() + ".dat"); return base.resolve(name().toLowerCase() + ".dat");
} }
public String[] domains() { public String[] domains() {
return Arrays.stream(definition().split("\n+")) return Arrays.stream(definition().split("\n+"))
.map(String::trim) .map(String::trim)
@ -144,5 +143,20 @@ public class DomainRankingSetsService {
return name().equals("BLOGS") || name().equals("NONE") || name().equals("RANK"); return name().equals("BLOGS") || name().equals("NONE") || name().equals("RANK");
} }
/**
 * Returns a ranking set identical to this one except for its name.
 *
 * @param name the replacement name
 * @return this instance when the name is equal to the current one, otherwise a new record
 */
public DomainRankingSet withName(String name) {
    // Compare by value, not by reference: equal strings need not be the same object.
    if (java.util.Objects.equals(this.name, name)) {
        return this;
    }
    return new DomainRankingSet(name, description, depth, definition);
}
/**
 * Returns a ranking set identical to this one except for its description.
 *
 * @param description the replacement description
 * @return this instance when the description is equal to the current one, otherwise a new record
 */
public DomainRankingSet withDescription(String description) {
    // Compare by value, not by reference: equal strings need not be the same object.
    if (java.util.Objects.equals(this.description, description)) {
        return this;
    }
    return new DomainRankingSet(name, description, depth, definition);
}
/**
 * Returns a ranking set identical to this one except for its depth.
 *
 * @param depth the replacement depth
 * @return this instance when the depth is unchanged, otherwise a new record
 */
public DomainRankingSet withDepth(int depth) {
    if (depth == this.depth) {
        return this;
    }
    return new DomainRankingSet(name, description, depth, definition);
}
/**
 * Returns a ranking set identical to this one except for its definition.
 *
 * @param definition the replacement definition
 * @return this instance when the definition is equal to the current one, otherwise a new record
 */
public DomainRankingSet withDefinition(String definition) {
    // Compare by value, not by reference: equal strings need not be the same object.
    if (java.util.Objects.equals(this.definition, definition)) {
        return this;
    }
    return new DomainRankingSet(name, description, depth, definition);
}
} }
} }

View File

@ -1,15 +1,11 @@
package nu.marginalia.model; package nu.marginalia.model;
import lombok.*;
import javax.annotation.Nonnull; import javax.annotation.Nonnull;
import java.io.Serializable; import java.io.Serializable;
import java.util.Objects; import java.util.Objects;
import java.util.function.Predicate; import java.util.function.Predicate;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@AllArgsConstructor
@Getter @Setter @Builder
public class EdgeDomain implements Serializable { public class EdgeDomain implements Serializable {
@Nonnull @Nonnull
@ -17,7 +13,6 @@ public class EdgeDomain implements Serializable {
@Nonnull @Nonnull
public final String topDomain; public final String topDomain;
@SneakyThrows
public EdgeDomain(String host) { public EdgeDomain(String host) {
Objects.requireNonNull(host, "domain name must not be null"); Objects.requireNonNull(host, "domain name must not be null");
@ -34,28 +29,23 @@ public class EdgeDomain implements Serializable {
if (dot < 0 || looksLikeAnIp(host)) { // IPV6 >.> if (dot < 0 || looksLikeAnIp(host)) { // IPV6 >.>
subDomain = ""; subDomain = "";
topDomain = host; topDomain = host;
} } else {
else {
int dot2 = host.substring(0, dot).lastIndexOf('.'); int dot2 = host.substring(0, dot).lastIndexOf('.');
if (dot2 < 0) { if (dot2 < 0) {
subDomain = ""; subDomain = "";
topDomain = host; topDomain = host;
} } else {
else { if (looksLikeGovTld(host)) { // Capture .ac.jp, .co.uk
if (looksLikeGovTld(host))
{ // Capture .ac.jp, .co.uk
int dot3 = host.substring(0, dot2).lastIndexOf('.'); int dot3 = host.substring(0, dot2).lastIndexOf('.');
if (dot3 >= 0) { if (dot3 >= 0) {
dot2 = dot3; dot2 = dot3;
subDomain = host.substring(0, dot2); subDomain = host.substring(0, dot2);
topDomain = host.substring(dot2 + 1); topDomain = host.substring(dot2 + 1);
} } else {
else {
subDomain = ""; subDomain = "";
topDomain = host; topDomain = host;
} }
} } else {
else {
subDomain = host.substring(0, dot2); subDomain = host.substring(0, dot2);
topDomain = host.substring(dot2 + 1); topDomain = host.substring(dot2 + 1);
} }
@ -64,6 +54,16 @@ public class EdgeDomain implements Serializable {
} }
private static final Predicate<String> govListTest = Pattern.compile(".*\\.(id|ac|co|org|gov|edu|com)\\.[a-z]{2}").asMatchPredicate(); private static final Predicate<String> govListTest = Pattern.compile(".*\\.(id|ac|co|org|gov|edu|com)\\.[a-z]{2}").asMatchPredicate();
/**
 * Creates a domain directly from its two already-split parts.
 * The values are stored as given; no parsing or null check is performed here.
 *
 * @param subDomain value for the {@code subDomain} field
 * @param topDomain value for the {@code topDomain} field
 */
public EdgeDomain(@Nonnull String subDomain, @Nonnull String topDomain) {
this.subDomain = subDomain;
this.topDomain = topDomain;
}
/** Creates a new builder with no parts set, used to assemble an {@link EdgeDomain}. */
public static EdgeDomainBuilder builder() {
return new EdgeDomainBuilder();
}
private boolean looksLikeGovTld(String host) { private boolean looksLikeGovTld(String host) {
if (host.length() < 8) if (host.length() < 8)
return false; return false;
@ -91,11 +91,11 @@ public class EdgeDomain implements Serializable {
} }
public EdgeUrl toRootUrlHttp() { public EdgeUrl toRootUrlHttp() {
// Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http // Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http
return new EdgeUrl("http", this, null, "/", null); return new EdgeUrl("http", this, null, "/", null);
} }
public EdgeUrl toRootUrlHttps() { public EdgeUrl toRootUrlHttps() {
return new EdgeUrl("https", this, null, "/", null); return new EdgeUrl("https", this, null, "/", null);
} }
@ -125,8 +125,7 @@ public class EdgeDomain implements Serializable {
int cutPoint = topDomain.indexOf('.'); int cutPoint = topDomain.indexOf('.');
if (cutPoint < 0) { if (cutPoint < 0) {
ret.append(topDomain); ret.append(topDomain);
} } else {
else {
ret.append(topDomain, 0, cutPoint); ret.append(topDomain, 0, cutPoint);
} }
@ -155,16 +154,14 @@ public class EdgeDomain implements Serializable {
if (govListTest.test(topDomain)) { if (govListTest.test(topDomain)) {
dot = topDomain.indexOf('.', Math.max(0, length - ".edu.uk".length())); dot = topDomain.indexOf('.', Math.max(0, length - ".edu.uk".length()));
} } else {
else {
dot = topDomain.lastIndexOf('.'); dot = topDomain.lastIndexOf('.');
} }
if (dot < 0 || dot == topDomain.length() - 1) { if (dot < 0 || dot == topDomain.length() - 1) {
return "-"; return "-";
} } else {
else {
return topDomain.substring(dot + 1); return topDomain.substring(dot + 1);
} }
} }
@ -174,10 +171,10 @@ public class EdgeDomain implements Serializable {
if (!(o instanceof EdgeDomain other)) return false; if (!(o instanceof EdgeDomain other)) return false;
final String this$subDomain = this.getSubDomain(); final String this$subDomain = this.getSubDomain();
final String other$subDomain = other.getSubDomain(); final String other$subDomain = other.getSubDomain();
if (!Objects.equals(this$subDomain,other$subDomain)) return false; if (!Objects.equals(this$subDomain, other$subDomain)) return false;
final String this$domain = this.getTopDomain(); final String this$domain = this.getTopDomain();
final String other$domain = other.getTopDomain(); final String other$domain = other.getTopDomain();
if (!Objects.equals(this$domain,other$domain)) return false; if (!Objects.equals(this$domain, other$domain)) return false;
return true; return true;
} }
@ -191,4 +188,39 @@ public class EdgeDomain implements Serializable {
return result; return result;
} }
/** Returns the value of the {@code subDomain} field. */
@Nonnull
public String getSubDomain() {
return this.subDomain;
}
/** Returns the value of the {@code topDomain} field. */
@Nonnull
public String getTopDomain() {
return this.topDomain;
}
/**
 * Fluent builder for {@link EdgeDomain}.
 *
 * <p>{@link #build()} passes the stored parts to the two-argument {@code EdgeDomain}
 * constructor unchanged. The builder performs no validation, so a part that was never
 * set is passed on as {@code null}.
 */
public static class EdgeDomainBuilder {
    private String subDomain;
    private String topDomain;

    /** Package-private: instances are obtained through {@code EdgeDomain.builder()}. */
    EdgeDomainBuilder() {
    }

    /** Sets the sub-domain part. */
    public EdgeDomainBuilder subDomain(String subDomain) {
        this.subDomain = subDomain;
        return this;
    }

    /** Sets the top-domain part. */
    public EdgeDomainBuilder topDomain(String topDomain) {
        this.topDomain = topDomain;
        return this;
    }

    /**
     * Builds the {@link EdgeDomain} from the values collected so far.
     *
     * @return a new instance created from the stored sub-domain and top-domain
     */
    public EdgeDomain build() {
        return new EdgeDomain(subDomain, topDomain);
    }

    /** Returns a debug representation listing both fields currently held by the builder. */
    @Override
    public String toString() {
        return "EdgeDomain.EdgeDomainBuilder(subDomain=" + subDomain + ", topDomain=" + topDomain + ")";
    }
}
} }

View File

@ -1,8 +1,5 @@
package nu.marginalia.model; package nu.marginalia.model;
import lombok.Builder;
import lombok.Getter;
import lombok.Setter;
import nu.marginalia.util.QueryParams; import nu.marginalia.util.QueryParams;
import javax.annotation.Nullable; import javax.annotation.Nullable;
@ -15,7 +12,6 @@ import java.util.Objects;
import java.util.Optional; import java.util.Optional;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@Getter @Setter @Builder
public class EdgeUrl implements Serializable { public class EdgeUrl implements Serializable {
public final String proto; public final String proto;
public final EdgeDomain domain; public final EdgeDomain domain;
@ -38,8 +34,7 @@ public class EdgeUrl implements Serializable {
private static URI parseURI(String url) throws URISyntaxException { private static URI parseURI(String url) throws URISyntaxException {
try { try {
return new URI(urlencodeFixer(url)); return new URI(urlencodeFixer(url));
} } catch (URISyntaxException ex) {
catch (URISyntaxException ex) {
throw new URISyntaxException("Failed to parse URI '" + url + "'", ex.getMessage()); throw new URISyntaxException("Failed to parse URI '" + url + "'", ex.getMessage());
} }
} }
@ -83,20 +78,17 @@ public class EdgeUrl implements Serializable {
for (int i = pathIdx; i < end; i++) { for (int i = pathIdx; i < end; i++) {
int c = url.charAt(i); int c = url.charAt(i);
if (goodChars.indexOf(c) >= 0 || (c >= 'A' && c <='Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) { if (goodChars.indexOf(c) >= 0 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
s.appendCodePoint(c); s.appendCodePoint(c);
} } else if (c == '%' && i + 2 < end) {
else if (c == '%' && i+2<end) { int cn = url.charAt(i + 1);
int cn = url.charAt(i+1); int cnn = url.charAt(i + 2);
int cnn = url.charAt(i+2);
if (hexChars.indexOf(cn) >= 0 && hexChars.indexOf(cnn) >= 0) { if (hexChars.indexOf(cn) >= 0 && hexChars.indexOf(cnn) >= 0) {
s.appendCodePoint(c); s.appendCodePoint(c);
} } else {
else {
s.append("%25"); s.append("%25");
} }
} } else {
else {
s.append(String.format("%%%02X", c)); s.append(String.format("%%%02X", c));
} }
} }
@ -109,7 +101,7 @@ public class EdgeUrl implements Serializable {
if (colonIdx < 0 || colonIdx + 2 >= url.length()) { if (colonIdx < 0 || colonIdx + 2 >= url.length()) {
throw new URISyntaxException(url, "Lacking protocol"); throw new URISyntaxException(url, "Lacking protocol");
} }
return url.indexOf('/', colonIdx+2); return url.indexOf('/', colonIdx + 2);
} }
public EdgeUrl(URI URI) { public EdgeUrl(URI URI) {
@ -125,8 +117,7 @@ public class EdgeUrl implements Serializable {
this.proto = URI.getScheme().toLowerCase(); this.proto = URI.getScheme().toLowerCase();
this.port = port(URI.getPort(), proto); this.port = port(URI.getPort(), proto);
this.param = QueryParams.queryParamsSanitizer(this.path, URI.getQuery()); this.param = QueryParams.queryParamsSanitizer(this.path, URI.getQuery());
} } catch (Exception ex) {
catch (Exception ex) {
System.err.println("Failed to parse " + URI); System.err.println("Failed to parse " + URI);
throw ex; throw ex;
} }
@ -145,8 +136,7 @@ public class EdgeUrl implements Serializable {
this.proto = URL.getProtocol().toLowerCase(); this.proto = URL.getProtocol().toLowerCase();
this.port = port(URL.getPort(), proto); this.port = port(URL.getPort(), proto);
this.param = QueryParams.queryParamsSanitizer(this.path, URL.getQuery()); this.param = QueryParams.queryParamsSanitizer(this.path, URL.getQuery());
} } catch (Exception ex) {
catch (Exception ex) {
System.err.println("Failed to parse " + URL); System.err.println("Failed to parse " + URL);
throw ex; throw ex;
} }
@ -158,13 +148,16 @@ public class EdgeUrl implements Serializable {
} }
if (protocol.equals("http") && port == 80) { if (protocol.equals("http") && port == 80) {
return null; return null;
} } else if (protocol.equals("https") && port == 443) {
else if (protocol.equals("https") && port == 443) {
return null; return null;
} }
return port; return port;
} }
/** Creates a new builder with no components set, used to assemble an {@link EdgeUrl}. */
public static EdgeUrlBuilder builder() {
return new EdgeUrlBuilder();
}
public String toString() { public String toString() {
StringBuilder sb = new StringBuilder(256); StringBuilder sb = new StringBuilder(256);
@ -190,12 +183,13 @@ public class EdgeUrl implements Serializable {
public String dir() { public String dir() {
return path.replaceAll("/[^/]+$", "/"); return path.replaceAll("/[^/]+$", "/");
} }
public String fileName() { public String fileName() {
return path.replaceAll(".*/", ""); return path.replaceAll(".*/", "");
} }
public int depth() { public int depth() {
return (int) path.chars().filter(c -> c=='/').count(); return (int) path.chars().filter(c -> c == '/').count();
} }
public EdgeUrl withPathAndParam(String path, String param) { public EdgeUrl withPathAndParam(String path, String param) {
@ -207,8 +201,8 @@ public class EdgeUrl implements Serializable {
if (other == this) return true; if (other == this) return true;
if (other instanceof EdgeUrl e) { if (other instanceof EdgeUrl e) {
return Objects.equals(e.domain, domain) return Objects.equals(e.domain, domain)
&& Objects.equals(e.path, path) && Objects.equals(e.path, path)
&& Objects.equals(e.param, param); && Objects.equals(e.param, param);
} }
return true; return true;
@ -235,8 +229,7 @@ public class EdgeUrl implements Serializable {
public URL asURL() throws MalformedURLException { public URL asURL() throws MalformedURLException {
try { try {
return asURI().toURL(); return asURI().toURL();
} } catch (URISyntaxException e) {
catch (URISyntaxException e) {
throw new MalformedURLException(e.getMessage()); throw new MalformedURLException(e.getMessage());
} }
} }
@ -248,4 +241,68 @@ public class EdgeUrl implements Serializable {
return new URI(this.proto, this.domain.toString(), this.path, this.param, null); return new URI(this.proto, this.domain.toString(), this.path, this.param, null);
} }
/** Returns the value of the {@code proto} field. */
public String getProto() {
return this.proto;
}
/** Returns the value of the {@code domain} field. */
public EdgeDomain getDomain() {
return this.domain;
}
/** Returns the value of the {@code port} field; this is a boxed {@code Integer} and may be {@code null}. */
public Integer getPort() {
return this.port;
}
/** Returns the value of the {@code path} field. */
public String getPath() {
return this.path;
}
/** Returns the value of the {@code param} field; may be {@code null}. */
public String getParam() {
return this.param;
}
/**
 * Fluent builder for {@link EdgeUrl}.
 *
 * <p>{@link #build()} passes the stored components to the five-argument {@code EdgeUrl}
 * constructor unchanged. The builder performs no validation, so a component that was
 * never set is passed on as {@code null}.
 */
public static class EdgeUrlBuilder {
    private String proto;
    private EdgeDomain domain;
    private Integer port;
    private String path;
    private String param;

    /** Package-private: instances are obtained through {@code EdgeUrl.builder()}. */
    EdgeUrlBuilder() {
    }

    /** Sets the protocol component. */
    public EdgeUrlBuilder proto(String proto) {
        this.proto = proto;
        return this;
    }

    /** Sets the domain component. */
    public EdgeUrlBuilder domain(EdgeDomain domain) {
        this.domain = domain;
        return this;
    }

    /** Sets the port component. */
    public EdgeUrlBuilder port(Integer port) {
        this.port = port;
        return this;
    }

    /** Sets the path component. */
    public EdgeUrlBuilder path(String path) {
        this.path = path;
        return this;
    }

    /** Sets the query parameter component. */
    public EdgeUrlBuilder param(String param) {
        this.param = param;
        return this;
    }

    /**
     * Builds the {@link EdgeUrl} from the values collected so far.
     *
     * @return a new instance created from proto, domain, port, path and param, in that order
     */
    public EdgeUrl build() {
        return new EdgeUrl(proto, domain, port, path, param);
    }

    /** Returns a debug representation listing every field currently held by the builder. */
    @Override
    public String toString() {
        return "EdgeUrl.EdgeUrlBuilder(proto=" + proto
                + ", domain=" + domain
                + ", port=" + port
                + ", path=" + path
                + ", param=" + param + ")";
    }
}
} }

View File

@ -1,8 +1,8 @@
package nu.marginalia.process.log; package nu.marginalia.process.log;
import lombok.SneakyThrows;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.util.Iterator; import java.util.Iterator;
@ -21,32 +21,36 @@ class WorkLoadIterable<T> implements Iterable<T> {
@NotNull @NotNull
@Override @Override
@SneakyThrows
public Iterator<T> iterator() { public Iterator<T> iterator() {
var stream = Files.lines(logFile); try {
return new Iterator<>() { var stream = Files.lines(logFile);
final Iterator<T> iter = stream return new Iterator<>() {
.filter(WorkLogEntry::isJobId) final Iterator<T> iter = stream
.map(WorkLogEntry::parse) .filter(WorkLogEntry::isJobId)
.map(mapper) .map(WorkLogEntry::parse)
.filter(Optional::isPresent) .map(mapper)
.map(Optional::get) .filter(Optional::isPresent)
.iterator(); .map(Optional::get)
.iterator();
@Override @Override
public boolean hasNext() { public boolean hasNext() {
if (iter.hasNext()) { if (iter.hasNext()) {
return true; return true;
} else { } else {
stream.close(); stream.close();
return false; return false;
}
} }
}
@Override @Override
public T next() { public T next() {
return iter.next(); return iter.next();
} }
}; };
}
catch (IOException ex) {
throw new RuntimeException(ex);
}
} }
} }

View File

@ -4,12 +4,12 @@ import com.github.jknack.handlebars.*;
import com.github.jknack.handlebars.helper.ConditionalHelpers; import com.github.jknack.handlebars.helper.ConditionalHelpers;
import com.github.jknack.handlebars.io.ClassPathTemplateLoader; import com.github.jknack.handlebars.io.ClassPathTemplateLoader;
import com.github.jknack.handlebars.io.TemplateLoader; import com.github.jknack.handlebars.io.TemplateLoader;
import lombok.SneakyThrows;
import nu.marginalia.renderer.config.HandlebarsConfigurator; import nu.marginalia.renderer.config.HandlebarsConfigurator;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.*; import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@ -42,22 +42,35 @@ public class MustacheRenderer<T> {
} }
} }
@SneakyThrows
public String render(T model) { public String render(T model) {
return template.apply(model); try {
return template.apply(model);
}
catch (IOException ex) {
throw new RuntimeException("Failed to render template", ex);
}
} }
@SneakyThrows
public <T2> String render(T model, String name, List<T2> children) { public <T2> String render(T model, String name, List<T2> children) {
Context ctx = Context.newBuilder(model).combine(name, children).build(); Context ctx = Context.newBuilder(model).combine(name, children).build();
return template.apply(ctx); try {
return template.apply(ctx);
}
catch (IOException ex) {
throw new RuntimeException("Failed to render template", ex);
}
} }
@SneakyThrows
public String render(T model, Map<String, ?> children) { public String render(T model, Map<String, ?> children) {
Context ctx = Context.newBuilder(model).combine(children).build(); Context ctx = Context.newBuilder(model).combine(children).build();
return template.apply(ctx);
try {
return template.apply(ctx);
}
catch (IOException ex) {
throw new RuntimeException("Failed to render template", ex);
}
} }
} }

View File

@ -2,7 +2,6 @@ package nu.marginalia.service;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -26,7 +25,6 @@ public class NodeConfigurationWatcher {
watcherThread.start(); watcherThread.start();
} }
@SneakyThrows
private void pollConfiguration() { private void pollConfiguration() {
for (;;) { for (;;) {
List<Integer> goodNodes = new ArrayList<>(); List<Integer> goodNodes = new ArrayList<>();
@ -34,7 +32,7 @@ public class NodeConfigurationWatcher {
try (var conn = dataSource.getConnection()) { try (var conn = dataSource.getConnection()) {
var stmt = conn.prepareStatement(""" var stmt = conn.prepareStatement("""
SELECT ID FROM NODE_CONFIGURATION SELECT ID FROM NODE_CONFIGURATION
WHERE ACCEPT_QUERIES AND NOT DISABLED WHERE ACCEPT_QUERIES AND NOT DISABLED
"""); """);
var rs = stmt.executeQuery(); var rs = stmt.executeQuery();
while (rs.next()) { while (rs.next()) {
@ -47,7 +45,12 @@ public class NodeConfigurationWatcher {
queryNodes = goodNodes; queryNodes = goodNodes;
TimeUnit.SECONDS.sleep(10); try {
TimeUnit.SECONDS.sleep(10);
}
catch (InterruptedException ex) {
return;
}
} }
} }

View File

@ -4,13 +4,13 @@ import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import io.grpc.ManagedChannel; import io.grpc.ManagedChannel;
import io.grpc.ManagedChannelBuilder; import io.grpc.ManagedChannelBuilder;
import nu.marginalia.util.NamedExecutorFactory;
import nu.marginalia.service.NodeConfigurationWatcher; import nu.marginalia.service.NodeConfigurationWatcher;
import nu.marginalia.service.discovery.ServiceRegistryIf; import nu.marginalia.service.discovery.ServiceRegistryIf;
import nu.marginalia.service.discovery.property.PartitionTraits; import nu.marginalia.service.discovery.property.PartitionTraits;
import nu.marginalia.service.discovery.property.ServiceEndpoint.InstanceAddress; import nu.marginalia.service.discovery.property.ServiceEndpoint.InstanceAddress;
import nu.marginalia.service.discovery.property.ServiceKey; import nu.marginalia.service.discovery.property.ServiceKey;
import nu.marginalia.service.discovery.property.ServicePartition; import nu.marginalia.service.discovery.property.ServicePartition;
import nu.marginalia.util.NamedExecutorFactory;
import java.util.concurrent.Executor; import java.util.concurrent.Executor;
import java.util.function.Function; import java.util.function.Function;
@ -48,7 +48,12 @@ public class GrpcChannelPoolFactory {
public <STUB> GrpcSingleNodeChannelPool<STUB> createSingle(ServiceKey<? extends PartitionTraits.Unicast> key, public <STUB> GrpcSingleNodeChannelPool<STUB> createSingle(ServiceKey<? extends PartitionTraits.Unicast> key,
Function<ManagedChannel, STUB> stubConstructor) Function<ManagedChannel, STUB> stubConstructor)
{ {
return new GrpcSingleNodeChannelPool<>(serviceRegistryIf, key, this::createChannel, stubConstructor); try {
return new GrpcSingleNodeChannelPool<>(serviceRegistryIf, key, this::createChannel, stubConstructor);
}
catch (Exception e) {
throw new RuntimeException(e);
}
} }
private ManagedChannel createChannel(InstanceAddress route) { private ManagedChannel createChannel(InstanceAddress route) {

View File

@ -1,7 +1,6 @@
package nu.marginalia.service.client; package nu.marginalia.service.client;
import io.grpc.ManagedChannel; import io.grpc.ManagedChannel;
import lombok.SneakyThrows;
import nu.marginalia.service.NodeConfigurationWatcher; import nu.marginalia.service.NodeConfigurationWatcher;
import nu.marginalia.service.discovery.ServiceRegistryIf; import nu.marginalia.service.discovery.ServiceRegistryIf;
import nu.marginalia.service.discovery.property.PartitionTraits; import nu.marginalia.service.discovery.property.PartitionTraits;
@ -12,7 +11,10 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.util.List; import java.util.List;
import java.util.concurrent.*; import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Executor;
import java.util.concurrent.ExecutorService;
import java.util.function.BiFunction; import java.util.function.BiFunction;
import java.util.function.Function; import java.util.function.Function;
@ -29,7 +31,6 @@ public class GrpcMultiNodeChannelPool<STUB> {
private final Function<ManagedChannel, STUB> stubConstructor; private final Function<ManagedChannel, STUB> stubConstructor;
private final NodeConfigurationWatcher nodeConfigurationWatcher; private final NodeConfigurationWatcher nodeConfigurationWatcher;
@SneakyThrows
public GrpcMultiNodeChannelPool(ServiceRegistryIf serviceRegistryIf, public GrpcMultiNodeChannelPool(ServiceRegistryIf serviceRegistryIf,
ServiceKey<ServicePartition.Multi> serviceKey, ServiceKey<ServicePartition.Multi> serviceKey,
Function<ServiceEndpoint.InstanceAddress, ManagedChannel> channelConstructor, Function<ServiceEndpoint.InstanceAddress, ManagedChannel> channelConstructor,
@ -52,11 +53,16 @@ public class GrpcMultiNodeChannelPool<STUB> {
} }
private GrpcSingleNodeChannelPool<STUB> newSingleChannelPool(int node) { private GrpcSingleNodeChannelPool<STUB> newSingleChannelPool(int node) {
return new GrpcSingleNodeChannelPool<>( try {
serviceRegistryIf, return new GrpcSingleNodeChannelPool<>(
serviceKey.forPartition(ServicePartition.partition(node)), serviceRegistryIf,
channelConstructor, serviceKey.forPartition(ServicePartition.partition(node)),
stubConstructor); channelConstructor,
stubConstructor);
}
catch (Exception e) {
throw new RuntimeException(e);
}
} }
/** Get the list of nodes that are eligible for broadcast-style requests */ /** Get the list of nodes that are eligible for broadcast-style requests */

View File

@ -2,7 +2,6 @@ package nu.marginalia.service.client;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import io.grpc.ManagedChannel; import io.grpc.ManagedChannel;
import lombok.SneakyThrows;
import nu.marginalia.service.discovery.ServiceRegistryIf; import nu.marginalia.service.discovery.ServiceRegistryIf;
import nu.marginalia.service.discovery.monitor.ServiceChangeMonitor; import nu.marginalia.service.discovery.monitor.ServiceChangeMonitor;
import nu.marginalia.service.discovery.property.PartitionTraits; import nu.marginalia.service.discovery.property.PartitionTraits;
@ -34,11 +33,12 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
private final Function<ManagedChannel, STUB> stubConstructor; private final Function<ManagedChannel, STUB> stubConstructor;
@SneakyThrows
public GrpcSingleNodeChannelPool(ServiceRegistryIf serviceRegistryIf, public GrpcSingleNodeChannelPool(ServiceRegistryIf serviceRegistryIf,
ServiceKey<? extends PartitionTraits.Unicast> serviceKey, ServiceKey<? extends PartitionTraits.Unicast> serviceKey,
Function<InstanceAddress, ManagedChannel> channelConstructor, Function<InstanceAddress, ManagedChannel> channelConstructor,
Function<ManagedChannel, STUB> stubConstructor) { Function<ManagedChannel, STUB> stubConstructor)
throws Exception
{
super(serviceKey); super(serviceKey);
this.serviceRegistryIf = serviceRegistryIf; this.serviceRegistryIf = serviceRegistryIf;

View File

@ -2,7 +2,6 @@ package nu.marginalia.service.discovery;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import lombok.SneakyThrows;
import nu.marginalia.service.discovery.monitor.ServiceMonitorIf; import nu.marginalia.service.discovery.monitor.ServiceMonitorIf;
import nu.marginalia.service.discovery.property.ServiceEndpoint; import nu.marginalia.service.discovery.property.ServiceEndpoint;
import nu.marginalia.service.discovery.property.ServiceKey; import nu.marginalia.service.discovery.property.ServiceKey;
@ -40,18 +39,22 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
private final List<String> livenessPaths = new ArrayList<>(); private final List<String> livenessPaths = new ArrayList<>();
/**
 * Starts the supplied Curator client, waits up to 30 seconds for the connection,
 * and registers a JVM shutdown hook that calls {@code shutDown()}.
 *
 * @param curatorFramework the Curator client; {@code start()} is invoked on it here
 * @throws RuntimeException if the connection is not established in time or start-up
 *         fails for any other reason; if the cause is an interruption, the thread's
 *         interrupt flag is restored before the exception is thrown
 */
@Inject
public ZkServiceRegistry(CuratorFramework curatorFramework) {
    try {
        this.curatorFramework = curatorFramework;

        curatorFramework.start();
        if (!curatorFramework.blockUntilConnected(30, TimeUnit.SECONDS)) {
            throw new IllegalStateException("Failed to connect to zookeeper after 30s");
        }

        Runtime.getRuntime().addShutdownHook(
                new Thread(this::shutDown, "ZkServiceRegistry shutdown hook")
        );
    }
    catch (Exception ex) {
        // Do not swallow the interruption: callers further up may rely on the flag
        if (ex instanceof InterruptedException) {
            Thread.currentThread().interrupt();
        }
        throw new RuntimeException("Failed to start ZkServiceRegistry", ex);
    }
}
@Override @Override
@ -75,14 +78,18 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
return endpoint; return endpoint;
} }
@SneakyThrows
@Override @Override
public void declareFirstBoot() { public void declareFirstBoot() {
if (!isFirstBoot()) { if (!isFirstBoot()) {
curatorFramework.create() try {
.creatingParentsIfNeeded() curatorFramework.create()
.withMode(CreateMode.PERSISTENT) .creatingParentsIfNeeded()
.forPath("/first-boot"); .withMode(CreateMode.PERSISTENT)
.forPath("/first-boot");
}
catch (Exception ex) {
logger.error("Failed to declare first-boot", ex);
}
} }
} }

View File

@ -5,14 +5,12 @@ import com.google.inject.Provides;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariConfig; import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.WmsaHome; import nu.marginalia.WmsaHome;
import org.flywaydb.core.Flyway; import org.flywaydb.core.Flyway;
import org.mariadb.jdbc.Driver; import org.mariadb.jdbc.Driver;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import javax.sql.DataSource;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
@ -71,14 +69,12 @@ public class DatabaseModule extends AbstractModule {
} }
@SneakyThrows
@Singleton @Singleton
@Provides @Provides
public HikariDataSource provideConnection() { public HikariDataSource provideConnection() {
return getMariaDB(); return getMariaDB();
} }
@SneakyThrows
private HikariDataSource getMariaDB() { private HikariDataSource getMariaDB() {
var connStr = System.getProperty("db.overrideJdbc", dbProperties.getProperty(DB_CONN_KEY)); var connStr = System.getProperty("db.overrideJdbc", dbProperties.getProperty(DB_CONN_KEY));

View File

@ -1,7 +1,6 @@
package nu.marginalia.service.server; package nu.marginalia.service.server;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import lombok.SneakyThrows;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -59,13 +58,17 @@ public class Initialization {
} }
} }
@SneakyThrows
public boolean waitReady() { public boolean waitReady() {
synchronized (this) { try {
while (!initialized) { synchronized (this) {
wait(); while (!initialized) {
wait();
}
return true;
} }
return true; }
catch (InterruptedException ex) {
throw new RuntimeException("Interrupted while waiting for initialization", ex);
} }
} }
} }

View File

@ -2,7 +2,6 @@ package nu.marginalia.service.server;
import com.google.inject.Inject; import com.google.inject.Inject;
import io.prometheus.client.exporter.MetricsServlet; import io.prometheus.client.exporter.MetricsServlet;
import lombok.SneakyThrows;
import nu.marginalia.service.module.ServiceConfiguration; import nu.marginalia.service.module.ServiceConfiguration;
import org.eclipse.jetty.server.Server; import org.eclipse.jetty.server.Server;
import org.eclipse.jetty.servlet.ServletContextHandler; import org.eclipse.jetty.servlet.ServletContextHandler;
@ -10,9 +9,8 @@ import org.eclipse.jetty.servlet.ServletHolder;
public class MetricsServer { public class MetricsServer {
@SneakyThrows
@Inject @Inject
public MetricsServer(ServiceConfiguration configuration) { public MetricsServer(ServiceConfiguration configuration) throws Exception {
// If less than zero, we forego setting up a metrics server // If less than zero, we forego setting up a metrics server
if (configuration.metricsPort() < 0) if (configuration.metricsPort() < 0)
return; return;

View File

@ -1,8 +1,7 @@
package nu.marginalia.service.server; package nu.marginalia.service.server;
import com.google.inject.name.Named;
import com.google.inject.Inject; import com.google.inject.Inject;
import lombok.SneakyThrows; import com.google.inject.name.Named;
import nu.marginalia.mq.persistence.MqPersistence; import nu.marginalia.mq.persistence.MqPersistence;
import nu.marginalia.nodecfg.NodeConfigurationService; import nu.marginalia.nodecfg.NodeConfigurationService;
import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.FileStorageService;
@ -81,10 +80,14 @@ public class NodeStatusWatcher {
} }
} }
@SneakyThrows
private boolean isConfigured() { private boolean isConfigured() {
var configuration = configurationService.get(nodeId); try {
return configuration != null; var configuration = configurationService.get(nodeId);
return configuration != null;
}
catch (SQLException ex) {
throw new RuntimeException(ex);
}
} }
/** Look for changes in the configuration and kill the service if the corresponding /** Look for changes in the configuration and kill the service if the corresponding

View File

@ -1,7 +1,6 @@
package nu.marginalia.service.server; package nu.marginalia.service.server;
import io.prometheus.client.Counter; import io.prometheus.client.Counter;
import lombok.SneakyThrows;
import nu.marginalia.mq.inbox.MqInboxIf; import nu.marginalia.mq.inbox.MqInboxIf;
import nu.marginalia.service.client.ServiceNotAvailableException; import nu.marginalia.service.client.ServiceNotAvailableException;
import nu.marginalia.service.discovery.property.ServiceKey; import nu.marginalia.service.discovery.property.ServiceKey;
@ -44,11 +43,10 @@ public class Service {
private final int node; private final int node;
private GrpcServer grpcServer; private GrpcServer grpcServer;
@SneakyThrows
public Service(BaseServiceParams params, public Service(BaseServiceParams params,
Runnable configureStaticFiles, Runnable configureStaticFiles,
ServicePartition partition, ServicePartition partition,
List<DiscoverableService> grpcServices) { List<DiscoverableService> grpcServices) throws Exception {
this.initialization = params.initialization; this.initialization = params.initialization;
var config = params.configuration; var config = params.configuration;
@ -130,14 +128,14 @@ public class Service {
/**
 * Convenience constructor that delegates to the four-argument constructor,
 * passing {@code Service::defaultSparkConfig} as the static-file configuration.
 *
 * @throws Exception propagated from the delegated constructor
 */
public Service(BaseServiceParams params,
               ServicePartition partition,
               List<DiscoverableService> grpcServices) throws Exception {
    this(params, Service::defaultSparkConfig, partition, grpcServices);
}
public Service(BaseServiceParams params) { public Service(BaseServiceParams params) throws Exception {
this(params, this(params,
Service::defaultSparkConfig, Service::defaultSparkConfig,
ServicePartition.any(), ServicePartition.any(),

View File

@ -1,20 +1,18 @@
package nu.marginalia.service.server; package nu.marginalia.service.server;
import lombok.SneakyThrows;
import spark.Request; import spark.Request;
import spark.Response; import spark.Response;
import spark.Spark; import spark.Spark;
import spark.resource.ClassPathResource; import spark.resource.ClassPathResource;
import spark.staticfiles.MimeType; import spark.staticfiles.MimeType;
import java.io.FileNotFoundException; import java.io.IOException;
import java.time.LocalDateTime; import java.time.LocalDateTime;
import java.time.ZoneOffset; import java.time.ZoneOffset;
public class StaticResources { public class StaticResources {
private final long startTime = LocalDateTime.now().toEpochSecond(ZoneOffset.UTC); private final long startTime = LocalDateTime.now().toEpochSecond(ZoneOffset.UTC);
@SneakyThrows
public void serveStatic(String domain, String path, Request req, Response rsp) { public void serveStatic(String domain, String path, Request req, Response rsp) {
try { try {
if (path.startsWith("..") || domain.startsWith("..")) { if (path.startsWith("..") || domain.startsWith("..")) {
@ -28,7 +26,7 @@ public class StaticResources {
resource.getInputStream().transferTo(rsp.raw().getOutputStream()); resource.getInputStream().transferTo(rsp.raw().getOutputStream());
} }
catch (IllegalArgumentException | FileNotFoundException ex) { catch (IllegalArgumentException | IOException ex) {
Spark.halt(404); Spark.halt(404);
} }
} }
@ -57,7 +55,6 @@ public class StaticResources {
return "application/octet-stream"; return "application/octet-stream";
} }
@SneakyThrows
private void handleEtagStatic(ClassPathResource resource, Request req, Response rsp) { private void handleEtagStatic(ClassPathResource resource, Request req, Response rsp) {
rsp.header("Cache-Control", "public,max-age=3600"); rsp.header("Cache-Control", "public,max-age=3600");
rsp.type(MimeType.fromResource(resource)); rsp.type(MimeType.fromResource(resource));

View File

@ -2,7 +2,6 @@ package nu.marginalia.extractor;
import com.google.inject.Inject; import com.google.inject.Inject;
import gnu.trove.set.hash.TLongHashSet; import gnu.trove.set.hash.TLongHashSet;
import lombok.SneakyThrows;
import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.io.CrawledDomainReader; import nu.marginalia.io.CrawledDomainReader;
import nu.marginalia.io.SerializableCrawlDataStream; import nu.marginalia.io.SerializableCrawlDataStream;
@ -101,10 +100,14 @@ public class AtagExporter implements ExporterIf {
continue; continue;
} }
var linkOpt = linkParser.parseLinkPermissive(baseUrl, atag); var linkOpt = linkParser
linkOpt .parseLinkPermissive(baseUrl, atag)
.filter(url -> linkFilter.isEligible(url, baseUrl, linkText)) .filter(url -> linkFilter.isEligible(url, baseUrl, linkText));
.ifPresent(url -> exporter.accept(url, baseUrl.domain, linkText));
if (linkOpt.isPresent()) {
var url = linkOpt.get();
exporter.accept(url, baseUrl.domain, linkText);
}
} }
} }
@ -167,8 +170,7 @@ public class AtagExporter implements ExporterIf {
this.writer = writer; this.writer = writer;
} }
@SneakyThrows public void accept(EdgeUrl url, EdgeDomain sourceDomain, String linkText) throws IOException {
public void accept(EdgeUrl url, EdgeDomain sourceDomain, String linkText) {
final String urlString = urlWithNoSchema(url); final String urlString = urlWithNoSchema(url);
writer.write(String.format("\"%s\",\"%s\",\"%s\"\n", writer.write(String.format("\"%s\",\"%s\",\"%s\"\n",

View File

@ -1,7 +1,6 @@
package nu.marginalia.extractor; package nu.marginalia.extractor;
import com.google.inject.Inject; import com.google.inject.Inject;
import lombok.SneakyThrows;
import nu.marginalia.io.CrawledDomainReader; import nu.marginalia.io.CrawledDomainReader;
import nu.marginalia.io.SerializableCrawlDataStream; import nu.marginalia.io.SerializableCrawlDataStream;
import nu.marginalia.link_parser.FeedExtractor; import nu.marginalia.link_parser.FeedExtractor;
@ -115,12 +114,16 @@ public class FeedExporter implements ExporterIf {
this.writer = writer; this.writer = writer;
} }
@SneakyThrows
public void accept(EdgeDomain domain, int size, EdgeUrl path) { public void accept(EdgeDomain domain, int size, EdgeUrl path) {
writer.write(String.format("\"%s\",\"%s\",\"%s\"\n", try {
csvify(domain), writer.write(String.format("\"%s\",\"%s\",\"%s\"\n",
csvify(size), csvify(domain),
csvify(path))); csvify(size),
csvify(path)));
}
catch (IOException ex) {
throw new RuntimeException(ex);
}
} }
private static String csvify(Object field) { private static String csvify(Object field) {

View File

@ -2,7 +2,6 @@ package nu.marginalia.actor;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import lombok.SneakyThrows;
import nu.marginalia.actor.monitor.FileStorageMonitorActor; import nu.marginalia.actor.monitor.FileStorageMonitorActor;
import nu.marginalia.actor.proc.*; import nu.marginalia.actor.proc.*;
import nu.marginalia.actor.prototype.ActorPrototype; import nu.marginalia.actor.prototype.ActorPrototype;
@ -13,6 +12,8 @@ import nu.marginalia.actor.task.*;
import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.MessageQueueFactory;
import nu.marginalia.service.control.ServiceEventLog; import nu.marginalia.service.control.ServiceEventLog;
import nu.marginalia.service.server.BaseServiceParams; import nu.marginalia.service.server.BaseServiceParams;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;
@ -27,6 +28,8 @@ public class ExecutorActorControlService {
public Map<ExecutorActor, ActorPrototype> actorDefinitions = new HashMap<>(); public Map<ExecutorActor, ActorPrototype> actorDefinitions = new HashMap<>();
private final int node; private final int node;
private final Logger logger = LoggerFactory.getLogger(getClass());
@Inject @Inject
public ExecutorActorControlService(MessageQueueFactory messageQueueFactory, public ExecutorActorControlService(MessageQueueFactory messageQueueFactory,
BaseServiceParams baseServiceParams, BaseServiceParams baseServiceParams,
@ -119,11 +122,15 @@ public class ExecutorActorControlService {
stateMachines.startFromJSON(process, state, json); stateMachines.startFromJSON(process, state, json);
} }
@SneakyThrows
public void stop(ExecutorActor process) { public void stop(ExecutorActor process) {
eventLog.logEvent("FSM-STOP", process.id()); eventLog.logEvent("FSM-STOP", process.id());
stateMachines.stop(process); try {
stateMachines.stop(process);
}
catch (Exception e) {
logger.error("Failed to stop FSM", e);
}
} }
public Map<ExecutorActor, ActorStateInstance> getActorStates() { public Map<ExecutorActor, ActorStateInstance> getActorStates() {

View File

@ -3,9 +3,6 @@ package nu.marginalia.actor.task;
import com.google.gson.Gson; import com.google.gson.Gson;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import lombok.AllArgsConstructor;
import lombok.NoArgsConstructor;
import lombok.With;
import nu.marginalia.IndexLocations; import nu.marginalia.IndexLocations;
import nu.marginalia.actor.prototype.RecordActorPrototype; import nu.marginalia.actor.prototype.RecordActorPrototype;
import nu.marginalia.actor.state.ActorResumeBehavior; import nu.marginalia.actor.state.ActorResumeBehavior;
@ -40,7 +37,6 @@ import java.util.List;
public class ConvertAndLoadActor extends RecordActorPrototype { public class ConvertAndLoadActor extends RecordActorPrototype {
// STATES // STATES
public static final String RERANK = "RERANK"; public static final String RERANK = "RERANK";
private final ActorProcessWatcher processWatcher; private final ActorProcessWatcher processWatcher;
private final MqOutbox mqConverterOutbox; private final MqOutbox mqConverterOutbox;
@ -54,15 +50,6 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
private final int nodeId; private final int nodeId;
private final Logger logger = LoggerFactory.getLogger(getClass()); private final Logger logger = LoggerFactory.getLogger(getClass());
@AllArgsConstructor @With @NoArgsConstructor
public static class Message {
public FileStorageId crawlStorageId = null;
public List<FileStorageId> processedStorageId = null;
public long converterMsgId = 0L;
public long loaderMsgId = 0L;
}
public record Initial(FileStorageId fid) implements ActorStep {} public record Initial(FileStorageId fid) implements ActorStep {}
@Resume(behavior = ActorResumeBehavior.RETRY) @Resume(behavior = ActorResumeBehavior.RETRY)

View File

@ -2,7 +2,6 @@ package nu.marginalia.execution;
import com.google.inject.Inject; import com.google.inject.Inject;
import io.grpc.stub.StreamObserver; import io.grpc.stub.StreamObserver;
import lombok.SneakyThrows;
import nu.marginalia.WmsaHome; import nu.marginalia.WmsaHome;
import nu.marginalia.actor.ActorApi; import nu.marginalia.actor.ActorApi;
import nu.marginalia.actor.ExecutorActor; import nu.marginalia.actor.ExecutorActor;
@ -228,13 +227,17 @@ public class ExecutorGrpcService
} }
} }
@SneakyThrows
private RpcFileStorageEntry createFileModel(Path path) { private RpcFileStorageEntry createFileModel(Path path) {
return RpcFileStorageEntry.newBuilder() try {
.setName(path.toFile().getName()) return RpcFileStorageEntry.newBuilder()
.setSize(Files.size(path)) .setName(path.toFile().getName())
.setLastModifiedTime(Files.getLastModifiedTime(path).toInstant().toString()) .setSize(Files.size(path))
.build(); .setLastModifiedTime(Files.getLastModifiedTime(path).toInstant().toString())
.build();
}
catch (Exception e) {
throw new RuntimeException(e);
}
} }

View File

@ -3,7 +3,6 @@ package nu.marginalia.screenshot;
import com.google.common.base.Strings; import com.google.common.base.Strings;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.db.DbDomainQueries; import nu.marginalia.db.DbDomainQueries;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -11,6 +10,7 @@ import org.slf4j.LoggerFactory;
import spark.Request; import spark.Request;
import spark.Response; import spark.Response;
import java.io.IOException;
import java.sql.SQLException; import java.sql.SQLException;
import static java.lang.Integer.parseInt; import static java.lang.Integer.parseInt;
@ -48,7 +48,6 @@ public class ScreenshotService {
return false; return false;
} }
@SneakyThrows
public Object serveScreenshotRequest(Request request, Response response) { public Object serveScreenshotRequest(Request request, Response response) {
if (Strings.isNullOrEmpty(request.params("id"))) { if (Strings.isNullOrEmpty(request.params("id"))) {
response.redirect("https://search.marginalia.nu/"); response.redirect("https://search.marginalia.nu/");
@ -75,6 +74,9 @@ public class ScreenshotService {
return ""; return "";
} }
} }
catch (IOException ex) {
logger.warn("IO error", ex);
}
catch (SQLException ex) { catch (SQLException ex) {
logger.warn("SQL error", ex); logger.warn("SQL error", ex);
} }

View File

@ -1,9 +1,9 @@
package nu.marginalia.api.domains; package nu.marginalia.api.domains;
import lombok.SneakyThrows; import nu.marginalia.api.domains.model.DomainInformation;
import nu.marginalia.api.domains.model.SimilarDomain;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.api.domains.model.*;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
@ -28,18 +28,22 @@ public class DomainsProtobufCodec {
return ret; return ret;
} }
@SneakyThrows
private static SimilarDomain convertResponseEntry(RpcSimilarDomain sd) { private static SimilarDomain convertResponseEntry(RpcSimilarDomain sd) {
return new SimilarDomain( try {
new EdgeUrl(sd.getUrl()), return new SimilarDomain(
sd.getDomainId(), new EdgeUrl(sd.getUrl()),
sd.getRelatedness(), sd.getDomainId(),
sd.getRank(), sd.getRelatedness(),
sd.getIndexed(), sd.getRank(),
sd.getActive(), sd.getIndexed(),
sd.getScreenshot(), sd.getActive(),
SimilarDomain.LinkType.valueOf(sd.getLinkType().name()) sd.getScreenshot(),
); SimilarDomain.LinkType.valueOf(sd.getLinkType().name())
);
}
catch (Exception ex) {
throw new RuntimeException(ex);
}
} }
} }

View File

@ -1,10 +1,7 @@
package nu.marginalia.api.domains.model; package nu.marginalia.api.domains.model;
import lombok.*;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
@Getter @AllArgsConstructor @NoArgsConstructor @Builder
@ToString
public class DomainInformation { public class DomainInformation {
EdgeDomain domain; EdgeDomain domain;
@ -29,6 +26,34 @@ public class DomainInformation {
String ipCountry; String ipCountry;
String state; String state;
/** All-arguments constructor; each argument is stored in the field of the same name. */
public DomainInformation(EdgeDomain domain,
                         boolean blacklisted,
                         int pagesKnown,
                         int pagesFetched,
                         int pagesIndexed,
                         int incomingLinks,
                         int outboundLinks,
                         int nodeAffinity,
                         double ranking,
                         boolean suggestForCrawling,
                         boolean inCrawlQueue,
                         boolean unknownDomain,
                         String ip,
                         Integer asn,
                         String asnOrg,
                         String asnCountry,
                         String ipCountry,
                         String state)
{
    this.domain = domain;
    this.blacklisted = blacklisted;

    this.pagesKnown = pagesKnown;
    this.pagesFetched = pagesFetched;
    this.pagesIndexed = pagesIndexed;

    this.incomingLinks = incomingLinks;
    this.outboundLinks = outboundLinks;

    this.nodeAffinity = nodeAffinity;
    this.ranking = ranking;

    this.suggestForCrawling = suggestForCrawling;
    this.inCrawlQueue = inCrawlQueue;
    this.unknownDomain = unknownDomain;

    this.ip = ip;
    this.asn = asn;
    this.asnOrg = asnOrg;
    this.asnCountry = asnCountry;
    this.ipCountry = ipCountry;
    this.state = state;
}
/** No-argument constructor; every field keeps its Java default value. */
public DomainInformation() {
    // intentionally empty
}
/** @return a fresh, empty {@code DomainInformationBuilder} */
public static DomainInformationBuilder builder() {
    final DomainInformationBuilder emptyBuilder = new DomainInformationBuilder();
    return emptyBuilder;
}
public String getIpFlag() { public String getIpFlag() {
if (ipCountry == null || ipCountry.codePointCount(0, ipCountry.length()) != 2) { if (ipCountry == null || ipCountry.codePointCount(0, ipCountry.length()) != 2) {
return ""; return "";
@ -45,4 +70,202 @@ public class DomainInformation {
int secondChar = Character.codePointAt(country, 1) - asciiOffset + offset; int secondChar = Character.codePointAt(country, 1) - asciiOffset + offset;
return new String(Character.toChars(firstChar)) + new String(Character.toChars(secondChar)); return new String(Character.toChars(firstChar)) + new String(Character.toChars(secondChar));
} }
/** @return the value of the {@code domain} field */
public EdgeDomain getDomain() {
    return domain;
}
/** @return the value of the {@code blacklisted} field */
public boolean isBlacklisted() {
    return blacklisted;
}
/** @return the value of the {@code pagesKnown} field */
public int getPagesKnown() {
    return pagesKnown;
}
/** @return the value of the {@code pagesFetched} field */
public int getPagesFetched() {
    return pagesFetched;
}
/** @return the value of the {@code pagesIndexed} field */
public int getPagesIndexed() {
    return pagesIndexed;
}
/** @return the value of the {@code incomingLinks} field */
public int getIncomingLinks() {
    return incomingLinks;
}
/** @return the value of the {@code outboundLinks} field */
public int getOutboundLinks() {
    return outboundLinks;
}
/** @return the value of the {@code nodeAffinity} field */
public int getNodeAffinity() {
    return nodeAffinity;
}
/** @return the value of the {@code ranking} field */
public double getRanking() {
    return ranking;
}
/** @return the value of the {@code suggestForCrawling} field */
public boolean isSuggestForCrawling() {
    return suggestForCrawling;
}
/** @return the value of the {@code inCrawlQueue} field */
public boolean isInCrawlQueue() {
    return inCrawlQueue;
}
/** @return the value of the {@code unknownDomain} field */
public boolean isUnknownDomain() {
    return unknownDomain;
}
/** @return the value of the {@code ip} field, possibly {@code null} */
public String getIp() {
    return ip;
}
/** @return the value of the boxed {@code asn} field, possibly {@code null} */
public Integer getAsn() {
    return asn;
}
/** @return the value of the {@code asnOrg} field, possibly {@code null} */
public String getAsnOrg() {
    return asnOrg;
}
/** @return the value of the {@code asnCountry} field, possibly {@code null} */
public String getAsnCountry() {
    return asnCountry;
}
/** @return the value of the {@code ipCountry} field, possibly {@code null} */
public String getIpCountry() {
    return ipCountry;
}
/** @return the value of the {@code state} field, possibly {@code null} */
public String getState() {
    return state;
}
/**
 * Renders all eighteen properties as {@code DomainInformation(name=value, ...)}.
 * Values are read through the accessors rather than the fields.
 */
@Override
public String toString() {
    return "DomainInformation(domain=" + this.getDomain()
            + ", blacklisted=" + this.isBlacklisted()
            + ", pagesKnown=" + this.getPagesKnown()
            + ", pagesFetched=" + this.getPagesFetched()
            + ", pagesIndexed=" + this.getPagesIndexed()
            + ", incomingLinks=" + this.getIncomingLinks()
            + ", outboundLinks=" + this.getOutboundLinks()
            + ", nodeAffinity=" + this.getNodeAffinity()
            + ", ranking=" + this.getRanking()
            + ", suggestForCrawling=" + this.isSuggestForCrawling()
            + ", inCrawlQueue=" + this.isInCrawlQueue()
            + ", unknownDomain=" + this.isUnknownDomain()
            + ", ip=" + this.getIp()
            + ", asn=" + this.getAsn()
            + ", asnOrg=" + this.getAsnOrg()
            + ", asnCountry=" + this.getAsnCountry()
            + ", ipCountry=" + this.getIpCountry()
            + ", state=" + this.getState()
            + ")";
}
/**
 * Fluent builder for {@code DomainInformation}.
 * <p>
 * Each setter stores its argument and returns this builder; properties that are
 * never set keep their Java defaults ({@code false}, {@code 0}, {@code 0.0} or {@code null}).
 */
public static class DomainInformationBuilder {
    private EdgeDomain domain;
    private boolean blacklisted;
    private int pagesKnown;
    private int pagesFetched;
    private int pagesIndexed;
    private int incomingLinks;
    private int outboundLinks;
    private int nodeAffinity;
    private double ranking;
    private boolean suggestForCrawling;
    private boolean inCrawlQueue;
    private boolean unknownDomain;
    private String ip;
    private Integer asn;
    private String asnOrg;
    private String asnCountry;
    private String ipCountry;
    private String state;

    /** Package-private; obtain instances through {@code DomainInformation.builder()}. */
    DomainInformationBuilder() {
    }

    public DomainInformationBuilder domain(EdgeDomain domain) {
        this.domain = domain;
        return this;
    }

    public DomainInformationBuilder blacklisted(boolean blacklisted) {
        this.blacklisted = blacklisted;
        return this;
    }

    public DomainInformationBuilder pagesKnown(int pagesKnown) {
        this.pagesKnown = pagesKnown;
        return this;
    }

    public DomainInformationBuilder pagesFetched(int pagesFetched) {
        this.pagesFetched = pagesFetched;
        return this;
    }

    public DomainInformationBuilder pagesIndexed(int pagesIndexed) {
        this.pagesIndexed = pagesIndexed;
        return this;
    }

    public DomainInformationBuilder incomingLinks(int incomingLinks) {
        this.incomingLinks = incomingLinks;
        return this;
    }

    public DomainInformationBuilder outboundLinks(int outboundLinks) {
        this.outboundLinks = outboundLinks;
        return this;
    }

    public DomainInformationBuilder nodeAffinity(int nodeAffinity) {
        this.nodeAffinity = nodeAffinity;
        return this;
    }

    public DomainInformationBuilder ranking(double ranking) {
        this.ranking = ranking;
        return this;
    }

    public DomainInformationBuilder suggestForCrawling(boolean suggestForCrawling) {
        this.suggestForCrawling = suggestForCrawling;
        return this;
    }

    public DomainInformationBuilder inCrawlQueue(boolean inCrawlQueue) {
        this.inCrawlQueue = inCrawlQueue;
        return this;
    }

    public DomainInformationBuilder unknownDomain(boolean unknownDomain) {
        this.unknownDomain = unknownDomain;
        return this;
    }

    public DomainInformationBuilder ip(String ip) {
        this.ip = ip;
        return this;
    }

    public DomainInformationBuilder asn(Integer asn) {
        this.asn = asn;
        return this;
    }

    public DomainInformationBuilder asnOrg(String asnOrg) {
        this.asnOrg = asnOrg;
        return this;
    }

    public DomainInformationBuilder asnCountry(String asnCountry) {
        this.asnCountry = asnCountry;
        return this;
    }

    public DomainInformationBuilder ipCountry(String ipCountry) {
        this.ipCountry = ipCountry;
        return this;
    }

    public DomainInformationBuilder state(String state) {
        this.state = state;
        return this;
    }

    /** @return a new {@code DomainInformation} populated from the builder's current values */
    public DomainInformation build() {
        return new DomainInformation(this.domain, this.blacklisted, this.pagesKnown, this.pagesFetched,
                this.pagesIndexed, this.incomingLinks, this.outboundLinks, this.nodeAffinity, this.ranking,
                this.suggestForCrawling, this.inCrawlQueue, this.unknownDomain, this.ip, this.asn,
                this.asnOrg, this.asnCountry, this.ipCountry, this.state);
    }

    @Override
    public String toString() {
        return "DomainInformation.DomainInformationBuilder(domain=" + this.domain
                + ", blacklisted=" + this.blacklisted
                + ", pagesKnown=" + this.pagesKnown
                + ", pagesFetched=" + this.pagesFetched
                + ", pagesIndexed=" + this.pagesIndexed
                + ", incomingLinks=" + this.incomingLinks
                + ", outboundLinks=" + this.outboundLinks
                + ", nodeAffinity=" + this.nodeAffinity
                + ", ranking=" + this.ranking
                + ", suggestForCrawling=" + this.suggestForCrawling
                + ", inCrawlQueue=" + this.inCrawlQueue
                + ", unknownDomain=" + this.unknownDomain
                + ", ip=" + this.ip
                + ", asn=" + this.asn
                + ", asnOrg=" + this.asnOrg
                + ", asnCountry=" + this.asnCountry
                + ", ipCountry=" + this.ipCountry
                + ", state=" + this.state
                + ")";
    }
}
} }

View File

@ -1,14 +1,29 @@
package nu.marginalia.api.math.model; package nu.marginalia.api.math.model;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.ToString;
/**
 * Immutable holder for the three strings of a dictionary entry: type, word and definition.
 * The values are exposed both as public final fields and through accessors.
 */
public class DictionaryEntry {
    public final String type;
    public final String word;
    public final String definition;

    /** Stores each argument in the field of the same name; no validation is performed. */
    public DictionaryEntry(String type, String word, String definition) {
        this.type = type;
        this.word = word;
        this.definition = definition;
    }

    /** @return the value of the {@code type} field */
    public String getType() {
        return this.type;
    }

    /** @return the value of the {@code word} field */
    public String getWord() {
        return this.word;
    }

    /** @return the value of the {@code definition} field */
    public String getDefinition() {
        return this.definition;
    }

    /** Renders the entry as {@code DictionaryEntry(type=..., word=..., definition=...)}. */
    @Override
    public String toString() {
        return "DictionaryEntry(type=" + this.getType() + ", word=" + this.getWord() + ", definition=" + this.getDefinition() + ")";
    }
}

View File

@ -1,14 +1,28 @@
package nu.marginalia.api.math.model; package nu.marginalia.api.math.model;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.ToString;
import java.util.List; import java.util.List;
/**
 * Mutable holder pairing a word with its list of {@code DictionaryEntry} values.
 * Both fields are public and non-final; a no-argument constructor is provided.
 */
public class DictionaryResponse {
    public String word;
    public List<DictionaryEntry> entries;

    /** Stores the arguments as-is; the list is not copied. */
    public DictionaryResponse(String word, List<DictionaryEntry> entries) {
        this.word = word;
        this.entries = entries;
    }

    /** Leaves both fields {@code null}. */
    public DictionaryResponse() {
    }

    /** @return the value of the {@code word} field */
    public String getWord() {
        return this.word;
    }

    /** @return the {@code entries} list itself, not a copy */
    public List<DictionaryEntry> getEntries() {
        return this.entries;
    }

    /** Renders the response as {@code DictionaryResponse(word=..., entries=...)}. */
    @Override
    public String toString() {
        return "DictionaryResponse(word=" + this.getWord() + ", entries=" + this.getEntries() + ")";
    }
}

View File

@ -1,10 +1,7 @@
package nu.marginalia.functions.math.eval; package nu.marginalia.functions.math.eval;
import lombok.AllArgsConstructor;
import lombok.SneakyThrows;
import lombok.ToString;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import java.math.RoundingMode; import java.math.RoundingMode;
import java.text.DecimalFormat; import java.text.DecimalFormat;
import java.text.NumberFormat; import java.text.NumberFormat;
@ -44,7 +41,6 @@ public class MathParser {
} }
} }
@SneakyThrows
public double eval(String inputExpression) throws ParseException { public double eval(String inputExpression) throws ParseException {
if (isTrivial.test(inputExpression)) { if (isTrivial.test(inputExpression)) {
return Double.parseDouble(inputExpression); return Double.parseDouble(inputExpression);
@ -243,10 +239,13 @@ public class MathParser {
} }
} }
@AllArgsConstructor @ToString
class Token { class Token {
public final char tokenType; public final char tokenType;
/** @param tokenType the character stored in the {@code tokenType} field */
public Token(char tokenType)
{
    this.tokenType = tokenType;
}
public double evaluate() { public double evaluate() {
throw new IllegalArgumentException("Can't evaluate" + this); throw new IllegalArgumentException("Can't evaluate" + this);
} }
@ -254,9 +253,12 @@ class Token {
public void transform(Function<List<Token>, List<Token>> mapper) { public void transform(Function<List<Token>, List<Token>> mapper) {
} }
/** Renders the token as {@code Token(tokenType=X)}, where X is the raw type character. */
@Override
public String toString() {
    return new StringBuilder("Token(tokenType=")
            .append(tokenType)
            .append(")")
            .toString();
}
} }
@ToString
class StringToken extends Token { class StringToken extends Token {
public final String value; public final String value;
@ -274,6 +276,10 @@ class StringToken extends Token {
return Double.parseDouble(value); return Double.parseDouble(value);
} }
/** Renders the token as {@code StringToken(value=...)}; the inherited token type is not included. */
@Override
public String toString() {
    return new StringBuilder("StringToken(value=")
            .append(value)
            .append(")")
            .toString();
}
} }
class UniExpression extends Token { class UniExpression extends Token {
@ -302,7 +308,6 @@ class UniExpression extends Token {
} }
} }
@ToString
class GroupExpression extends Token { class GroupExpression extends Token {
public List<Token> argument; public List<Token> argument;
@ -323,6 +328,10 @@ class GroupExpression extends Token {
public void transform(Function<List<Token>, List<Token>> mapper) { public void transform(Function<List<Token>, List<Token>> mapper) {
argument = mapper.apply(argument); argument = mapper.apply(argument);
} }
/** Renders the group as {@code GroupExpression(argument=[...])} using the argument list's own string form. */
@Override
public String toString() {
    return new StringBuilder("GroupExpression(argument=")
            .append(argument)
            .append(")")
            .toString();
}
} }

View File

@ -1,10 +1,9 @@
package nu.marginalia.functions.math.eval; package nu.marginalia.functions.math.eval;
import com.opencsv.CSVReader;
import lombok.SneakyThrows;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import com.opencsv.CSVReader;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.text.DecimalFormat; import java.text.DecimalFormat;
@ -19,7 +18,6 @@ public class Units {
private final Map<String, Unit> unitsByName = new HashMap<>(); private final Map<String, Unit> unitsByName = new HashMap<>();
private final MathParser mathParser; private final MathParser mathParser;
@SneakyThrows
@Inject @Inject
public Units(MathParser mathParser) { public Units(MathParser mathParser) {
this.mathParser = mathParser; this.mathParser = mathParser;
@ -41,6 +39,9 @@ public class Units {
} }
} }
} }
catch (Exception ex) {
throw new RuntimeException(ex);
}
} }

View File

@ -1,6 +1,5 @@
package nu.marginalia.api.searchquery; package nu.marginalia.api.searchquery;
import lombok.SneakyThrows;
import nu.marginalia.api.searchquery.model.query.ProcessedQuery; import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.QueryResponse; import nu.marginalia.api.searchquery.model.query.QueryResponse;
@ -128,24 +127,28 @@ public class QueryProtobufCodec {
); );
} }
@SneakyThrows
private static DecoratedSearchResultItem convertDecoratedResult(RpcDecoratedResultItem results) { private static DecoratedSearchResultItem convertDecoratedResult(RpcDecoratedResultItem results) {
return new DecoratedSearchResultItem( try {
convertRawResult(results.getRawItem()), return new DecoratedSearchResultItem(
new EdgeUrl(results.getUrl()), convertRawResult(results.getRawItem()),
results.getTitle(), new EdgeUrl(results.getUrl()),
results.getDescription(), results.getTitle(),
results.getUrlQuality(), results.getDescription(),
results.getFormat(), results.getUrlQuality(),
results.getFeatures(), results.getFormat(),
results.getPubYear(), // ??, results.getFeatures(),
results.getDataHash(), results.getPubYear(), // ??,
results.getWordsTotal(), results.getDataHash(),
results.getBestPositions(), results.getWordsTotal(),
results.getRankingScore(), results.getBestPositions(),
results.getResultsFromDomain(), results.getRankingScore(),
convertRankingDetails(results.getRankingDetails()) results.getResultsFromDomain(),
); convertRankingDetails(results.getRankingDetails())
);
}
catch (Exception ex) {
throw new RuntimeException("Failed to convert result", ex);
}
} }
private static ResultRankingDetails convertRankingDetails(RpcResultRankingDetails rankingDetails) { private static ResultRankingDetails convertRankingDetails(RpcResultRankingDetails rankingDetails) {
@ -325,24 +328,28 @@ public class QueryProtobufCodec {
return builder.build(); return builder.build();
} }
@SneakyThrows
public static DecoratedSearchResultItem convertQueryResult(RpcDecoratedResultItem rpcDecoratedResultItem) { public static DecoratedSearchResultItem convertQueryResult(RpcDecoratedResultItem rpcDecoratedResultItem) {
return new DecoratedSearchResultItem( try {
convertRawResult(rpcDecoratedResultItem.getRawItem()), return new DecoratedSearchResultItem(
new EdgeUrl(rpcDecoratedResultItem.getUrl()), convertRawResult(rpcDecoratedResultItem.getRawItem()),
rpcDecoratedResultItem.getTitle(), new EdgeUrl(rpcDecoratedResultItem.getUrl()),
rpcDecoratedResultItem.getDescription(), rpcDecoratedResultItem.getTitle(),
rpcDecoratedResultItem.getUrlQuality(), rpcDecoratedResultItem.getDescription(),
rpcDecoratedResultItem.getFormat(), rpcDecoratedResultItem.getUrlQuality(),
rpcDecoratedResultItem.getFeatures(), rpcDecoratedResultItem.getFormat(),
rpcDecoratedResultItem.getPubYear(), rpcDecoratedResultItem.getFeatures(),
rpcDecoratedResultItem.getDataHash(), rpcDecoratedResultItem.getPubYear(),
rpcDecoratedResultItem.getWordsTotal(), rpcDecoratedResultItem.getDataHash(),
rpcDecoratedResultItem.getBestPositions(), rpcDecoratedResultItem.getWordsTotal(),
rpcDecoratedResultItem.getRankingScore(), rpcDecoratedResultItem.getBestPositions(),
rpcDecoratedResultItem.getResultsFromDomain(), rpcDecoratedResultItem.getRankingScore(),
convertRankingDetails(rpcDecoratedResultItem.getRankingDetails()) rpcDecoratedResultItem.getResultsFromDomain(),
); convertRankingDetails(rpcDecoratedResultItem.getRankingDetails())
);
}
catch (Exception ex) {
throw new RuntimeException("Failed to convert result", ex);
}
} }
} }

View File

@ -1,41 +1,53 @@
package nu.marginalia.api.searchquery.model.query; package nu.marginalia.api.searchquery.model.query;
import lombok.AllArgsConstructor;
import lombok.EqualsAndHashCode;
import lombok.Getter;
import lombok.With;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@Getter
@AllArgsConstructor
@With
@EqualsAndHashCode
public class SearchQuery { public class SearchQuery {
/**
/** An infix style expression that encodes the required terms in the query */ * An infix style expression that encodes the required terms in the query
*/
public final String compiledQuery; public final String compiledQuery;
/** All terms that appear in {@see compiledQuery} */ /**
* All terms that appear in {@see compiledQuery}
*/
public final List<String> searchTermsInclude; public final List<String> searchTermsInclude;
/** These terms must be absent from the document */ /**
* These terms must be absent from the document
*/
public final List<String> searchTermsExclude; public final List<String> searchTermsExclude;
/** These terms must be present in the document, but are not used in ranking */ /**
* These terms must be present in the document, but are not used in ranking
*/
public final List<String> searchTermsAdvice; public final List<String> searchTermsAdvice;
/** If these optional terms are present in the document, rank it highly */ /**
* If these optional terms are present in the document, rank it highly
*/
public final List<String> searchTermsPriority; public final List<String> searchTermsPriority;
/** Terms that we require to be in the same sentence */ /**
* Terms that we require to be in the same sentence
*/
public final List<SearchPhraseConstraint> phraseConstraints; public final List<SearchPhraseConstraint> phraseConstraints;
@Deprecated // why does this exist? @Deprecated // why does this exist?
private double value = 0; private double value = 0;
/**
 * All-fields constructor; each argument is stored as-is (the lists are not copied).
 *
 * @param compiledQuery       infix style expression that encodes the required terms
 * @param searchTermsInclude  all terms that appear in {@code compiledQuery}
 * @param searchTermsExclude  terms that must be absent from the document
 * @param searchTermsAdvice   terms that must be present but are not used in ranking
 * @param searchTermsPriority optional terms that rank a document highly when present
 * @param phraseConstraints   terms required to be in the same sentence
 * @param value               stored in the deprecated {@code value} field
 */
public SearchQuery(String compiledQuery,
                   List<String> searchTermsInclude,
                   List<String> searchTermsExclude,
                   List<String> searchTermsAdvice,
                   List<String> searchTermsPriority,
                   List<SearchPhraseConstraint> phraseConstraints,
                   double value) {
    this.compiledQuery = compiledQuery;

    this.searchTermsInclude = searchTermsInclude;
    this.searchTermsExclude = searchTermsExclude;
    this.searchTermsAdvice = searchTermsAdvice;
    this.searchTermsPriority = searchTermsPriority;

    this.phraseConstraints = phraseConstraints;
    this.value = value;
}
public static SearchQueryBuilder builder() { public static SearchQueryBuilder builder() {
return new SearchQueryBuilder(); return new SearchQueryBuilder();
} }
@ -77,14 +89,132 @@ public class SearchQuery {
public String toString() { public String toString() {
StringBuilder sb = new StringBuilder(); StringBuilder sb = new StringBuilder();
if (!compiledQuery.isEmpty()) sb.append("compiledQuery=").append(compiledQuery).append(", "); if (!compiledQuery.isEmpty()) sb.append("compiledQuery=").append(compiledQuery).append(", ");
if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] "))); if (!searchTermsExclude.isEmpty())
if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] "))); sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] ")));
if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] "))); if (!searchTermsAdvice.isEmpty())
if (!phraseConstraints.isEmpty()) sb.append("phraseConstraints=").append(phraseConstraints.stream().map(coh->coh.terms().stream().collect(Collectors.joining(",", "[", "] "))).collect(Collectors.joining(", "))); sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] ")));
if (!searchTermsPriority.isEmpty())
sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] ")));
if (!phraseConstraints.isEmpty())
sb.append("phraseConstraints=").append(phraseConstraints.stream().map(coh -> coh.terms().stream().collect(Collectors.joining(",", "[", "] "))).collect(Collectors.joining(", ")));
return sb.toString(); return sb.toString();
} }
/** @return the infix style expression that encodes the required terms in the query */
public String getCompiledQuery() {
    return compiledQuery;
}

/** @return all terms that appear in the compiled query */
public List<String> getSearchTermsInclude() {
    return searchTermsInclude;
}

/** @return the terms that must be absent from the document */
public List<String> getSearchTermsExclude() {
    return searchTermsExclude;
}

/** @return the terms that must be present in the document but are not used in ranking */
public List<String> getSearchTermsAdvice() {
    return searchTermsAdvice;
}

/** @return the optional terms that rank a document highly when present */
public List<String> getSearchTermsPriority() {
    return searchTermsPriority;
}

/** @return the constraints on terms required to be in the same sentence */
public List<SearchPhraseConstraint> getPhraseConstraints() {
    return phraseConstraints;
}

/** @return the deprecated {@code value} field */
@Deprecated
public double getValue() {
    return value;
}
// "With" methods: each returns this instance when the argument is the very same reference
// (or, for the double, the same value under ==), otherwise a new SearchQuery that differs
// only in the named field. The reference comparison is deliberate: an equal-but-distinct
// argument still yields a fresh copy.

/** Copy of this query with {@code compiledQuery} replaced. */
public SearchQuery withCompiledQuery(String compiledQuery) {
    if (this.compiledQuery == compiledQuery) {
        return this;
    }
    return new SearchQuery(compiledQuery, this.searchTermsInclude, this.searchTermsExclude, this.searchTermsAdvice, this.searchTermsPriority, this.phraseConstraints, this.value);
}

/** Copy of this query with {@code searchTermsInclude} replaced. */
public SearchQuery withSearchTermsInclude(List<String> searchTermsInclude) {
    if (this.searchTermsInclude == searchTermsInclude) {
        return this;
    }
    return new SearchQuery(this.compiledQuery, searchTermsInclude, this.searchTermsExclude, this.searchTermsAdvice, this.searchTermsPriority, this.phraseConstraints, this.value);
}

/** Copy of this query with {@code searchTermsExclude} replaced. */
public SearchQuery withSearchTermsExclude(List<String> searchTermsExclude) {
    if (this.searchTermsExclude == searchTermsExclude) {
        return this;
    }
    return new SearchQuery(this.compiledQuery, this.searchTermsInclude, searchTermsExclude, this.searchTermsAdvice, this.searchTermsPriority, this.phraseConstraints, this.value);
}

/** Copy of this query with {@code searchTermsAdvice} replaced. */
public SearchQuery withSearchTermsAdvice(List<String> searchTermsAdvice) {
    if (this.searchTermsAdvice == searchTermsAdvice) {
        return this;
    }
    return new SearchQuery(this.compiledQuery, this.searchTermsInclude, this.searchTermsExclude, searchTermsAdvice, this.searchTermsPriority, this.phraseConstraints, this.value);
}

/** Copy of this query with {@code searchTermsPriority} replaced. */
public SearchQuery withSearchTermsPriority(List<String> searchTermsPriority) {
    if (this.searchTermsPriority == searchTermsPriority) {
        return this;
    }
    return new SearchQuery(this.compiledQuery, this.searchTermsInclude, this.searchTermsExclude, this.searchTermsAdvice, searchTermsPriority, this.phraseConstraints, this.value);
}

/** Copy of this query with {@code phraseConstraints} replaced. */
public SearchQuery withPhraseConstraints(List<SearchPhraseConstraint> phraseConstraints) {
    if (this.phraseConstraints == phraseConstraints) {
        return this;
    }
    return new SearchQuery(this.compiledQuery, this.searchTermsInclude, this.searchTermsExclude, this.searchTermsAdvice, this.searchTermsPriority, phraseConstraints, this.value);
}

/** Copy of this query with the deprecated {@code value} replaced. */
public SearchQuery withValue(double value) {
    if (this.value == value) {
        return this;
    }
    return new SearchQuery(this.compiledQuery, this.searchTermsInclude, this.searchTermsExclude, this.searchTermsAdvice, this.searchTermsPriority, this.phraseConstraints, value);
}
/**
 * Field-by-field equality over all seven fields, read through their getters.
 * The other object must be a {@code SearchQuery} that also accepts this instance via
 * {@link #canEqual(Object)}; the {@code value} field is compared with {@link Double#compare}.
 */
@Override
public boolean equals(final Object o) {
    if (o == this) {
        return true;
    }
    if (!(o instanceof SearchQuery)) {
        return false;
    }
    final SearchQuery that = (SearchQuery) o;
    // Same comparison order as before; && stops at the first mismatch.
    return that.canEqual(this)
            && sameOrBothNull(getCompiledQuery(), that.getCompiledQuery())
            && sameOrBothNull(getSearchTermsInclude(), that.getSearchTermsInclude())
            && sameOrBothNull(getSearchTermsExclude(), that.getSearchTermsExclude())
            && sameOrBothNull(getSearchTermsAdvice(), that.getSearchTermsAdvice())
            && sameOrBothNull(getSearchTermsPriority(), that.getSearchTermsPriority())
            && sameOrBothNull(getPhraseConstraints(), that.getPhraseConstraints())
            && Double.compare(getValue(), that.getValue()) == 0;
}

/** Null-tolerant equality: two nulls match, otherwise defers to {@code mine.equals(theirs)}. */
private static boolean sameOrBothNull(Object mine, Object theirs) {
    return mine == null ? theirs == null : mine.equals(theirs);
}

/** Hook used by {@link #equals(Object)} so that the other side can veto the comparison. */
protected boolean canEqual(final Object other) {
    return other instanceof SearchQuery;
}
/**
 * Hash over the same seven fields that {@code equals} compares, in the same order.
 * Each step multiplies the running hash by 59; a null field contributes 43, and the
 * double {@code value} contributes the folded hash of its long bits.
 */
@Override
public int hashCode() {
    final int multiplier = 59;
    int hash = 1;
    hash = hash * multiplier + hashOrNullMarker(getCompiledQuery());
    hash = hash * multiplier + hashOrNullMarker(getSearchTermsInclude());
    hash = hash * multiplier + hashOrNullMarker(getSearchTermsExclude());
    hash = hash * multiplier + hashOrNullMarker(getSearchTermsAdvice());
    hash = hash * multiplier + hashOrNullMarker(getSearchTermsPriority());
    hash = hash * multiplier + hashOrNullMarker(getPhraseConstraints());
    // Long.hashCode(bits) is (int) (bits ^ (bits >>> 32)), the same fold as before.
    hash = hash * multiplier + Long.hashCode(Double.doubleToLongBits(getValue()));
    return hash;
}

/** Hash of a possibly-null field; null maps to the fixed marker 43. */
private static int hashOrNullMarker(Object field) {
    return field == null ? 43 : field.hashCode();
}
public static class SearchQueryBuilder { public static class SearchQueryBuilder {
private String compiledQuery; private String compiledQuery;
public final List<String> searchTermsInclude = new ArrayList<>(); public final List<String> searchTermsInclude = new ArrayList<>();
@ -130,7 +260,9 @@ public class SearchQuery {
return new SearchQuery(compiledQuery, searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchPhraseConstraints); return new SearchQuery(compiledQuery, searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchPhraseConstraints);
} }
/** If there are no ranking terms, promote the advice terms to ranking terms */ /**
* If there are no ranking terms, promote the advice terms to ranking terms
*/
public void promoteNonRankingTerms() { public void promoteNonRankingTerms() {
if (searchTermsInclude.isEmpty() && !searchTermsAdvice.isEmpty()) { if (searchTermsInclude.isEmpty() && !searchTermsAdvice.isEmpty()) {
searchTermsInclude.addAll(searchTermsAdvice); searchTermsInclude.addAll(searchTermsAdvice);

View File

@ -1,6 +1,5 @@
package nu.marginalia.api.searchquery.model.query; package nu.marginalia.api.searchquery.model.query;
import lombok.*;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.QueryStrategy;
@ -8,29 +7,207 @@ import nu.marginalia.index.query.limit.SpecificationLimit;
import java.util.List; import java.util.List;
@ToString @Getter @Builder @With @AllArgsConstructor
public class SearchSpecification { public class SearchSpecification {
public SearchQuery query; public SearchQuery query;
/** If present and not empty, limit the search to these domain IDs */ /**
* If present and not empty, limit the search to these domain IDs
*/
public List<Integer> domains; public List<Integer> domains;
public String searchSetIdentifier; public String searchSetIdentifier;
public final String humanQuery; public final String humanQuery;
@Builder.Default public SpecificationLimit quality;
public final SpecificationLimit quality = SpecificationLimit.none(); public SpecificationLimit year;
@Builder.Default public SpecificationLimit size;
public final SpecificationLimit year = SpecificationLimit.none(); public SpecificationLimit rank;
@Builder.Default
public final SpecificationLimit size = SpecificationLimit.none();
@Builder.Default
public final SpecificationLimit rank = SpecificationLimit.none();
public final QueryLimits queryLimits; public final QueryLimits queryLimits;
public final QueryStrategy queryStrategy; public final QueryStrategy queryStrategy;
public final ResultRankingParameters rankingParams; public final ResultRankingParameters rankingParams;
/**
 * All-fields constructor; every argument is stored as-is in the field of the same name.
 * No defaults are applied here: the {@code SpecificationLimit.none()} defaults for
 * quality, year, size and rank are applied only by the builder.
 *
 * @param domains if present and not empty, limits the search to these domain IDs
 */
public SearchSpecification(SearchQuery query,
                           List<Integer> domains,
                           String searchSetIdentifier,
                           String humanQuery,
                           SpecificationLimit quality,
                           SpecificationLimit year,
                           SpecificationLimit size,
                           SpecificationLimit rank,
                           QueryLimits queryLimits,
                           QueryStrategy queryStrategy,
                           ResultRankingParameters rankingParams) {
    // What to search for and where
    this.query = query;
    this.humanQuery = humanQuery;
    this.domains = domains;
    this.searchSetIdentifier = searchSetIdentifier;

    // Specification limits
    this.quality = quality;
    this.year = year;
    this.size = size;
    this.rank = rank;

    // Execution and ranking settings
    this.queryLimits = queryLimits;
    this.queryStrategy = queryStrategy;
    this.rankingParams = rankingParams;
}
/** @return a new, empty {@link SearchSpecificationBuilder} */
public static SearchSpecificationBuilder builder() {
    return new SearchSpecificationBuilder();
}
/** @return the {@code query} field */
public SearchQuery getQuery() {
    return query;
}

/** @return the domain IDs the search is limited to, if present and not empty */
public List<Integer> getDomains() {
    return domains;
}

/** @return the {@code searchSetIdentifier} field */
public String getSearchSetIdentifier() {
    return searchSetIdentifier;
}

/** @return the {@code humanQuery} field */
public String getHumanQuery() {
    return humanQuery;
}

/** @return the {@code quality} limit */
public SpecificationLimit getQuality() {
    return quality;
}

/** @return the {@code year} limit */
public SpecificationLimit getYear() {
    return year;
}

/** @return the {@code size} limit */
public SpecificationLimit getSize() {
    return size;
}

/** @return the {@code rank} limit */
public SpecificationLimit getRank() {
    return rank;
}

/** @return the {@code queryLimits} field */
public QueryLimits getQueryLimits() {
    return queryLimits;
}

/** @return the {@code queryStrategy} field */
public QueryStrategy getQueryStrategy() {
    return queryStrategy;
}

/** @return the {@code rankingParams} field */
public ResultRankingParameters getRankingParams() {
    return rankingParams;
}
/** Renders all eleven fields as {@code SearchSpecification(name=value, ...)}, read through the getters in declaration order. */
@Override
public String toString() {
    StringBuilder text = new StringBuilder();
    text.append("SearchSpecification(query=").append(getQuery());
    text.append(", domains=").append(getDomains());
    text.append(", searchSetIdentifier=").append(getSearchSetIdentifier());
    text.append(", humanQuery=").append(getHumanQuery());
    text.append(", quality=").append(getQuality());
    text.append(", year=").append(getYear());
    text.append(", size=").append(getSize());
    text.append(", rank=").append(getRank());
    text.append(", queryLimits=").append(getQueryLimits());
    text.append(", queryStrategy=").append(getQueryStrategy());
    text.append(", rankingParams=").append(getRankingParams());
    text.append(")");
    return text.toString();
}
/**
 * Fluent builder for {@link SearchSpecification}.
 *
 * <p>The four limits (quality, year, size, rank) fall back to {@code SpecificationLimit.none()}
 * in {@link #build()} unless their setter was called. Calling a setter with {@code null}
 * counts as setting the value, so the {@code null} is passed through rather than replaced.
 * All other properties are passed through as last set, or {@code null} if never set.
 */
public static class SearchSpecificationBuilder {
    private SearchQuery query;
    private List<Integer> domains;
    private String searchSetIdentifier;
    private String humanQuery;

    // Each defaulted limit keeps a flag recording whether its setter was invoked.
    private SpecificationLimit quality;
    private boolean qualitySet;
    private SpecificationLimit year;
    private boolean yearSet;
    private SpecificationLimit size;
    private boolean sizeSet;
    private SpecificationLimit rank;
    private boolean rankSet;

    private QueryLimits queryLimits;
    private QueryStrategy queryStrategy;
    private ResultRankingParameters rankingParams;

    SearchSpecificationBuilder() {
    }

    public SearchSpecificationBuilder query(SearchQuery query) {
        this.query = query;
        return this;
    }

    public SearchSpecificationBuilder domains(List<Integer> domains) {
        this.domains = domains;
        return this;
    }

    public SearchSpecificationBuilder searchSetIdentifier(String searchSetIdentifier) {
        this.searchSetIdentifier = searchSetIdentifier;
        return this;
    }

    public SearchSpecificationBuilder humanQuery(String humanQuery) {
        this.humanQuery = humanQuery;
        return this;
    }

    public SearchSpecificationBuilder quality(SpecificationLimit quality) {
        this.qualitySet = true;
        this.quality = quality;
        return this;
    }

    public SearchSpecificationBuilder year(SpecificationLimit year) {
        this.yearSet = true;
        this.year = year;
        return this;
    }

    public SearchSpecificationBuilder size(SpecificationLimit size) {
        this.sizeSet = true;
        this.size = size;
        return this;
    }

    public SearchSpecificationBuilder rank(SpecificationLimit rank) {
        this.rankSet = true;
        this.rank = rank;
        return this;
    }

    public SearchSpecificationBuilder queryLimits(QueryLimits queryLimits) {
        this.queryLimits = queryLimits;
        return this;
    }

    public SearchSpecificationBuilder queryStrategy(QueryStrategy queryStrategy) {
        this.queryStrategy = queryStrategy;
        return this;
    }

    public SearchSpecificationBuilder rankingParams(ResultRankingParameters rankingParams) {
        this.rankingParams = rankingParams;
        return this;
    }

    /**
     * Creates the specification from the current builder state.
     *
     * @return a new {@link SearchSpecification}; unset limits become {@code SpecificationLimit.none()}
     */
    public SearchSpecification build() {
        return new SearchSpecification(
                query,
                domains,
                searchSetIdentifier,
                humanQuery,
                qualitySet ? quality : SpecificationLimit.none(),
                yearSet ? year : SpecificationLimit.none(),
                sizeSet ? size : SpecificationLimit.none(),
                rankSet ? rank : SpecificationLimit.none(),
                queryLimits,
                queryStrategy,
                rankingParams);
    }

    /** Renders the raw builder state; limits that were never set print as {@code null}, not as their default. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append("SearchSpecification.SearchSpecificationBuilder(query=").append(query);
        text.append(", domains=").append(domains);
        text.append(", searchSetIdentifier=").append(searchSetIdentifier);
        text.append(", humanQuery=").append(humanQuery);
        text.append(", quality$value=").append(quality);
        text.append(", year$value=").append(year);
        text.append(", size$value=").append(size);
        text.append(", rank$value=").append(rank);
        text.append(", queryLimits=").append(queryLimits);
        text.append(", queryStrategy=").append(queryStrategy);
        text.append(", rankingParams=").append(rankingParams);
        text.append(")");
        return text.toString();
    }
}
} }

View File

@ -1,7 +1,5 @@
package nu.marginalia.api.searchquery.model.results; package nu.marginalia.api.searchquery.model.results;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails; import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
@ -9,8 +7,6 @@ import org.jetbrains.annotations.NotNull;
import javax.annotation.Nullable; import javax.annotation.Nullable;
import java.util.List; import java.util.List;
@Getter
@ToString
public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResultItem> { public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResultItem> {
public final SearchResultItem rawIndexResult; public final SearchResultItem rawIndexResult;
@ -24,7 +20,9 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
@NotNull @NotNull
public final String format; public final String format;
/** Document features bitmask, see HtmlFeature */ /**
* Document features bitmask, see HtmlFeature
*/
public final int features; public final int features;
@Nullable @Nullable
@ -42,6 +40,7 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
public long documentId() { public long documentId() {
return rawIndexResult.getDocumentId(); return rawIndexResult.getDocumentId();
} }
public int domainId() { public int domainId() {
return rawIndexResult.getDomainId(); return rawIndexResult.getDomainId();
} }
@ -74,8 +73,7 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
int resultsFromDomain, int resultsFromDomain,
@Nullable @Nullable
ResultRankingDetails rankingDetails ResultRankingDetails rankingDetails
) ) {
{
this.rawIndexResult = rawIndexResult; this.rawIndexResult = rawIndexResult;
this.url = url; this.url = url;
this.title = title; this.title = title;
@ -94,11 +92,73 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
@Override @Override
public int compareTo(@NotNull DecoratedSearchResultItem o) { public int compareTo(@NotNull DecoratedSearchResultItem o) {
int diff = Double.compare(rankingScore, o.rankingScore); int diff = Double.compare(rankingScore, o.rankingScore);
if (diff == 0) if (diff == 0)
diff = Long.compare(documentId(), o.documentId()); diff = Long.compare(documentId(), o.documentId());
return diff; return diff;
} }
/** @return the {@code rawIndexResult} field */
public SearchResultItem getRawIndexResult() {
    return rawIndexResult;
}

/** @return the {@code url} field */
@NotNull
public EdgeUrl getUrl() {
    return url;
}

/** @return the {@code title} field */
@NotNull
public String getTitle() {
    return title;
}

/** @return the {@code description} field */
@NotNull
public String getDescription() {
    return description;
}

/** @return the {@code urlQuality} field */
public double getUrlQuality() {
    return urlQuality;
}

/** @return the {@code format} field */
@NotNull
public String getFormat() {
    return format;
}

/** @return the document features bitmask, see HtmlFeature */
public int getFeatures() {
    return features;
}

/** @return the {@code pubYear} field, which may be {@code null} */
@Nullable
public Integer getPubYear() {
    return pubYear;
}

/** @return the {@code dataHash} field */
public long getDataHash() {
    return dataHash;
}

/** @return the {@code wordsTotal} field */
public int getWordsTotal() {
    return wordsTotal;
}

/** @return the {@code bestPositions} field */
public long getBestPositions() {
    return bestPositions;
}

/** @return the {@code rankingScore} field, the primary sort key used by {@code compareTo} */
public double getRankingScore() {
    return rankingScore;
}

/** @return the {@code resultsFromDomain} field */
public int getResultsFromDomain() {
    return resultsFromDomain;
}

/** @return the {@code rankingDetails} field, which may be {@code null} */
@Nullable
public ResultRankingDetails getRankingDetails() {
    return rankingDetails;
}
/** Renders all fourteen properties as {@code DecoratedSearchResultItem(name=value, ...)}, read through the getters. */
@Override
public String toString() {
    StringBuilder text = new StringBuilder();
    text.append("DecoratedSearchResultItem(rawIndexResult=").append(getRawIndexResult());
    text.append(", url=").append(getUrl());
    text.append(", title=").append(getTitle());
    text.append(", description=").append(getDescription());
    text.append(", urlQuality=").append(getUrlQuality());
    text.append(", format=").append(getFormat());
    text.append(", features=").append(getFeatures());
    text.append(", pubYear=").append(getPubYear());
    text.append(", dataHash=").append(getDataHash());
    text.append(", wordsTotal=").append(getWordsTotal());
    text.append(", bestPositions=").append(getBestPositions());
    text.append(", rankingScore=").append(getRankingScore());
    text.append(", resultsFromDomain=").append(getResultsFromDomain());
    text.append(", rankingDetails=").append(getRankingDetails());
    text.append(")");
    return text.toString();
}
} }

View File

@ -1,11 +1,9 @@
package nu.marginalia.api.searchquery.model.results; package nu.marginalia.api.searchquery.model.results;
import lombok.ToString;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt; import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import java.util.BitSet; import java.util.BitSet;
@ToString
public class ResultRankingContext { public class ResultRankingContext {
private final int docCount; private final int docCount;
public final ResultRankingParameters params; public final ResultRankingParameters params;
@ -43,4 +41,15 @@ public class ResultRankingContext {
return docCount; return docCount;
} }
/**
 * Renders the context as {@code ResultRankingContext{docCount=..., params=..., ...}},
 * reading the six fields directly rather than through accessors.
 */
@Override
public String toString() {
return "ResultRankingContext{" +
"docCount=" + docCount +
", params=" + params +
", regularMask=" + regularMask +
", ngramsMask=" + ngramsMask +
", fullCounts=" + fullCounts +
", priorityCounts=" + priorityCounts +
'}';
}
} }

View File

@ -1,33 +1,38 @@
package nu.marginalia.api.searchquery.model.results; package nu.marginalia.api.searchquery.model.results;
import lombok.*;
@Builder
@AllArgsConstructor
@ToString
@EqualsAndHashCode
@Getter // getter for the mustache template engine's behalf
public class ResultRankingParameters { public class ResultRankingParameters {
/** Tuning for BM25 when applied to full document matches */ /**
* Tuning for BM25 when applied to full document matches
*/
public final Bm25Parameters bm25Params; public final Bm25Parameters bm25Params;
/** Documents below this length are penalized */ /**
* Documents below this length are penalized
*/
public int shortDocumentThreshold; public int shortDocumentThreshold;
public double shortDocumentPenalty; public double shortDocumentPenalty;
/** Scaling factor associated with domain rank (unscaled rank value is 0-255; high is good) */ /**
* Scaling factor associated with domain rank (unscaled rank value is 0-255; high is good)
*/
public double domainRankBonus; public double domainRankBonus;
/** Scaling factor associated with document quality (unscaled rank value is 0-15; high is bad) */ /**
* Scaling factor associated with document quality (unscaled rank value is 0-15; high is bad)
*/
public double qualityPenalty; public double qualityPenalty;
/** Average sentence length values below this threshold are penalized, range [0-4), 2 or 3 is probably what you want */ /**
* Average sentence length values below this threshold are penalized, range [0-4), 2 or 3 is probably what you want
*/
public int shortSentenceThreshold; public int shortSentenceThreshold;
/** Magnitude of penalty for documents with low average sentence length */ /**
* Magnitude of penalty for documents with low average sentence length
*/
public double shortSentencePenalty; public double shortSentencePenalty;
public double bm25Weight; public double bm25Weight;
@ -40,13 +45,30 @@ public class ResultRankingParameters {
public boolean exportDebugData; public boolean exportDebugData;
/**
 * Creates a fully specified parameter set. Every argument is stored verbatim in the
 * field of the same name; nothing is validated, defaulted or copied.
 *
 * @param bm25Params             tuning for BM25 when applied to full document matches
 * @param shortDocumentThreshold documents below this length are penalized
 * @param shortDocumentPenalty   value for {@code shortDocumentPenalty}
 *                               (presumably the size of the short-document penalty; confirm against the ranking code)
 * @param domainRankBonus        scaling factor associated with domain rank
 *                               (unscaled rank value is 0-255; high is good)
 * @param qualityPenalty         scaling factor associated with document quality
 *                               (unscaled rank value is 0-15; high is bad)
 * @param shortSentenceThreshold average sentence length values below this threshold are penalized,
 *                               range [0-4)
 * @param shortSentencePenalty   magnitude of penalty for documents with low average sentence length
 * @param bm25Weight             value for {@code bm25Weight}
 * @param tcfFirstPosition       value for {@code tcfFirstPosition}
 * @param tcfVerbatim            value for {@code tcfVerbatim}
 * @param tcfProximity           value for {@code tcfProximity}
 * @param temporalBias           value for {@code temporalBias}
 * @param temporalBiasWeight     value for {@code temporalBiasWeight}
 * @param exportDebugData        value for {@code exportDebugData}
 */
public ResultRankingParameters(
        Bm25Parameters bm25Params,
        int shortDocumentThreshold,
        double shortDocumentPenalty,
        double domainRankBonus,
        double qualityPenalty,
        int shortSentenceThreshold,
        double shortSentencePenalty,
        double bm25Weight,
        double tcfFirstPosition,
        double tcfVerbatim,
        double tcfProximity,
        TemporalBias temporalBias,
        double temporalBiasWeight,
        boolean exportDebugData
) {
    // BM25
    this.bm25Params = bm25Params;
    this.bm25Weight = bm25Weight;

    // Document and sentence length
    this.shortDocumentThreshold = shortDocumentThreshold;
    this.shortDocumentPenalty = shortDocumentPenalty;
    this.shortSentenceThreshold = shortSentenceThreshold;
    this.shortSentencePenalty = shortSentencePenalty;

    // Domain rank and document quality
    this.domainRankBonus = domainRankBonus;
    this.qualityPenalty = qualityPenalty;

    // tcf* factors
    this.tcfFirstPosition = tcfFirstPosition;
    this.tcfVerbatim = tcfVerbatim;
    this.tcfProximity = tcfProximity;

    // Temporal bias
    this.temporalBias = temporalBias;
    this.temporalBiasWeight = temporalBiasWeight;

    // Diagnostics
    this.exportDebugData = exportDebugData;
}
public static ResultRankingParameters sensibleDefaults() { public static ResultRankingParameters sensibleDefaults() {
return builder() return builder()
.bm25Params(new Bm25Parameters(1.2, 0.5)) .bm25Params(new Bm25Parameters(1.2, 0.5))
.shortDocumentThreshold(2000) .shortDocumentThreshold(2000)
.shortDocumentPenalty(2.) .shortDocumentPenalty(2.)
.domainRankBonus(1/25.) .domainRankBonus(1 / 25.)
.qualityPenalty(1/15.) .qualityPenalty(1 / 15.)
.shortSentenceThreshold(2) .shortSentenceThreshold(2)
.shortSentencePenalty(5) .shortSentencePenalty(5)
.bm25Weight(1.) .bm25Weight(1.)
@ -59,7 +81,232 @@ public class ResultRankingParameters {
.build(); .build();
} }
/**
 * Starts a fluent builder for {@code ResultRankingParameters}.
 *
 * @return a new, empty {@code ResultRankingParametersBuilder}
 */
public static ResultRankingParametersBuilder builder() {
return new ResultRankingParametersBuilder();
}
/**
 * Returns the tuning for BM25 when applied to full document matches.
 * <p>
 * The getters on this class replace the former Lombok {@code @Getter}, which was
 * annotated as existing on behalf of the mustache template engine.
 *
 * @return the value of {@code bm25Params}
 */
public Bm25Parameters getBm25Params() {
    return bm25Params;
}
/**
 * Returns the length below which documents are penalized.
 *
 * @return the value of {@code shortDocumentThreshold}
 */
public int getShortDocumentThreshold() {
    return shortDocumentThreshold;
}
/**
 * Returns the current short-document penalty setting.
 *
 * @return the value of {@code shortDocumentPenalty}
 */
public double getShortDocumentPenalty() {
    return shortDocumentPenalty;
}
/**
 * Returns the scaling factor associated with domain rank
 * (unscaled rank value is 0-255; high is good).
 *
 * @return the value of {@code domainRankBonus}
 */
public double getDomainRankBonus() {
    return domainRankBonus;
}
/**
 * Returns the scaling factor associated with document quality
 * (unscaled rank value is 0-15; high is bad).
 *
 * @return the value of {@code qualityPenalty}
 */
public double getQualityPenalty() {
    return qualityPenalty;
}
/**
 * Returns the threshold below which average sentence length values are penalized
 * (range [0-4)).
 *
 * @return the value of {@code shortSentenceThreshold}
 */
public int getShortSentenceThreshold() {
    return shortSentenceThreshold;
}
/**
 * Returns the magnitude of the penalty for documents with low average sentence length.
 *
 * @return the value of {@code shortSentencePenalty}
 */
public double getShortSentencePenalty() {
    return shortSentencePenalty;
}
/**
 * Returns the current BM25 weight setting.
 *
 * @return the value of {@code bm25Weight}
 */
public double getBm25Weight() {
    return bm25Weight;
}
/**
 * Returns the current {@code tcfFirstPosition} setting.
 *
 * @return the value of {@code tcfFirstPosition}
 */
public double getTcfFirstPosition() {
    return tcfFirstPosition;
}
/**
 * Returns the current {@code tcfVerbatim} setting.
 *
 * @return the value of {@code tcfVerbatim}
 */
public double getTcfVerbatim() {
    return tcfVerbatim;
}
/**
 * Returns the current {@code tcfProximity} setting.
 *
 * @return the value of {@code tcfProximity}
 */
public double getTcfProximity() {
    return tcfProximity;
}
/**
 * Returns the configured temporal bias.
 *
 * @return the value of {@code temporalBias}; may be {@code null} if none was supplied
 */
public TemporalBias getTemporalBias() {
    return temporalBias;
}
/**
 * Returns the current temporal bias weight setting.
 *
 * @return the value of {@code temporalBiasWeight}
 */
public double getTemporalBiasWeight() {
    return temporalBiasWeight;
}
/**
 * Returns the debug-data export flag.
 *
 * @return the value of {@code exportDebugData}
 */
public boolean isExportDebugData() {
    return exportDebugData;
}
/**
 * Compares this parameter set with another object, property by property.
 * <p>
 * The other object must be a {@code ResultRankingParameters} that accepts the comparison
 * through {@link #canEqual(Object)}, and all fourteen properties must match. Values are
 * read through the getters. {@code double} properties are compared with
 * {@link Double#compare(double, double)}, so {@code NaN} equals {@code NaN} and
 * {@code 0.0} differs from {@code -0.0}.
 *
 * @param o the object to compare against; may be {@code null}
 * @return {@code true} if {@code o} carries the same parameter values
 */
public boolean equals(final Object o) {
    if (this == o) {
        return true;
    }
    if (!(o instanceof ResultRankingParameters that) || !that.canEqual(this)) {
        return false;
    }

    // Kept as Object so that equals(Object) is the method invoked, as before.
    final Object ownBm25 = getBm25Params();
    final Object theirBm25 = that.getBm25Params();
    final boolean bm25Matches = (ownBm25 != null) ? ownBm25.equals(theirBm25) : (theirBm25 == null);
    if (!bm25Matches) {
        return false;
    }

    final boolean scoringMatches =
            getShortDocumentThreshold() == that.getShortDocumentThreshold()
            && Double.compare(getShortDocumentPenalty(), that.getShortDocumentPenalty()) == 0
            && Double.compare(getDomainRankBonus(), that.getDomainRankBonus()) == 0
            && Double.compare(getQualityPenalty(), that.getQualityPenalty()) == 0
            && getShortSentenceThreshold() == that.getShortSentenceThreshold()
            && Double.compare(getShortSentencePenalty(), that.getShortSentencePenalty()) == 0
            && Double.compare(getBm25Weight(), that.getBm25Weight()) == 0
            && Double.compare(getTcfFirstPosition(), that.getTcfFirstPosition()) == 0
            && Double.compare(getTcfVerbatim(), that.getTcfVerbatim()) == 0
            && Double.compare(getTcfProximity(), that.getTcfProximity()) == 0;
    if (!scoringMatches) {
        return false;
    }

    final Object ownBias = getTemporalBias();
    final Object theirBias = that.getTemporalBias();
    final boolean biasMatches = (ownBias != null) ? ownBias.equals(theirBias) : (theirBias == null);
    if (!biasMatches) {
        return false;
    }

    return Double.compare(getTemporalBiasWeight(), that.getTemporalBiasWeight()) == 0
            && isExportDebugData() == that.isExportDebugData();
}
/**
 * Hook consulted by {@code equals}: the object on the other side of the comparison is
 * asked whether it is willing to be compared with this one.
 * <p>
 * NOTE(review): this looks like the Lombok-generated {@code canEqual} pattern, where a
 * subclass overrides the method to keep {@code equals} symmetric. Confirm before removing.
 *
 * @param other the object that initiated the comparison
 * @return {@code true} if {@code other} is a {@code ResultRankingParameters}
 */
protected boolean canEqual(final Object other) {
return other instanceof ResultRankingParameters;
}
/**
 * Computes a hash over the same fourteen properties that {@code equals} compares,
 * reading each through its getter.
 * <p>
 * The hash starts at 1 and is multiplied by 59 before each property is added.
 * {@code null} references contribute 43, {@code double} values contribute
 * {@link Double#hashCode(double)}, and the boolean contributes 79 when true and 97 when false.
 *
 * @return the combined hash code
 */
public int hashCode() {
    final int multiplier = 59;
    final int nullHash = 43;

    int hash = 1;

    final Object bm25 = getBm25Params();
    hash = multiplier * hash + (bm25 != null ? bm25.hashCode() : nullHash);
    hash = multiplier * hash + getShortDocumentThreshold();
    // Double.hashCode(d) is (int) (bits ^ (bits >>> 32)) with bits = doubleToLongBits(d)
    hash = multiplier * hash + Double.hashCode(getShortDocumentPenalty());
    hash = multiplier * hash + Double.hashCode(getDomainRankBonus());
    hash = multiplier * hash + Double.hashCode(getQualityPenalty());
    hash = multiplier * hash + getShortSentenceThreshold();
    hash = multiplier * hash + Double.hashCode(getShortSentencePenalty());
    hash = multiplier * hash + Double.hashCode(getBm25Weight());
    hash = multiplier * hash + Double.hashCode(getTcfFirstPosition());
    hash = multiplier * hash + Double.hashCode(getTcfVerbatim());
    hash = multiplier * hash + Double.hashCode(getTcfProximity());

    final Object bias = getTemporalBias();
    hash = multiplier * hash + (bias != null ? bias.hashCode() : nullHash);
    hash = multiplier * hash + Double.hashCode(getTemporalBiasWeight());
    hash = multiplier * hash + (isExportDebugData() ? 79 : 97);

    return hash;
}
/**
 * Renders all fourteen properties as
 * {@code ResultRankingParameters(name=value, ...)}, reading each value through its getter.
 *
 * @return a single-line, human-readable description of this parameter set
 */
public String toString() {
    return "ResultRankingParameters("
            + "bm25Params=" + getBm25Params()
            + ", shortDocumentThreshold=" + getShortDocumentThreshold()
            + ", shortDocumentPenalty=" + getShortDocumentPenalty()
            + ", domainRankBonus=" + getDomainRankBonus()
            + ", qualityPenalty=" + getQualityPenalty()
            + ", shortSentenceThreshold=" + getShortSentenceThreshold()
            + ", shortSentencePenalty=" + getShortSentencePenalty()
            + ", bm25Weight=" + getBm25Weight()
            + ", tcfFirstPosition=" + getTcfFirstPosition()
            + ", tcfVerbatim=" + getTcfVerbatim()
            + ", tcfProximity=" + getTcfProximity()
            + ", temporalBias=" + getTemporalBias()
            + ", temporalBiasWeight=" + getTemporalBiasWeight()
            + ", exportDebugData=" + isExportDebugData()
            + ")";
}
public enum TemporalBias { public enum TemporalBias {
RECENT, OLD, NONE RECENT, OLD, NONE
} }
/**
 * Fluent builder for {@link ResultRankingParameters}.
 * <p>
 * Each setter stores its argument and returns the builder, so calls can be chained.
 * Properties that are never set keep the Java default for their type
 * ({@code null}, {@code 0}, {@code 0.0} or {@code false}); {@link #build()} does no validation.
 */
public static class ResultRankingParametersBuilder {
    // BM25
    private Bm25Parameters bm25Params;
    private double bm25Weight;

    // Document and sentence length
    private int shortDocumentThreshold;
    private double shortDocumentPenalty;
    private int shortSentenceThreshold;
    private double shortSentencePenalty;

    // Domain rank and document quality
    private double domainRankBonus;
    private double qualityPenalty;

    // tcf* factors
    private double tcfFirstPosition;
    private double tcfVerbatim;
    private double tcfProximity;

    // Temporal bias
    private TemporalBias temporalBias;
    private double temporalBiasWeight;

    // Diagnostics
    private boolean exportDebugData;

    /** Package-private; instances are obtained through {@code ResultRankingParameters.builder()}. */
    ResultRankingParametersBuilder() {
    }

    /** Sets {@code bm25Params} and returns this builder. */
    public ResultRankingParametersBuilder bm25Params(Bm25Parameters bm25Params) {
        this.bm25Params = bm25Params;
        return this;
    }

    /** Sets {@code shortDocumentThreshold} and returns this builder. */
    public ResultRankingParametersBuilder shortDocumentThreshold(int shortDocumentThreshold) {
        this.shortDocumentThreshold = shortDocumentThreshold;
        return this;
    }

    /** Sets {@code shortDocumentPenalty} and returns this builder. */
    public ResultRankingParametersBuilder shortDocumentPenalty(double shortDocumentPenalty) {
        this.shortDocumentPenalty = shortDocumentPenalty;
        return this;
    }

    /** Sets {@code domainRankBonus} and returns this builder. */
    public ResultRankingParametersBuilder domainRankBonus(double domainRankBonus) {
        this.domainRankBonus = domainRankBonus;
        return this;
    }

    /** Sets {@code qualityPenalty} and returns this builder. */
    public ResultRankingParametersBuilder qualityPenalty(double qualityPenalty) {
        this.qualityPenalty = qualityPenalty;
        return this;
    }

    /** Sets {@code shortSentenceThreshold} and returns this builder. */
    public ResultRankingParametersBuilder shortSentenceThreshold(int shortSentenceThreshold) {
        this.shortSentenceThreshold = shortSentenceThreshold;
        return this;
    }

    /** Sets {@code shortSentencePenalty} and returns this builder. */
    public ResultRankingParametersBuilder shortSentencePenalty(double shortSentencePenalty) {
        this.shortSentencePenalty = shortSentencePenalty;
        return this;
    }

    /** Sets {@code bm25Weight} and returns this builder. */
    public ResultRankingParametersBuilder bm25Weight(double bm25Weight) {
        this.bm25Weight = bm25Weight;
        return this;
    }

    /** Sets {@code tcfFirstPosition} and returns this builder. */
    public ResultRankingParametersBuilder tcfFirstPosition(double tcfFirstPosition) {
        this.tcfFirstPosition = tcfFirstPosition;
        return this;
    }

    /** Sets {@code tcfVerbatim} and returns this builder. */
    public ResultRankingParametersBuilder tcfVerbatim(double tcfVerbatim) {
        this.tcfVerbatim = tcfVerbatim;
        return this;
    }

    /** Sets {@code tcfProximity} and returns this builder. */
    public ResultRankingParametersBuilder tcfProximity(double tcfProximity) {
        this.tcfProximity = tcfProximity;
        return this;
    }

    /** Sets {@code temporalBias} and returns this builder. */
    public ResultRankingParametersBuilder temporalBias(TemporalBias temporalBias) {
        this.temporalBias = temporalBias;
        return this;
    }

    /** Sets {@code temporalBiasWeight} and returns this builder. */
    public ResultRankingParametersBuilder temporalBiasWeight(double temporalBiasWeight) {
        this.temporalBiasWeight = temporalBiasWeight;
        return this;
    }

    /** Sets {@code exportDebugData} and returns this builder. */
    public ResultRankingParametersBuilder exportDebugData(boolean exportDebugData) {
        this.exportDebugData = exportDebugData;
        return this;
    }

    /**
     * Creates a {@link ResultRankingParameters} from the values collected so far.
     * The builder is left unchanged.
     *
     * @return a new parameter object holding the builder's current values
     */
    public ResultRankingParameters build() {
        return new ResultRankingParameters(
                bm25Params,
                shortDocumentThreshold,
                shortDocumentPenalty,
                domainRankBonus,
                qualityPenalty,
                shortSentenceThreshold,
                shortSentencePenalty,
                bm25Weight,
                tcfFirstPosition,
                tcfVerbatim,
                tcfProximity,
                temporalBias,
                temporalBiasWeight,
                exportDebugData);
    }

    /**
     * Renders the builder's current state, one {@code name=value} pair per property.
     *
     * @return a single-line description of the values collected so far
     */
    public String toString() {
        return "ResultRankingParameters.ResultRankingParametersBuilder("
                + "bm25Params=" + bm25Params
                + ", shortDocumentThreshold=" + shortDocumentThreshold
                + ", shortDocumentPenalty=" + shortDocumentPenalty
                + ", domainRankBonus=" + domainRankBonus
                + ", qualityPenalty=" + qualityPenalty
                + ", shortSentenceThreshold=" + shortSentenceThreshold
                + ", shortSentencePenalty=" + shortSentencePenalty
                + ", bm25Weight=" + bm25Weight
                + ", tcfFirstPosition=" + tcfFirstPosition
                + ", tcfVerbatim=" + tcfVerbatim
                + ", tcfProximity=" + tcfProximity
                + ", temporalBias=" + temporalBias
                + ", temporalBiasWeight=" + temporalBiasWeight
                + ", exportDebugData=" + exportDebugData
                + ")";
    }
}
} }

View File

@ -1,7 +1,5 @@
package nu.marginalia.api.searchquery.model.results; package nu.marginalia.api.searchquery.model.results;
import lombok.AllArgsConstructor;
import lombok.Getter;
import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors; import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.id.UrlIdCodec;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
@ -9,21 +7,30 @@ import org.jetbrains.annotations.NotNull;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
/** Represents a document matching a search query */ /**
@AllArgsConstructor @Getter * Represents a document matching a search query
*/
public class SearchResultItem implements Comparable<SearchResultItem> { public class SearchResultItem implements Comparable<SearchResultItem> {
/** Encoded ID that contains both the URL id and its ranking. This is /**
* probably not what you want, use getDocumentId() instead */ * Encoded ID that contains both the URL id and its ranking. This is
* probably not what you want, use getDocumentId() instead
*/
public final long combinedId; public final long combinedId;
/** Encoded document metadata */ /**
* Encoded document metadata
*/
public final long encodedDocMetadata; public final long encodedDocMetadata;
/** Encoded html features of document */ /**
* Encoded html features of document
*/
public final int htmlFeatures; public final int htmlFeatures;
/** How did the subqueries match against the document ? */ /**
* How did the subqueries match against the document ?
*/
public final List<SearchResultKeywordScore> keywordScores; public final List<SearchResultKeywordScore> keywordScores;
public boolean hasPrioTerm; public boolean hasPrioTerm;
@ -45,6 +52,17 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
this.scoreValue = score; this.scoreValue = score;
} }
/**
 * Creates a result item with every field supplied explicitly. Arguments are stored
 * verbatim; the keyword score list is kept by reference, not copied.
 *
 * @param combinedId          encoded ID that contains both the URL id and its ranking
 * @param encodedDocMetadata  encoded document metadata
 * @param htmlFeatures        encoded html features of document
 * @param keywordScores       how the subqueries matched against the document
 * @param hasPrioTerm         value for {@code hasPrioTerm}
 * @param bestPositions       value for {@code bestPositions}
 * @param debugRankingFactors value for {@code debugRankingFactors}
 * @param scoreValue          score used for evaluation; replaces the field's initial value
 */
public SearchResultItem(
        long combinedId,
        long encodedDocMetadata,
        int htmlFeatures,
        List<SearchResultKeywordScore> keywordScores,
        boolean hasPrioTerm,
        long bestPositions,
        DebugRankingFactors debugRankingFactors,
        double scoreValue
) {
    // Final fields describing the matched document
    this.combinedId = combinedId;
    this.encodedDocMetadata = encodedDocMetadata;
    this.htmlFeatures = htmlFeatures;
    this.keywordScores = keywordScores;

    // Remaining per-result state
    this.hasPrioTerm = hasPrioTerm;
    this.bestPositions = bestPositions;
    this.debugRankingFactors = debugRankingFactors;
    this.scoreValue = scoreValue;
}
public long getDocumentId() { public long getDocumentId() {
return UrlIdCodec.removeRank(combinedId); return UrlIdCodec.removeRank(combinedId);
@ -56,9 +74,11 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
/* Used for evaluation */ /* Used for evaluation */
private transient double scoreValue = Double.MAX_VALUE; private transient double scoreValue = Double.MAX_VALUE;
public void setScore(double score) { public void setScore(double score) {
scoreValue = score; scoreValue = score;
} }
public double getScore() { public double getScore() {
return scoreValue; return scoreValue;
} }
@ -81,7 +101,7 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
if (other == this) if (other == this)
return true; return true;
if (other instanceof SearchResultItem o) { if (other instanceof SearchResultItem o) {
return o.getDocumentId() == getDocumentId(); return o.getDocumentId() == getDocumentId();
} }
return false; return false;
} }
@ -96,4 +116,35 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
} }
/**
 * Returns the encoded ID that contains both the URL id and its ranking.
 * This is probably not what you want; use {@code getDocumentId()} instead.
 *
 * @return the value of {@code combinedId}
 */
public long getCombinedId() {
    return combinedId;
}
/**
 * Returns the encoded document metadata.
 *
 * @return the value of {@code encodedDocMetadata}
 */
public long getEncodedDocMetadata() {
    return encodedDocMetadata;
}
/**
 * Returns the encoded html features of the document.
 *
 * @return the value of {@code htmlFeatures}
 */
public int getHtmlFeatures() {
    return htmlFeatures;
}
/**
 * Returns how the subqueries matched against the document.
 * The list held by this item is returned directly, not as a copy.
 *
 * @return the value of {@code keywordScores}
 */
public List<SearchResultKeywordScore> getKeywordScores() {
    return keywordScores;
}
/**
 * Returns the {@code hasPrioTerm} flag. The name follows the bean convention
 * ({@code is} + property name) for a boolean property called {@code hasPrioTerm}.
 *
 * @return the value of {@code hasPrioTerm}
 */
public boolean isHasPrioTerm() {
    return hasPrioTerm;
}
/**
 * Returns the current {@code bestPositions} value.
 *
 * @return the value of {@code bestPositions}
 */
public long getBestPositions() {
    return bestPositions;
}
/**
 * Returns the debug ranking factors attached to this result.
 *
 * @return the value of {@code debugRankingFactors}; may be {@code null} if none was supplied
 */
public DebugRankingFactors getDebugRankingFactors() {
    return debugRankingFactors;
}
/**
 * Returns the score used for evaluation. This reads the same {@code scoreValue}
 * field as {@code getScore()}.
 *
 * @return the value of {@code scoreValue}
 */
public double getScoreValue() {
    return scoreValue;
}
} }

View File

@ -2,7 +2,6 @@ package nu.marginalia.index.api;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import lombok.SneakyThrows;
import nu.marginalia.api.searchquery.IndexApiGrpc; import nu.marginalia.api.searchquery.IndexApiGrpc;
import nu.marginalia.api.searchquery.RpcDecoratedResultItem; import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
import nu.marginalia.api.searchquery.RpcIndexQuery; import nu.marginalia.api.searchquery.RpcIndexQuery;
@ -51,7 +50,6 @@ public class IndexClient {
) {} ) {}
/** Execute a query on the index partitions and return the combined results. */ /** Execute a query on the index partitions and return the combined results. */
@SneakyThrows
public AggregateQueryResponse executeQueries(RpcIndexQuery indexRequest, Pagination pagination) { public AggregateQueryResponse executeQueries(RpcIndexQuery indexRequest, Pagination pagination) {
List<CompletableFuture<Iterator<RpcDecoratedResultItem>>> futures = List<CompletableFuture<Iterator<RpcDecoratedResultItem>>> futures =
channelPool.call(IndexApiGrpc.IndexApiBlockingStub::query) channelPool.call(IndexApiGrpc.IndexApiBlockingStub::query)

View File

@ -1,6 +1,5 @@
package nu.marginalia.index.forward; package nu.marginalia.index.forward;
import lombok.SneakyThrows;
import nu.marginalia.index.domainrankings.DomainRankings; import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.construction.ForwardIndexConverter; import nu.marginalia.index.forward.construction.ForwardIndexConverter;
import nu.marginalia.index.journal.IndexJournal; import nu.marginalia.index.journal.IndexJournal;
@ -40,9 +39,9 @@ class ForwardIndexConverterTest {
private Path docsSpanData; private Path docsSpanData;
int workSetSize = 512; int workSetSize = 512;
@BeforeEach @BeforeEach
@SneakyThrows void setUp() throws Exception {
void setUp() {
workDir = Files.createTempDirectory(getClass().getSimpleName()); workDir = Files.createTempDirectory(getClass().getSimpleName());
@ -75,7 +74,7 @@ class ForwardIndexConverterTest {
return UrlIdCodec.encodeId((int) domain, (int) url); return UrlIdCodec.encodeId((int) domain, (int) url);
} }
public void createEntry(IndexJournalSlopWriter writer, int id) { public void createEntry(IndexJournalSlopWriter writer, int id) throws IOException {
writer.put( writer.put(
createId(id, id/20), createId(id, id/20),
new SlopDocumentRecord.KeywordsProjection( new SlopDocumentRecord.KeywordsProjection(

View File

@ -1,6 +1,5 @@
package nu.marginalia.index.journal; package nu.marginalia.index.journal;
import lombok.SneakyThrows;
import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.model.processed.SlopDocumentRecord; import nu.marginalia.model.processed.SlopDocumentRecord;
import nu.marginalia.sequence.slop.VarintCodedSequenceArrayColumn; import nu.marginalia.sequence.slop.VarintCodedSequenceArrayColumn;
@ -53,8 +52,7 @@ public class IndexJournalSlopWriter extends SlopTable {
spansWriter = IndexJournalPage.spans.create(this); spansWriter = IndexJournalPage.spans.create(this);
} }
@SneakyThrows public void put(long combinedId, SlopDocumentRecord.KeywordsProjection keywordsProjection) throws IOException {
public void put(long combinedId, SlopDocumentRecord.KeywordsProjection keywordsProjection) {
combinedIdWriter.put(combinedId); combinedIdWriter.put(combinedId);
featuresWriter.put(keywordsProjection.htmlFeatures()); featuresWriter.put(keywordsProjection.htmlFeatures());

View File

@ -1,10 +1,9 @@
package nu.marginalia.index; package nu.marginalia.index;
import lombok.SneakyThrows;
import nu.marginalia.array.page.LongQueryBuffer; import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.index.query.EntrySource; import nu.marginalia.index.query.EntrySource;
import nu.marginalia.sequence.io.BitReader;
import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.sequence.io.BitReader;
import java.io.IOException; import java.io.IOException;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
@ -62,7 +61,6 @@ public class PrioIndexEntrySource implements EntrySource {
} }
@Override @Override
@SneakyThrows
@SuppressWarnings("preview") @SuppressWarnings("preview")
public void read(LongQueryBuffer buffer) { public void read(LongQueryBuffer buffer) {
var outputBuffer = buffer.asByteBuffer().order(ByteOrder.LITTLE_ENDIAN); var outputBuffer = buffer.asByteBuffer().order(ByteOrder.LITTLE_ENDIAN);

View File

@ -1,6 +1,5 @@
package nu.marginalia.index.construction.full; package nu.marginalia.index.construction.full;
import lombok.SneakyThrows;
import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor; import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.index.journal.IndexJournal; import nu.marginalia.index.journal.IndexJournal;
@ -78,35 +77,48 @@ public class FullIndexConstructor {
} }
} }
@SneakyThrows
private FullPreindexReference construct(IndexJournalPage journalInstance, PositionsFileConstructor positionsFileConstructor) { private FullPreindexReference construct(IndexJournalPage journalInstance, PositionsFileConstructor positionsFileConstructor) {
return FullPreindex
.constructPreindex(journalInstance, positionsFileConstructor, docIdRewriter, tmpDir)
.closeToReference();
}
@SneakyThrows
private FullPreindexReference merge(FullPreindexReference leftR, FullPreindexReference rightR) {
var left = leftR.open();
var right = rightR.open();
try { try {
return FullPreindex.merge(tmpDir, left, right).closeToReference(); return FullPreindex
.constructPreindex(journalInstance, positionsFileConstructor, docIdRewriter, tmpDir)
.closeToReference();
} }
finally { catch (IOException e) {
left.delete(); logger.error("Error constructing preindex", e);
right.delete(); throw new RuntimeException(e);
}
}
private FullPreindexReference merge(FullPreindexReference leftR, FullPreindexReference rightR) {
try {
var left = leftR.open();
var right = rightR.open();
try {
return FullPreindex.merge(tmpDir, left, right).closeToReference();
} finally {
left.delete();
right.delete();
}
}
catch (IOException e) {
logger.error("Error merging preindex", e);
throw new RuntimeException(e);
} }
} }
@SneakyThrows
private void finalizeIndex(FullPreindexReference finalPR) { private void finalizeIndex(FullPreindexReference finalPR) {
var finalP = finalPR.open(); try {
finalP.finalizeIndex(outputFileDocs, outputFileWords); var finalP = finalPR.open();
finalP.delete(); finalP.finalizeIndex(outputFileDocs, outputFileWords);
finalP.delete();
}
catch (IOException e) {
logger.error("Error finalizing index", e);
throw new RuntimeException(e);
}
} }

View File

@ -1,6 +1,5 @@
package nu.marginalia.index.construction.full; package nu.marginalia.index.construction.full;
import lombok.SneakyThrows;
import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory; import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.DocIdRewriter;
@ -113,7 +112,6 @@ public class FullPreindexDocuments {
} }
} }
@SneakyThrows
private static void sortDocsFile(LongArray docsFileMap, FullPreindexWordSegments segments) { private static void sortDocsFile(LongArray docsFileMap, FullPreindexWordSegments segments) {
var iter = segments.iterator(RECORD_SIZE_LONGS); var iter = segments.iterator(RECORD_SIZE_LONGS);

View File

@ -1,6 +1,5 @@
package nu.marginalia.index.construction.prio; package nu.marginalia.index.construction.prio;
import lombok.SneakyThrows;
import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.journal.IndexJournal; import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.journal.IndexJournalPage; import nu.marginalia.index.journal.IndexJournalPage;
@ -73,35 +72,47 @@ public class PrioIndexConstructor {
} }
} }
@SneakyThrows
private PrioPreindexReference construct(IndexJournalPage journalInstance) { private PrioPreindexReference construct(IndexJournalPage journalInstance) {
return PrioPreindex
.constructPreindex(journalInstance, docIdRewriter, tmpDir)
.closeToReference();
}
@SneakyThrows
private PrioPreindexReference merge(PrioPreindexReference leftR, PrioPreindexReference rightR) {
var left = leftR.open();
var right = rightR.open();
try { try {
return PrioPreindex.merge(tmpDir, left, right).closeToReference(); return PrioPreindex
.constructPreindex(journalInstance, docIdRewriter, tmpDir)
.closeToReference();
} }
finally { catch (IOException ex) {
left.delete(); logger.error("Failed to construct preindex", ex);
right.delete(); throw new RuntimeException(ex);
} }
}
private PrioPreindexReference merge(PrioPreindexReference leftR, PrioPreindexReference rightR) {
try {
var left = leftR.open();
var right = rightR.open();
try {
return PrioPreindex.merge(tmpDir, left, right).closeToReference();
} finally {
left.delete();
right.delete();
}
}
catch (IOException ex) {
logger.error("Failed to merge preindex", ex);
throw new RuntimeException(ex);
}
} }
@SneakyThrows
private void finalizeIndex(PrioPreindexReference finalPR) { private void finalizeIndex(PrioPreindexReference finalPR) {
var finalP = finalPR.open(); try {
finalP.finalizeIndex(outputFileDocs, outputFileWords); var finalP = finalPR.open();
finalP.delete(); finalP.finalizeIndex(outputFileDocs, outputFileWords);
finalP.delete();
}
catch (IOException ex) {
logger.error("Failed to finalize preindex", ex);
throw new RuntimeException(ex);
}
} }

View File

@ -1,6 +1,5 @@
package nu.marginalia.index.construction.prio; package nu.marginalia.index.construction.prio;
import lombok.SneakyThrows;
import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory; import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.DocIdRewriter;
@ -97,7 +96,6 @@ public class PrioPreindexDocuments {
} }
} }
@SneakyThrows
private static void sortDocsFile(LongArray docsFileMap, PrioPreindexWordSegments segments) { private static void sortDocsFile(LongArray docsFileMap, PrioPreindexWordSegments segments) {
var iter = segments.iterator(RECORD_SIZE_LONGS); var iter = segments.iterator(RECORD_SIZE_LONGS);

View File

@ -7,7 +7,6 @@ import io.prometheus.client.Counter;
import io.prometheus.client.Gauge; import io.prometheus.client.Gauge;
import io.prometheus.client.Histogram; import io.prometheus.client.Histogram;
import it.unimi.dsi.fastutil.longs.LongArrayList; import it.unimi.dsi.fastutil.longs.LongArrayList;
import lombok.SneakyThrows;
import nu.marginalia.api.searchquery.IndexApiGrpc; import nu.marginalia.api.searchquery.IndexApiGrpc;
import nu.marginalia.api.searchquery.RpcDecoratedResultItem; import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
import nu.marginalia.api.searchquery.RpcIndexQuery; import nu.marginalia.api.searchquery.RpcIndexQuery;
@ -109,7 +108,6 @@ public class IndexGrpcService
} }
// GRPC endpoint // GRPC endpoint
@SneakyThrows
public void query(RpcIndexQuery request, public void query(RpcIndexQuery request,
StreamObserver<RpcDecoratedResultItem> responseObserver) { StreamObserver<RpcDecoratedResultItem> responseObserver) {
@ -157,9 +155,14 @@ public class IndexGrpcService
// exists for test access // exists for test access
@SneakyThrows
List<RpcDecoratedResultItem> justQuery(SearchSpecification specsSet) { List<RpcDecoratedResultItem> justQuery(SearchSpecification specsSet) {
return executeSearch(new SearchParameters(specsSet, getSearchSet(specsSet))); try {
return executeSearch(new SearchParameters(specsSet, getSearchSet(specsSet)));
}
catch (Exception ex) {
logger.error("Error in handling request", ex);
return List.of();
}
} }
private SearchSet getSearchSet(SearchSpecification specsSet) { private SearchSet getSearchSet(SearchSpecification specsSet) {

View File

@ -2,7 +2,6 @@ package nu.marginalia.ranking.domains.data;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.api.linkgraph.AggregateLinkGraphClient; import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
import org.jgrapht.Graph; import org.jgrapht.Graph;
import org.jgrapht.graph.DefaultDirectedGraph; import org.jgrapht.graph.DefaultDirectedGraph;
@ -20,27 +19,32 @@ public class InvertedLinkGraphSource extends AbstractGraphSource {
super(dataSource); super(dataSource);
this.graphClient = graphClient; this.graphClient = graphClient;
} }
@SneakyThrows
@Override @Override
public Graph<Integer, ?> getGraph() { public Graph<Integer, ?> getGraph() {
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class); try {
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
addVertices(graph); addVertices(graph);
var allLinks = graphClient.getAllDomainLinks(); var allLinks = graphClient.getAllDomainLinks();
var iter = allLinks.iterator(); var iter = allLinks.iterator();
while (iter.advance()) { while (iter.advance()) {
if (!graph.containsVertex(iter.dest())) { if (!graph.containsVertex(iter.dest())) {
continue; continue;
} }
if (!graph.containsVertex(iter.source())) { if (!graph.containsVertex(iter.source())) {
continue; continue;
}
// Invert the edge
graph.addEdge(iter.dest(), iter.source());
} }
// Invert the edge return graph;
graph.addEdge(iter.dest(), iter.source()); }
catch (Exception ex) {
throw new RuntimeException(ex);
} }
return graph;
} }
} }

View File

@ -2,7 +2,6 @@ package nu.marginalia.ranking.domains.data;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import nu.marginalia.api.linkgraph.AggregateLinkGraphClient; import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
import org.jgrapht.Graph; import org.jgrapht.Graph;
import org.jgrapht.graph.DefaultDirectedGraph; import org.jgrapht.graph.DefaultDirectedGraph;
@ -18,26 +17,31 @@ public class LinkGraphSource extends AbstractGraphSource {
this.graphClient = graphClient; this.graphClient = graphClient;
} }
@SneakyThrows
@Override @Override
public Graph<Integer, ?> getGraph() { public Graph<Integer, ?> getGraph() {
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class); try {
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
addVertices(graph); addVertices(graph);
var allLinks = graphClient.getAllDomainLinks(); var allLinks = graphClient.getAllDomainLinks();
var iter = allLinks.iterator(); var iter = allLinks.iterator();
while (iter.advance()) { while (iter.advance()) {
if (!graph.containsVertex(iter.dest())) { if (!graph.containsVertex(iter.dest())) {
continue; continue;
} }
if (!graph.containsVertex(iter.source())) { if (!graph.containsVertex(iter.source())) {
continue; continue;
}
graph.addEdge(iter.source(), iter.dest());
} }
graph.addEdge(iter.source(), iter.dest()); return graph;
}
catch (Exception ex) {
throw new RuntimeException(ex);
} }
return graph;
} }
} }

View File

@ -2,7 +2,6 @@ package nu.marginalia.ranking.domains.data;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import lombok.SneakyThrows;
import org.jgrapht.Graph; import org.jgrapht.Graph;
import org.jgrapht.graph.DefaultUndirectedWeightedGraph; import org.jgrapht.graph.DefaultUndirectedWeightedGraph;
import org.jgrapht.graph.DefaultWeightedEdge; import org.jgrapht.graph.DefaultWeightedEdge;
@ -35,14 +34,13 @@ public class SimilarityGraphSource extends AbstractGraphSource {
} }
} }
@SneakyThrows
@Override @Override
public Graph<Integer, ?> getGraph() { public Graph<Integer, ?> getGraph() {
Graph<Integer, ?> graph = new DefaultUndirectedWeightedGraph<>(DefaultWeightedEdge.class); Graph<Integer, ?> graph = new DefaultUndirectedWeightedGraph<>(DefaultWeightedEdge.class);
addVertices(graph);
try (var conn = dataSource.getConnection()) { try (var conn = dataSource.getConnection()) {
addVertices(graph);
try (var stmt = conn.prepareStatement(""" try (var stmt = conn.prepareStatement("""
SELECT DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS SELECT DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS
FROM EC_DOMAIN_NEIGHBORS_2 FROM EC_DOMAIN_NEIGHBORS_2
@ -67,6 +65,9 @@ public class SimilarityGraphSource extends AbstractGraphSource {
} }
} }
} }
catch (SQLException ex) {
throw new RuntimeException(ex);
}
return graph; return graph;
} }

View File

@ -311,7 +311,9 @@ public class CombinedIndexReaderTest {
} }
void load() throws IOException, SQLException, URISyntaxException { void load() throws IOException, SQLException, URISyntaxException {
allData.forEach((doc, words) -> { for (Map.Entry<Long, List<MockDataKeyword>> entry : allData.entrySet()) {
final Long doc = entry.getKey();
final List<MockDataKeyword> words = entry.getValue();
var meta = metaByDoc.get(doc); var meta = metaByDoc.get(doc);
@ -320,7 +322,7 @@ public class CombinedIndexReaderTest {
for (int i = 0; i < words.size(); i++) { for (int i = 0; i < words.size(); i++) {
metadata[i] = words.get(i).termMetadata; metadata[i] = words.get(i).termMetadata;
} }
var positions = words.stream().map(w -> w.positions).map(pos -> VarintCodedSequence.generate(pos.toIntArray())).toList(); var positions = words.stream().map(w -> w.positions).map(pos -> VarintCodedSequence.generate(pos.toIntArray())).toList();
indexJournalWriter.put(doc, indexJournalWriter.put(doc,
new SlopDocumentRecord.KeywordsProjection( new SlopDocumentRecord.KeywordsProjection(
@ -335,7 +337,7 @@ public class CombinedIndexReaderTest {
new byte[0], new byte[0],
List.of() List.of()
)); ));
}); }
var linkdbWriter = new DocumentDbWriter( var linkdbWriter = new DocumentDbWriter(
IndexLocations.getLinkdbLivePath(fileStorageService).resolve(DOCDB_FILE_NAME) IndexLocations.getLinkdbLivePath(fileStorageService).resolve(DOCDB_FILE_NAME)

View File

@ -2,7 +2,6 @@ package nu.marginalia.index;
import com.google.inject.Guice; import com.google.inject.Guice;
import com.google.inject.Inject; import com.google.inject.Inject;
import lombok.SneakyThrows;
import nu.marginalia.IndexLocations; import nu.marginalia.IndexLocations;
import nu.marginalia.api.searchquery.RpcDecoratedResultItem; import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint; import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
@ -378,8 +377,7 @@ public class IndexQueryServiceIntegrationSmokeTest {
return UrlIdCodec.encodeId((32 - (id % 32)), id); return UrlIdCodec.encodeId((32 - (id % 32)), id);
} }
@SneakyThrows public void loadData(DocumentDbWriter ldbw, int id) throws Exception {
public void loadData(DocumentDbWriter ldbw, int id) {
int[] factors = IntStream int[] factors = IntStream
.rangeClosed(1, id) .rangeClosed(1, id)
.filter(v -> (id % v) == 0) .filter(v -> (id % v) == 0)
@ -423,8 +421,7 @@ public class IndexQueryServiceIntegrationSmokeTest {
} }
@SneakyThrows public void loadDataWithDomain(DocumentDbWriter ldbw, int domain, int id) throws Exception {
public void loadDataWithDomain(DocumentDbWriter ldbw, int domain, int id) {
int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray(); int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
long fullId = UrlIdCodec.encodeId(domain, id); long fullId = UrlIdCodec.encodeId(domain, id);

View File

@ -532,8 +532,9 @@ public class IndexQueryServiceIntegrationTest {
} }
void load() throws IOException, SQLException, URISyntaxException { void load() throws IOException, SQLException, URISyntaxException {
allData.forEach((doc, words) -> { for (Map.Entry<Long, List<MockDataKeyword>> entry : allData.entrySet()) {
Long doc = entry.getKey();
List<MockDataKeyword> words = entry.getValue();
var meta = metaByDoc.get(doc); var meta = metaByDoc.get(doc);
List<String> keywords = words.stream().map(w -> w.keyword).toList(); List<String> keywords = words.stream().map(w -> w.keyword).toList();
@ -561,7 +562,7 @@ public class IndexQueryServiceIntegrationTest {
new byte[0], new byte[0],
List.of() List.of()
)); ));
}); }
var linkdbWriter = new DocumentDbWriter( var linkdbWriter = new DocumentDbWriter(
IndexLocations.getLinkdbLivePath(fileStorageService).resolve(DOCDB_FILE_NAME) IndexLocations.getLinkdbLivePath(fileStorageService).resolve(DOCDB_FILE_NAME)

View File

@ -1,6 +1,5 @@
package nu.marginalia.ranking.domains; package nu.marginalia.ranking.domains;
import lombok.SneakyThrows;
import nu.marginalia.array.LongArrayFactory; import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.ranking.domains.data.GraphSource; import nu.marginalia.ranking.domains.data.GraphSource;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
@ -43,14 +42,14 @@ public class TestGraphSourceForInvertedLinkData implements GraphSource {
return idToName.get(id); return idToName.get(id);
} }
@SneakyThrows
@Override @Override
public Graph<Integer, ?> getGraph() { public Graph<Integer, ?> getGraph() {
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class); Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
idToName = new HashMap<>(); idToName = new HashMap<>();
try (var stream = Files try (var stream = Files
.lines(domainDataPath)) { .lines(domainDataPath))
{
stream.skip(1) stream.skip(1)
.mapMultiToInt((line, c) -> { .mapMultiToInt((line, c) -> {
@ -65,6 +64,9 @@ public class TestGraphSourceForInvertedLinkData implements GraphSource {
}) })
.forEach(graph::addVertex); .forEach(graph::addVertex);
} }
catch (Exception e) {
throw new RuntimeException(e);
}
for (var path : linksDataPaths) { for (var path : linksDataPaths) {
try (var data = LongArrayFactory.mmapForReadingConfined(path)) { try (var data = LongArrayFactory.mmapForReadingConfined(path)) {
@ -80,8 +82,12 @@ public class TestGraphSourceForInvertedLinkData implements GraphSource {
} }
}); });
} }
catch (Exception e) {
throw new RuntimeException(e);
}
} }
return graph; return graph;
} }

View File

@ -1,6 +1,5 @@
package nu.marginalia.ranking.domains; package nu.marginalia.ranking.domains;
import lombok.SneakyThrows;
import nu.marginalia.array.LongArrayFactory; import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.ranking.domains.data.GraphSource; import nu.marginalia.ranking.domains.data.GraphSource;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
@ -44,7 +43,6 @@ public class TestGraphSourceForLinkData implements GraphSource {
return idToName.get(id); return idToName.get(id);
} }
@SneakyThrows
@Override @Override
public Graph<Integer, ?> getGraph() { public Graph<Integer, ?> getGraph() {
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class); Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
@ -66,6 +64,9 @@ public class TestGraphSourceForLinkData implements GraphSource {
}) })
.forEach(graph::addVertex); .forEach(graph::addVertex);
} }
catch (Exception e) {
throw new RuntimeException(e);
}
for (var path : linksDataPaths) { for (var path : linksDataPaths) {
try (var data = LongArrayFactory.mmapForReadingConfined(path)) { try (var data = LongArrayFactory.mmapForReadingConfined(path)) {
@ -81,8 +82,12 @@ public class TestGraphSourceForLinkData implements GraphSource {
} }
}); });
} }
catch (Exception e) {
throw new RuntimeException(e);
}
} }
return graph; return graph;
} }

View File

@ -1,12 +1,12 @@
package nu.marginalia.ranking.domains; package nu.marginalia.ranking.domains;
import lombok.SneakyThrows;
import nu.marginalia.ranking.domains.data.GraphSource; import nu.marginalia.ranking.domains.data.GraphSource;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.jgrapht.Graph; import org.jgrapht.Graph;
import org.jgrapht.graph.DefaultUndirectedWeightedGraph; import org.jgrapht.graph.DefaultUndirectedWeightedGraph;
import org.jgrapht.graph.DefaultWeightedEdge; import org.jgrapht.graph.DefaultWeightedEdge;
import java.io.IOException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.nio.file.Paths; import java.nio.file.Paths;
@ -33,7 +33,6 @@ public class TestGraphSourceForSimilarityData implements GraphSource {
return idToName.get(id); return idToName.get(id);
} }
@SneakyThrows
@Override @Override
public Graph<Integer, ?> getGraph() { public Graph<Integer, ?> getGraph() {
Graph<Integer, ?> graph = new DefaultUndirectedWeightedGraph<>(DefaultWeightedEdge.class); Graph<Integer, ?> graph = new DefaultUndirectedWeightedGraph<>(DefaultWeightedEdge.class);
@ -55,6 +54,9 @@ public class TestGraphSourceForSimilarityData implements GraphSource {
}) })
.forEach(graph::addVertex); .forEach(graph::addVertex);
} }
catch (IOException e) {
throw new RuntimeException(e);
}
try (var stream = Files try (var stream = Files
.lines(similarityDataPath)) { .lines(similarityDataPath)) {
@ -71,6 +73,9 @@ public class TestGraphSourceForSimilarityData implements GraphSource {
} }
}); });
} }
catch (IOException e) {
throw new RuntimeException(e);
}
return graph; return graph;
} }

View File

@ -1,12 +1,12 @@
package nu.marginalia.util; package nu.marginalia.util;
import lombok.SneakyThrows;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.util.Iterator; import java.util.Iterator;
import java.util.NoSuchElementException; import java.util.NoSuchElementException;
import java.util.concurrent.*; import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.Consumer; import java.util.function.Consumer;
@ -23,12 +23,16 @@ public class ProcessingIterator<T> implements Iterator<T> {
private T next = null; private T next = null;
@SneakyThrows
ProcessingIterator(SimpleBlockingThreadPool pool, int queueSize, ProcessingJob<T> task) { ProcessingIterator(SimpleBlockingThreadPool pool, int queueSize, ProcessingJob<T> task) {
queue = new LinkedBlockingQueue<>(queueSize); queue = new LinkedBlockingQueue<>(queueSize);
this.pool = pool; this.pool = pool;
pool.submit(() -> executeJob(task)); try {
pool.submit(() -> executeJob(task));
}
catch (Exception e) {
logger.warn("Exception while processing", e);
}
} }
public static Factory factory(int queueSize, int parallelism) { public static Factory factory(int queueSize, int parallelism) {
@ -45,15 +49,19 @@ public class ProcessingIterator<T> implements Iterator<T> {
} }
} }
@SneakyThrows
private void executeTask(Task<T> task) { private void executeTask(Task<T> task) {
pool.submit(() -> { try {
try { pool.submit(() -> {
queue.put(task.get()); try {
} catch (Exception e) { queue.put(task.get());
logger.warn("Exception while processing", e); } catch (Exception e) {
} logger.warn("Exception while processing", e);
}); }
});
}
catch (Exception e) {
logger.warn("Exception while processing", e);
}
} }
/** Returns true if there are more documents to be processed. /** Returns true if there are more documents to be processed.
@ -63,17 +71,21 @@ public class ProcessingIterator<T> implements Iterator<T> {
* (or synchronize between the two) * (or synchronize between the two)
*/ */
@Override @Override
@SneakyThrows
public boolean hasNext() { public boolean hasNext() {
if (next != null) if (next != null)
return true; return true;
do { try {
next = queue.poll(50, TimeUnit.MILLISECONDS); do {
if (next != null) { next = queue.poll(50, TimeUnit.MILLISECONDS);
return true; if (next != null) {
} return true;
} while (expectMore()); }
} while (expectMore());
}
catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
return false; return false;
} }
@ -96,7 +108,6 @@ public class ProcessingIterator<T> implements Iterator<T> {
* <p> * <p>
* If this is run after hasNext() returns false, a NoSuchElementException is thrown. * If this is run after hasNext() returns false, a NoSuchElementException is thrown.
*/ */
@SneakyThrows
@Override @Override
public T next() { public T next() {
if (!hasNext()) { if (!hasNext()) {

View File

@ -2,7 +2,6 @@ package nu.marginalia.language.filter;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Singleton; import com.google.inject.Singleton;
import lombok.SneakyThrows;
import nu.marginalia.LanguageModels; import nu.marginalia.LanguageModels;
import nu.marginalia.language.encoding.UnicodeRanges; import nu.marginalia.language.encoding.UnicodeRanges;
import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentLanguageData;
@ -45,7 +44,6 @@ public class LanguageFilter {
} }
@Inject @Inject
@SneakyThrows
public LanguageFilter(LanguageModels lm) { public LanguageFilter(LanguageModels lm) {
try { try {
languagePredictionModel1 = new UngaBungaLanguagePredictionModel(); languagePredictionModel1 = new UngaBungaLanguagePredictionModel();

View File

@ -1,12 +1,9 @@
package nu.marginalia.language.model; package nu.marginalia.language.model;
import lombok.AllArgsConstructor;
import lombok.Getter;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
import java.util.Objects; import java.util.Objects;
@AllArgsConstructor @Getter
public class WordRep implements Comparable<WordRep> { public class WordRep implements Comparable<WordRep> {
public WordRep(DocumentSentence sent, WordSpan span) { public WordRep(DocumentSentence sent, WordSpan span) {
@ -22,6 +19,13 @@ public class WordRep implements Comparable<WordRep> {
public final String stemmed; public final String stemmed;
private final int hashCode; private final int hashCode;
public WordRep(int length, String word, String stemmed, int hashCode) {
this.length = length;
this.word = word;
this.stemmed = stemmed;
this.hashCode = hashCode;
}
@Override @Override
public int compareTo(@NotNull WordRep o) { public int compareTo(@NotNull WordRep o) {
return word.compareTo(o.word); return word.compareTo(o.word);
@ -43,4 +47,20 @@ public class WordRep implements Comparable<WordRep> {
} }
return false; return false;
} }
public int getLength() {
return this.length;
}
public String getWord() {
return this.word;
}
public String getStemmed() {
return this.stemmed;
}
public int getHashCode() {
return this.hashCode;
}
} }

View File

@ -1,17 +1,20 @@
package nu.marginalia.language.model; package nu.marginalia.language.model;
import lombok.AllArgsConstructor;
import lombok.EqualsAndHashCode;
import org.jetbrains.annotations.NotNull; import org.jetbrains.annotations.NotNull;
@AllArgsConstructor @EqualsAndHashCode public class WordSpan implements Comparable<WordSpan> {
public class WordSpan implements Comparable<WordSpan>{
public final int start; public final int start;
public final int end; public final int end;
public WordSpan(int start, int end) {
this.start = start;
this.end = end;
}
public int size() { public int size() {
return end - start; return end - start;
} }
@Override @Override
public int compareTo(@NotNull WordSpan o) { public int compareTo(@NotNull WordSpan o) {
return start - o.start; return start - o.start;
@ -30,8 +33,7 @@ public class WordSpan implements Comparable<WordSpan>{
} }
if (start < other.start) { if (start < other.start) {
return end - other.start; return end - other.start;
} } else {
else {
return other.end - start; return other.end - start;
} }
@ -40,4 +42,26 @@ public class WordSpan implements Comparable<WordSpan>{
public String toString() { public String toString() {
return String.format("WordSpan[%s,%s]", start, end); return String.format("WordSpan[%s,%s]", start, end);
} }
public boolean equals(final Object o) {
if (o == this) return true;
if (!(o instanceof WordSpan)) return false;
final WordSpan other = (WordSpan) o;
if (!other.canEqual((Object) this)) return false;
if (this.start != other.start) return false;
if (this.end != other.end) return false;
return true;
}
protected boolean canEqual(final Object other) {
return other instanceof WordSpan;
}
public int hashCode() {
final int PRIME = 59;
int result = 1;
result = result * PRIME + this.start;
result = result * PRIME + this.end;
return result;
}
} }

View File

@ -2,7 +2,6 @@ package nu.marginalia.language.sentence;
import com.github.datquocnguyen.RDRPOSTagger; import com.github.datquocnguyen.RDRPOSTagger;
import com.google.inject.Inject; import com.google.inject.Inject;
import lombok.SneakyThrows;
import nu.marginalia.LanguageModels; import nu.marginalia.LanguageModels;
import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentLanguageData;
import nu.marginalia.language.model.DocumentSentence; import nu.marginalia.language.model.DocumentSentence;
@ -46,7 +45,7 @@ public class SentenceExtractor {
static final int MAX_SENTENCE_LENGTH = 250; static final int MAX_SENTENCE_LENGTH = 250;
static final int MAX_SENTENCE_COUNT = 1000; static final int MAX_SENTENCE_COUNT = 1000;
@SneakyThrows @Inject @Inject
public SentenceExtractor(LanguageModels models) public SentenceExtractor(LanguageModels models)
{ {
try (InputStream modelIn = new FileInputStream(models.openNLPSentenceDetectionData.toFile())) { try (InputStream modelIn = new FileInputStream(models.openNLPSentenceDetectionData.toFile())) {

View File

@ -1,6 +1,9 @@
package nu.marginalia.actor; package nu.marginalia.actor;
import nu.marginalia.actor.prototype.ActorPrototype; import nu.marginalia.actor.prototype.ActorPrototype;
import nu.marginalia.actor.state.ActorResumeBehavior;
import nu.marginalia.actor.state.ActorStateInstance;
import nu.marginalia.actor.state.ActorStateTransition;
import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.MessageQueueFactory;
import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.MqMessage;
import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.MqMessageState;
@ -8,7 +11,6 @@ import nu.marginalia.mq.inbox.MqInboxResponse;
import nu.marginalia.mq.inbox.MqSubscription; import nu.marginalia.mq.inbox.MqSubscription;
import nu.marginalia.mq.inbox.MqSynchronousInbox; import nu.marginalia.mq.inbox.MqSynchronousInbox;
import nu.marginalia.mq.outbox.MqOutbox; import nu.marginalia.mq.outbox.MqOutbox;
import nu.marginalia.actor.state.*;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -305,33 +307,38 @@ public class ActorStateMachine {
return state; return state;
} }
public void abortExecution() throws Exception { public void abortExecution() {
// Create a fake message to abort the execution try {
// This helps make sense of the queue when debugging // Create a fake message to abort the execution
// and also permits the real termination message to have an // This helps make sense of the queue when debugging
// unique expected ID // and also permits the real termination message to have an
// unique expected ID
long abortMsgId = smOutbox.sendNotice(expectedMessage.id, "ABORT", "Aborting execution"); long abortMsgId = smOutbox.sendNotice(expectedMessage.id, "ABORT", "Aborting execution");
// Set it as dead to clean up the queue from mystery ACK messages // Set it as dead to clean up the queue from mystery ACK messages
smOutbox.flagAsDead(abortMsgId); smOutbox.flagAsDead(abortMsgId);
// Set the expected message to the abort message, // Set the expected message to the abort message,
// technically there's a slight chance of a race condition here, // technically there's a slight chance of a race condition here,
// which will cause this message to be ERR'd and the process to // which will cause this message to be ERR'd and the process to
// continue, but it's very unlikely and the worst that can happen // continue, but it's very unlikely and the worst that can happen
// is you have to abort twice. // is you have to abort twice.
expectedMessage = ExpectedMessage.expectId(abortMsgId); expectedMessage = ExpectedMessage.expectId(abortMsgId);
// Add a state transition to the monitor state, causing it to reset the state machine to the initial state // Add a state transition to the monitor state, causing it to reset the state machine to the initial state
// (or if no monitor state is defined, set it to the final state) // (or if no monitor state is defined, set it to the final state)
smOutbox.sendNotice(abortMsgId, finalState.name(), ""); smOutbox.sendNotice(abortMsgId, finalState.name(), "");
// Dislodge the current task with an interrupt. // Dislodge the current task with an interrupt.
// It's actually fine if we accidentally interrupt the wrong thread // It's actually fine if we accidentally interrupt the wrong thread
// (i.e. the abort task), since it shouldn't be doing anything interruptable // (i.e. the abort task), since it shouldn't be doing anything interruptable
smInbox.abortCurrentTask(); smInbox.abortCurrentTask();
}
catch (Exception e) {
logger.error("Failed to abort execution", e);
}
} }
/** Returns true if there is an INITIAL state that requires no parameters */ /** Returns true if there is an INITIAL state that requires no parameters */

View File

@ -1,6 +1,5 @@
package nu.marginalia.mq.inbox; package nu.marginalia.mq.inbox;
import lombok.SneakyThrows;
import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.MqMessage;
import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.MqMessageState;
import nu.marginalia.mq.persistence.MqMessageHandlerRegistry; import nu.marginalia.mq.persistence.MqMessageHandlerRegistry;
@ -67,8 +66,7 @@ public class MqSingleShotInbox {
* @param predicate A predicate that must be true for the message to be stolen * @param predicate A predicate that must be true for the message to be stolen
* @return The stolen message, or empty if no message was stolen * @return The stolen message, or empty if no message was stolen
*/ */
@SneakyThrows public Optional<MqMessage> stealMessage(Predicate<MqMessage> predicate) throws SQLException {
public Optional<MqMessage> stealMessage(Predicate<MqMessage> predicate) {
for (var message : persistence.eavesdrop(inboxName, 5)) { for (var message : persistence.eavesdrop(inboxName, 5)) {
if (predicate.test(message)) { if (predicate.test(message)) {
persistence.changeOwner(message.msgId(), instanceUUID, -1); persistence.changeOwner(message.msgId(), instanceUUID, -1);

View File

@ -1,8 +1,9 @@
package nu.marginalia.term_frequency_dict; package nu.marginalia.term_frequency_dict;
import ca.rmen.porterstemmer.PorterStemmer; import ca.rmen.porterstemmer.PorterStemmer;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap; import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import lombok.SneakyThrows;
import nu.marginalia.LanguageModels; import nu.marginalia.LanguageModels;
import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory; import nu.marginalia.array.LongArrayFactory;
@ -11,9 +12,7 @@ import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.google.inject.Inject; import java.io.IOException;
import com.google.inject.Singleton;
import java.io.*;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
@ -29,13 +28,11 @@ public class TermFrequencyDict {
public static final long DOC_COUNT_KEY = ~0L; public static final long DOC_COUNT_KEY = ~0L;
@Inject @Inject
public TermFrequencyDict(@NotNull LanguageModels models) { public TermFrequencyDict(@NotNull LanguageModels models) throws IOException {
this(models.termFrequencies); this(models.termFrequencies);
} }
@SneakyThrows public TermFrequencyDict(Path file) throws IOException {
public TermFrequencyDict(Path file) {
wordRates = load(file); wordRates = load(file);
logger.info("Read {} N-grams frequencies", wordRates.size()); logger.info("Read {} N-grams frequencies", wordRates.size());
} }

View File

@ -31,8 +31,13 @@ public class DocumentKeywordExtractor {
// for tests // for tests
public DocumentKeywordExtractor() { public DocumentKeywordExtractor() {
this.dict = new TermFrequencyDict(WmsaHome.getLanguageModels()); try {
this.keywordExtractor = new KeywordExtractor(); this.dict = new TermFrequencyDict(WmsaHome.getLanguageModels());
this.keywordExtractor = new KeywordExtractor();
}
catch (Exception ex) {
throw new RuntimeException(ex);
}
} }

View File

@ -1,6 +1,5 @@
package nu.marginalia.keyword; package nu.marginalia.keyword;
import lombok.Builder;
import nu.marginalia.keyword.extractors.NameLikeKeywords; import nu.marginalia.keyword.extractors.NameLikeKeywords;
import nu.marginalia.keyword.extractors.SubjectLikeKeywords; import nu.marginalia.keyword.extractors.SubjectLikeKeywords;
import nu.marginalia.keyword.extractors.TitleKeywords; import nu.marginalia.keyword.extractors.TitleKeywords;
@ -14,19 +13,21 @@ class KeywordMetadata {
private final SubjectLikeKeywords subjectLikeKeywords; private final SubjectLikeKeywords subjectLikeKeywords;
private final UrlKeywords urlKeywords; private final UrlKeywords urlKeywords;
@Builder
public KeywordMetadata( public KeywordMetadata(
TitleKeywords titleKeywords, TitleKeywords titleKeywords,
NameLikeKeywords nameLikeKeywords, NameLikeKeywords nameLikeKeywords,
SubjectLikeKeywords subjectLikeKeywords, SubjectLikeKeywords subjectLikeKeywords,
UrlKeywords urlKeywords) UrlKeywords urlKeywords) {
{
this.titleKeywords = titleKeywords; this.titleKeywords = titleKeywords;
this.nameLikeKeywords = nameLikeKeywords; this.nameLikeKeywords = nameLikeKeywords;
this.subjectLikeKeywords = subjectLikeKeywords; this.subjectLikeKeywords = subjectLikeKeywords;
this.urlKeywords = urlKeywords; this.urlKeywords = urlKeywords;
} }
public static KeywordMetadataBuilder builder() {
return new KeywordMetadataBuilder();
}
public byte getMetadataForWord(String stemmed) { public byte getMetadataForWord(String stemmed) {
byte flags = 0; byte flags = 0;
@ -54,4 +55,41 @@ class KeywordMetadata {
return flags; return flags;
} }
public static class KeywordMetadataBuilder {
private TitleKeywords titleKeywords;
private NameLikeKeywords nameLikeKeywords;
private SubjectLikeKeywords subjectLikeKeywords;
private UrlKeywords urlKeywords;
KeywordMetadataBuilder() {
}
public KeywordMetadataBuilder titleKeywords(TitleKeywords titleKeywords) {
this.titleKeywords = titleKeywords;
return this;
}
public KeywordMetadataBuilder nameLikeKeywords(NameLikeKeywords nameLikeKeywords) {
this.nameLikeKeywords = nameLikeKeywords;
return this;
}
public KeywordMetadataBuilder subjectLikeKeywords(SubjectLikeKeywords subjectLikeKeywords) {
this.subjectLikeKeywords = subjectLikeKeywords;
return this;
}
public KeywordMetadataBuilder urlKeywords(UrlKeywords urlKeywords) {
this.urlKeywords = urlKeywords;
return this;
}
public KeywordMetadata build() {
return new KeywordMetadata(this.titleKeywords, this.nameLikeKeywords, this.subjectLikeKeywords, this.urlKeywords);
}
public String toString() {
return "KeywordMetadata.KeywordMetadataBuilder(titleKeywords=" + this.titleKeywords + ", nameLikeKeywords=" + this.nameLikeKeywords + ", subjectLikeKeywords=" + this.subjectLikeKeywords + ", urlKeywords=" + this.urlKeywords + ")";
}
}
} }

View File

@ -4,7 +4,6 @@ import gnu.trove.list.array.TByteArrayList;
import it.unimi.dsi.fastutil.ints.IntArrayList; import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntList; import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.objects.Object2ByteOpenHashMap; import it.unimi.dsi.fastutil.objects.Object2ByteOpenHashMap;
import lombok.Getter;
import nu.marginalia.language.sentence.tag.HtmlTag; import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.model.idx.CodedWordSpan; import nu.marginalia.model.idx.CodedWordSpan;
import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.WordFlags;
@ -15,13 +14,14 @@ import org.slf4j.LoggerFactory;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
import java.util.*; import java.util.*;
@Getter
public class DocumentKeywordsBuilder { public class DocumentKeywordsBuilder {
public final Object2ByteOpenHashMap<String> wordToMeta; public final Object2ByteOpenHashMap<String> wordToMeta;
public final HashMap<String, IntList> wordToPos; public final HashMap<String, IntList> wordToPos;
public final Map<Character, List<DocumentWordSpan>> wordSpans = new HashMap<>(); public final Map<Character, List<DocumentWordSpan>> wordSpans = new HashMap<>();
/** These ware keywords that had signals of high relevance */ /**
* These ware keywords that had signals of high relevance
*/
public final Set<String> importantWords = new HashSet<>(); public final Set<String> importantWords = new HashSet<>();
// |------64 letters is this long-------------------------------| // |------64 letters is this long-------------------------------|
@ -64,7 +64,7 @@ public class DocumentKeywordsBuilder {
wordSpans.forEach((tag, spansForTag) -> { wordSpans.forEach((tag, spansForTag) -> {
spansForTag.sort(Comparator.comparingInt(DocumentWordSpan::start)); spansForTag.sort(Comparator.comparingInt(DocumentWordSpan::start));
var positionsForTag = new IntArrayList(spansForTag.size()*2); var positionsForTag = new IntArrayList(spansForTag.size() * 2);
for (var span : spansForTag) { for (var span : spansForTag) {
positionsForTag.add(span.start()); positionsForTag.add(span.start());
positionsForTag.add(span.end()); positionsForTag.add(span.end());
@ -77,7 +77,7 @@ public class DocumentKeywordsBuilder {
} }
public DocumentKeywordsBuilder(int capacity) { public DocumentKeywordsBuilder(int capacity) {
wordToMeta = new Object2ByteOpenHashMap<>(capacity); wordToMeta = new Object2ByteOpenHashMap<>(capacity);
wordToPos = new HashMap<>(capacity); wordToPos = new HashMap<>(capacity);
} }
@ -101,7 +101,7 @@ public class DocumentKeywordsBuilder {
public void setFlagOnMetadataForWords(WordFlags flag, Collection<String> flagWords) { public void setFlagOnMetadataForWords(WordFlags flag, Collection<String> flagWords) {
flagWords.forEach(word -> flagWords.forEach(word ->
wordToMeta.mergeByte(word, flag.asBit(), (a, b) -> (byte)(a|b)) wordToMeta.mergeByte(word, flag.asBit(), (a, b) -> (byte) (a | b))
); );
} }
@ -116,7 +116,7 @@ public class DocumentKeywordsBuilder {
public List<String> getWordsWithAnyFlag(long flags) { public List<String> getWordsWithAnyFlag(long flags) {
List<String> ret = new ArrayList<>(); List<String> ret = new ArrayList<>();
for (var iter = wordToMeta.object2ByteEntrySet().fastIterator(); iter.hasNext();) { for (var iter = wordToMeta.object2ByteEntrySet().fastIterator(); iter.hasNext(); ) {
var entry = iter.next(); var entry = iter.next();
if ((flags & entry.getByteValue()) != 0) { if ((flags & entry.getByteValue()) != 0) {
ret.add(entry.getKey()); ret.add(entry.getKey());
@ -159,6 +159,30 @@ public class DocumentKeywordsBuilder {
return sb.append(']').toString(); return sb.append(']').toString();
} }
public Object2ByteOpenHashMap<String> getWordToMeta() {
return this.wordToMeta;
}
public HashMap<String, IntList> getWordToPos() {
return this.wordToPos;
}
public Map<Character, List<DocumentWordSpan>> getWordSpans() {
return this.wordSpans;
}
public Set<String> getImportantWords() {
return this.importantWords;
}
public int getMAX_WORD_LENGTH() {
return this.MAX_WORD_LENGTH;
}
public int getMAX_POSITIONS_PER_WORD() {
return this.MAX_POSITIONS_PER_WORD;
}
public record DocumentWordSpan(HtmlTag tag, int start, int end) { public record DocumentWordSpan(HtmlTag tag, int start, int end) {
} }
} }

View File

@ -1,6 +1,5 @@
package nu.marginalia.keyword; package nu.marginalia.keyword;
import lombok.SneakyThrows;
import nu.marginalia.LanguageModels; import nu.marginalia.LanguageModels;
import nu.marginalia.WmsaHome; import nu.marginalia.WmsaHome;
import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.language.sentence.SentenceExtractor;
@ -13,6 +12,7 @@ import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import java.io.IOException; import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files; import java.nio.file.Files;
import java.util.*; import java.util.*;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@ -25,8 +25,7 @@ class SentenceExtractorTest {
static SentenceExtractor se = new SentenceExtractor(lm); static SentenceExtractor se = new SentenceExtractor(lm);
@SneakyThrows public static void main(String... args) throws IOException, URISyntaxException {
public static void main(String... args) throws IOException {
final LanguageModels lm = TestLanguageModels.getLanguageModels(); final LanguageModels lm = TestLanguageModels.getLanguageModels();
var data = WmsaHome.getHomePath().resolve("test-data/"); var data = WmsaHome.getHomePath().resolve("test-data/");

View File

@ -1,7 +1,6 @@
package nu.marginalia.keyword.extractors; package nu.marginalia.keyword.extractors;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import lombok.SneakyThrows;
import nu.marginalia.WmsaHome; import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter; import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
import nu.marginalia.keyword.KeywordExtractor; import nu.marginalia.keyword.KeywordExtractor;
@ -10,13 +9,14 @@ import nu.marginalia.test.util.TestLanguageModels;
import org.jsoup.Jsoup; import org.jsoup.Jsoup;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.nio.charset.Charset; import java.nio.charset.Charset;
import java.util.Collections; import java.util.Collections;
import java.util.Objects; import java.util.Objects;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import static org.junit.jupiter.api.Assertions.*; import static org.junit.jupiter.api.Assertions.assertEquals;
class NameLikeKeywordsTest { class NameLikeKeywordsTest {
String text = """ String text = """
@ -58,8 +58,7 @@ class NameLikeKeywordsTest {
} }
@Test @Test
@SneakyThrows public void testWikiArticle() throws IOException {
public void testWikiArticle() {
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/java.html"), var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/java.html"),
"Could not load word frequency table"); "Could not load word frequency table");
String html = new String(resource.readAllBytes(), Charset.defaultCharset()); String html = new String(resource.readAllBytes(), Charset.defaultCharset());
@ -75,7 +74,6 @@ class NameLikeKeywordsTest {
} }
@Test @Test
@SneakyThrows
public void testWikiArticleP1() { public void testWikiArticleP1() {
String html = """ String html = """
<p><b>Java</b> is a high-level, class-based, object-oriented programming language that is designed to have as few implementation dependencies as possible. It is a general-purpose programming language intended to let programmers <i>write once, run anywhere</i> (WORA), meaning that compiled Java code can run on all platforms that support Java without the need to recompile. Java applications are typically compiled to bytecode that can run on any Java virtual machine (JVM) regardless of the underlying computer architecture. The syntax of Java is similar to C and C++, but has fewer low-level facilities than either of them. The Java runtime provides dynamic capabilities (such as reflection and runtime code modification) that are typically not available in traditional compiled languages. As of 2019 , Java was one of the most popular programming languages in use according to GitHub, particularly for clientserver web applications, with a reported 9 million developers.</p> <p><b>Java</b> is a high-level, class-based, object-oriented programming language that is designed to have as few implementation dependencies as possible. It is a general-purpose programming language intended to let programmers <i>write once, run anywhere</i> (WORA), meaning that compiled Java code can run on all platforms that support Java without the need to recompile. Java applications are typically compiled to bytecode that can run on any Java virtual machine (JVM) regardless of the underlying computer architecture. The syntax of Java is similar to C and C++, but has fewer low-level facilities than either of them. The Java runtime provides dynamic capabilities (such as reflection and runtime code modification) that are typically not available in traditional compiled languages. As of 2019 , Java was one of the most popular programming languages in use according to GitHub, particularly for clientserver web applications, with a reported 9 million developers.</p>

View File

@ -7,6 +7,7 @@ import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import nu.marginalia.test.util.TestLanguageModels; import nu.marginalia.test.util.TestLanguageModels;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.util.Collections; import java.util.Collections;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ -41,7 +42,7 @@ class SubjectLikeKeywordsTest {
"""; """;
@Test @Test
public void test() { public void test() throws IOException {
var lm = TestLanguageModels.getLanguageModels(); var lm = TestLanguageModels.getLanguageModels();
var dict = new TermFrequencyDict(lm); var dict = new TermFrequencyDict(lm);

View File

@ -1,15 +1,12 @@
package nu.marginalia.converting.model; package nu.marginalia.converting.model;
import lombok.Getter;
import lombok.ToString;
import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
import nu.marginalia.model.crawl.UrlIndexingState;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.UrlIndexingState;
import javax.annotation.Nullable; import javax.annotation.Nullable;
import java.util.OptionalDouble; import java.util.OptionalDouble;
@ToString @Getter
public class ProcessedDocument { public class ProcessedDocument {
public EdgeUrl url; public EdgeUrl url;
@ -41,4 +38,30 @@ public class ProcessedDocument {
} }
return OptionalDouble.empty(); return OptionalDouble.empty();
} }
public EdgeUrl getUrl() {
return this.url;
}
@Nullable
public ProcessedDocumentDetails getDetails() {
return this.details;
}
@Nullable
public DocumentKeywordsBuilder getWords() {
return this.words;
}
public UrlIndexingState getState() {
return this.state;
}
public String getStateReason() {
return this.stateReason;
}
public String toString() {
return "ProcessedDocument(url=" + this.getUrl() + ", details=" + this.getDetails() + ", words=" + this.getWords() + ", state=" + this.getState() + ", stateReason=" + this.getStateReason() + ")";
}
} }

View File

@ -1,16 +1,14 @@
package nu.marginalia.converting.model; package nu.marginalia.converting.model;
import lombok.ToString; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.html.HtmlStandard;
import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.EdgeUrl;
import javax.annotation.Nullable; import javax.annotation.Nullable;
import java.util.List; import java.util.List;
import java.util.Set; import java.util.Set;
@ToString
public class ProcessedDocumentDetails { public class ProcessedDocumentDetails {
public String title; public String title;
public String description; public String description;
@ -31,4 +29,8 @@ public class ProcessedDocumentDetails {
public DocumentMetadata metadata; public DocumentMetadata metadata;
public GeneratorType generator; public GeneratorType generator;
public String toString() {
return "ProcessedDocumentDetails(title=" + this.title + ", description=" + this.description + ", pubYear=" + this.pubYear + ", length=" + this.length + ", quality=" + this.quality + ", hashCode=" + this.hashCode + ", features=" + this.features + ", standard=" + this.standard + ", linksInternal=" + this.linksInternal + ", linksExternal=" + this.linksExternal + ", feedLinks=" + this.feedLinks + ", metadata=" + this.metadata + ", generator=" + this.generator + ")";
}
} }

View File

@ -1,6 +1,5 @@
package nu.marginalia.converting.model; package nu.marginalia.converting.model;
import lombok.ToString;
import nu.marginalia.converting.writer.ConverterBatchWritableIf; import nu.marginalia.converting.writer.ConverterBatchWritableIf;
import nu.marginalia.converting.writer.ConverterBatchWriter; import nu.marginalia.converting.writer.ConverterBatchWriter;
import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeDomain;
@ -11,8 +10,7 @@ import java.io.IOException;
import java.util.List; import java.util.List;
import java.util.Optional; import java.util.Optional;
@ToString public class ProcessedDomain implements ConverterBatchWritableIf {
public class ProcessedDomain implements ConverterBatchWritableIf {
public EdgeDomain domain; public EdgeDomain domain;
public List<ProcessedDocument> documents; public List<ProcessedDocument> documents;
@ -21,8 +19,10 @@ public class ProcessedDomain implements ConverterBatchWritableIf {
public String ip; public String ip;
/** Used by the sideloader to give advice on how many documents are crawled /**
* without actually having to count (which would take forever) */ * Used by the sideloader to give advice on how many documents are crawled
* without actually having to count (which would take forever)
*/
@Nullable @Nullable
public Integer sizeloadSizeAdvice; public Integer sizeloadSizeAdvice;
@ -41,5 +41,10 @@ public class ProcessedDomain implements ConverterBatchWritableIf {
} }
@Override @Override
public void close() {} public void close() {
}
public String toString() {
return "ProcessedDomain(domain=" + this.domain + ", documents=" + this.documents + ", state=" + this.state + ", redirect=" + this.redirect + ", ip=" + this.ip + ", sizeloadSizeAdvice=" + this.sizeloadSizeAdvice + ")";
}
} }

View File

@ -1,7 +1,6 @@
package nu.marginalia.converting.processor; package nu.marginalia.converting.processor;
import com.google.inject.Inject; import com.google.inject.Inject;
import lombok.SneakyThrows;
import nu.marginalia.atags.AnchorTextKeywords; import nu.marginalia.atags.AnchorTextKeywords;
import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.atags.source.AnchorTagsSource; import nu.marginalia.atags.source.AnchorTagsSource;
@ -164,57 +163,62 @@ public class DomainProcessor {
} }
@SneakyThrows
@Nullable @Nullable
public ProcessedDomain fullProcessing(SerializableCrawlDataStream dataStream) { public ProcessedDomain fullProcessing(SerializableCrawlDataStream dataStream) {
if (!dataStream.hasNext()) { try {
return null; if (!dataStream.hasNext()) {
} return null;
}
List<ProcessedDocument> docs = new ArrayList<>(); List<ProcessedDocument> docs = new ArrayList<>();
Set<String> processedUrls = new HashSet<>(); Set<String> processedUrls = new HashSet<>();
if (!(dataStream.next() instanceof CrawledDomain crawledDomain)) { if (!(dataStream.next() instanceof CrawledDomain crawledDomain)) {
throw new IllegalStateException("First record must be a domain, was " + dataStream.next().getClass().getSimpleName()); throw new IllegalStateException("First record must be a domain, was " + dataStream.next().getClass().getSimpleName());
} }
DomainLinks externalDomainLinks = anchorTagsSource.getAnchorTags(crawledDomain.getDomain()); DomainLinks externalDomainLinks = anchorTagsSource.getAnchorTags(crawledDomain.getDomain());
DocumentDecorator documentDecorator = new DocumentDecorator(); DocumentDecorator documentDecorator = new DocumentDecorator();
// Process Domain Record // Process Domain Record
ProcessedDomain ret = new ProcessedDomain(); ProcessedDomain ret = new ProcessedDomain();
processDomain(crawledDomain, ret, documentDecorator); processDomain(crawledDomain, ret, documentDecorator);
ret.documents = docs; ret.documents = docs;
// Process Documents // Process Documents
try (var deduplicator = new LshDocumentDeduplicator()) { try (var deduplicator = new LshDocumentDeduplicator()) {
while (dataStream.hasNext()) { while (dataStream.hasNext()) {
if (!(dataStream.next() instanceof CrawledDocument doc)) if (!(dataStream.next() instanceof CrawledDocument doc))
continue; continue;
if (doc.url == null) if (doc.url == null)
continue; continue;
if (doc.documentBody.isBlank()) if (doc.documentBody.isBlank())
continue; continue;
if (!processedUrls.add(doc.url)) if (!processedUrls.add(doc.url))
continue; continue;
try { try {
var processedDoc = documentProcessor.process(doc, ret.domain, externalDomainLinks, documentDecorator); var processedDoc = documentProcessor.process(doc, ret.domain, externalDomainLinks, documentDecorator);
deduplicator.markIfDuplicate(processedDoc); deduplicator.markIfDuplicate(processedDoc);
docs.add(processedDoc); docs.add(processedDoc);
} catch (Exception ex) { } catch (Exception ex) {
logger.warn("Failed to process " + doc.url, ex); logger.warn("Failed to process " + doc.url, ex);
}
} }
} }
// Add late keywords and features from domain-level information
calculateStatistics(ret, externalDomainLinks);
return ret;
}
catch (Exception ex) {
logger.warn("Failed to process domain", ex);
return null;
} }
// Add late keywords and features from domain-level information
calculateStatistics(ret, externalDomainLinks);
return ret;
} }
private void processDomain(CrawledDomain crawledDomain, private void processDomain(CrawledDomain crawledDomain,

View File

@ -1,13 +1,11 @@
package nu.marginalia.converting.processor.logic.links; package nu.marginalia.converting.processor.logic.links;
import lombok.Getter;
import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import java.util.*; import java.util.*;
import java.util.function.BiConsumer; import java.util.function.BiConsumer;
@Getter
public class LinkGraph { public class LinkGraph {
private final Map<EdgeUrl, Set<EdgeUrl>> graph = new HashMap<>(1000); private final Map<EdgeUrl, Set<EdgeUrl>> graph = new HashMap<>(1000);

View File

@ -4,7 +4,6 @@ import com.google.gson.Gson;
import com.google.gson.GsonBuilder; import com.google.gson.GsonBuilder;
import com.google.gson.JsonSyntaxException; import com.google.gson.JsonSyntaxException;
import com.google.gson.annotations.SerializedName; import com.google.gson.annotations.SerializedName;
import lombok.ToString;
import nu.marginalia.converting.model.DocumentHeaders; import nu.marginalia.converting.model.DocumentHeaders;
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
@ -77,7 +76,6 @@ class JsonModel {
List<JsonModelGraphItem> graph; List<JsonModelGraphItem> graph;
} }
@ToString
class JsonModelGraphItem { class JsonModelGraphItem {
@SerializedName("@type") @SerializedName("@type")
public String type; public String type;
@ -88,5 +86,9 @@ class JsonModelGraphItem {
return "NewsArticle".equalsIgnoreCase(type) return "NewsArticle".equalsIgnoreCase(type)
|| "Article".equalsIgnoreCase(type); || "Article".equalsIgnoreCase(type);
} }
public String toString() {
return "JsonModelGraphItem(type=" + this.type + ", datePublished=" + this.datePublished + ")";
}
} }

View File

@ -1,20 +1,62 @@
package nu.marginalia.converting.sideload.dirtree; package nu.marginalia.converting.sideload.dirtree;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import java.util.List; import java.util.List;
@AllArgsConstructor
@NoArgsConstructor
@Setter
@Getter
class DirtreeSideloadSpec { class DirtreeSideloadSpec {
public String name; public String name;
public String domainName; public String domainName;
public String dir; public String dir;
public String baseUrl; public String baseUrl;
public List<String> keywords; public List<String> keywords;
public DirtreeSideloadSpec(String name, String domainName, String dir, String baseUrl, List<String> keywords) {
this.name = name;
this.domainName = domainName;
this.dir = dir;
this.baseUrl = baseUrl;
this.keywords = keywords;
}
public DirtreeSideloadSpec() {
}
public String getName() {
return this.name;
}
public String getDomainName() {
return this.domainName;
}
public String getDir() {
return this.dir;
}
public String getBaseUrl() {
return this.baseUrl;
}
public List<String> getKeywords() {
return this.keywords;
}
public void setName(String name) {
this.name = name;
}
public void setDomainName(String domainName) {
this.domainName = domainName;
}
public void setDir(String dir) {
this.dir = dir;
}
public void setBaseUrl(String baseUrl) {
this.baseUrl = baseUrl;
}
public void setKeywords(List<String> keywords) {
this.keywords = keywords;
}
} }

View File

@ -1,14 +1,22 @@
package nu.marginalia.converting.sideload.dirtree; package nu.marginalia.converting.sideload.dirtree;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.NoArgsConstructor;
import lombok.Setter;
import java.util.List; import java.util.List;
@AllArgsConstructor @NoArgsConstructor
@Setter @Getter
class DirtreeSideloadSpecList { class DirtreeSideloadSpecList {
public List<DirtreeSideloadSpec> sources; public List<DirtreeSideloadSpec> sources;
public DirtreeSideloadSpecList(List<DirtreeSideloadSpec> sources) {
this.sources = sources;
}
public DirtreeSideloadSpecList() {
}
public List<DirtreeSideloadSpec> getSources() {
return this.sources;
}
public void setSources(List<DirtreeSideloadSpec> sources) {
this.sources = sources;
}
} }

View File

@ -1,6 +1,5 @@
package nu.marginalia.converting.sideload.dirtree; package nu.marginalia.converting.sideload.dirtree;
import lombok.SneakyThrows;
import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.converting.model.GeneratorType; import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDocument;
@ -13,6 +12,7 @@ import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.crawl.DomainIndexingState;
import java.io.IOException; import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.time.LocalDate; import java.time.LocalDate;
@ -72,24 +72,28 @@ public class DirtreeSideloader implements SideloadSource, AutoCloseable {
return name.endsWith(".html") || name.endsWith(".htm"); return name.endsWith(".html") || name.endsWith(".htm");
} }
@SneakyThrows
private ProcessedDocument process(Path path) { private ProcessedDocument process(Path path) {
String body = Files.readString(path); try {
String url = urlBase + dirBase.relativize(path); String body = Files.readString(path);
String url = urlBase + dirBase.relativize(path);
// We trim "/index.html"-suffixes from the index if they are present, // We trim "/index.html"-suffixes from the index if they are present,
// since this is typically an artifact from document retrieval // since this is typically an artifact from document retrieval
if (url.endsWith("/index.html")) { if (url.endsWith("/index.html")) {
url = url.substring(0, url.length() - "index.html".length()); url = url.substring(0, url.length() - "index.html".length());
}
return sideloaderProcessing
.processDocument(url, body, extraKeywords, new DomainLinks(),
GeneratorType.DOCS,
DocumentClass.NORMAL,
new LinkTexts(),
LocalDate.now().getYear(),
10_000);
}
catch (IOException | URISyntaxException e) {
throw new RuntimeException(e);
} }
return sideloaderProcessing
.processDocument(url, body, extraKeywords, new DomainLinks(),
GeneratorType.DOCS,
DocumentClass.NORMAL,
new LinkTexts(),
LocalDate.now().getYear(),
10_000);
} }
@Override @Override

View File

@ -3,7 +3,6 @@ package nu.marginalia.converting.sideload.encyclopedia;
import com.github.luben.zstd.ZstdInputStream; import com.github.luben.zstd.ZstdInputStream;
import com.google.common.base.Charsets; import com.google.common.base.Charsets;
import com.google.gson.Gson; import com.google.gson.Gson;
import lombok.SneakyThrows;
import nu.marginalia.atags.AnchorTextKeywords; import nu.marginalia.atags.AnchorTextKeywords;
import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.atags.source.AnchorTagsSourceFactory;
@ -78,7 +77,6 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
return ret; return ret;
} }
@SneakyThrows
@Override @Override
public Iterator<ProcessedDocument> getDocumentsStream() { public Iterator<ProcessedDocument> getDocumentsStream() {
// This leaks a thread pool, but it doesn't matter since this is a one-off process // This leaks a thread pool, but it doesn't matter since this is a one-off process

View File

@ -1,6 +1,5 @@
package nu.marginalia.converting.sideload.stackexchange; package nu.marginalia.converting.sideload.stackexchange;
import lombok.SneakyThrows;
import nu.marginalia.converting.model.GeneratorType; import nu.marginalia.converting.model.GeneratorType;
import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDocumentDetails; import nu.marginalia.converting.model.ProcessedDocumentDetails;
@ -39,15 +38,19 @@ public class StackexchangeSideloader implements SideloadSource {
private final Path dbFile; private final Path dbFile;
@SneakyThrows
public StackexchangeSideloader(Path pathToDbFile, public StackexchangeSideloader(Path pathToDbFile,
ThreadLocalSentenceExtractorProvider sentenceExtractorProvider, ThreadLocalSentenceExtractorProvider sentenceExtractorProvider,
DocumentKeywordExtractor keywordExtractor DocumentKeywordExtractor keywordExtractor
) { ) {
this.dbFile = pathToDbFile; try {
this.domainName = StackExchangePostsDb.getDomainName(pathToDbFile); this.dbFile = pathToDbFile;
this.sentenceExtractorProvider = sentenceExtractorProvider; this.domainName = StackExchangePostsDb.getDomainName(pathToDbFile);
this.keywordExtractor = keywordExtractor; this.sentenceExtractorProvider = sentenceExtractorProvider;
this.keywordExtractor = keywordExtractor;
}
catch (Exception e) {
throw new RuntimeException(e);
}
} }
@Override @Override
@ -80,12 +83,16 @@ public class StackexchangeSideloader implements SideloadSource {
ProcessedDocument nextModel = null; ProcessedDocument nextModel = null;
@SneakyThrows
@Override @Override
public boolean hasNext() { public boolean hasNext() {
if (nextModel != null) if (nextModel != null)
return true; return true;
nextModel = postsReader.next(); try {
nextModel = postsReader.next();
}
catch (InterruptedException e) {
Thread.currentThread().interrupt();
}
return nextModel != null; return nextModel != null;
} }
@ -103,7 +110,6 @@ public class StackexchangeSideloader implements SideloadSource {
}; };
} }
@SneakyThrows
private ProcessedDocument convert(StackExchangePostsDb.CombinedPostModel post) { private ProcessedDocument convert(StackExchangePostsDb.CombinedPostModel post) {
String fullUrl = "https://" + domainName + "/questions/" + post.threadId(); String fullUrl = "https://" + domainName + "/questions/" + post.threadId();
@ -163,7 +169,7 @@ public class StackexchangeSideloader implements SideloadSource {
ret.stateReason = "SIDELOAD"; ret.stateReason = "SIDELOAD";
} }
catch (Exception e) { catch (Exception e) {
ret.url = new EdgeUrl(fullUrl); ret.url = EdgeUrl.parse(fullUrl).orElseThrow();
ret.state = UrlIndexingState.DISQUALIFIED; ret.state = UrlIndexingState.DISQUALIFIED;
ret.stateReason = "SIDELOAD"; ret.stateReason = "SIDELOAD";
} }
@ -186,9 +192,14 @@ public class StackexchangeSideloader implements SideloadSource {
} }
} }
@SneakyThrows
private boolean enqueue(StackExchangePostsDb.CombinedPostModel model) { private boolean enqueue(StackExchangePostsDb.CombinedPostModel model) {
pool.submit(() -> results.put(convert(model))); try {
pool.submit(() -> results.put(convert(model)));
}
catch (InterruptedException e) {
Thread.currentThread().interrupt();
return false;
}
return true; return true;
} }

View File

@ -1,6 +1,5 @@
package nu.marginalia.converting.sideload.warc; package nu.marginalia.converting.sideload.warc;
import lombok.SneakyThrows;
import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.atags.model.DomainLinks;
import nu.marginalia.contenttype.ContentTypeParser; import nu.marginalia.contenttype.ContentTypeParser;
import nu.marginalia.contenttype.DocumentBodyToString; import nu.marginalia.contenttype.DocumentBodyToString;
@ -38,17 +37,20 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
private final EdgeDomain domain; private final EdgeDomain domain;
@SneakyThrows
public WarcSideloader(Path warcFile, public WarcSideloader(Path warcFile,
SideloaderProcessing sideloaderProcessing) SideloaderProcessing sideloaderProcessing)
{ {
this.sideloaderProcessing = sideloaderProcessing; try {
this.reader = new WarcReader(warcFile); this.sideloaderProcessing = sideloaderProcessing;
this.domain = sniffDomainFromWarc() this.reader = new WarcReader(warcFile);
.orElseThrow(() -> new IOException("Could not identify domain from warc file")); this.domain = sniffDomainFromWarc()
.orElseThrow(() -> new IOException("Could not identify domain from warc file"));
}
catch (IOException e) {
throw new RuntimeException(e);
}
} }
@SneakyThrows
@Override @Override
public ProcessedDomain getDomain() { public ProcessedDomain getDomain() {
var ret = new ProcessedDomain(); var ret = new ProcessedDomain();
@ -81,7 +83,6 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
return Optional.empty(); return Optional.empty();
} }
@SneakyThrows
@Override @Override
public Iterator<ProcessedDocument> getDocumentsStream() { public Iterator<ProcessedDocument> getDocumentsStream() {
return reader.records() return reader.records()
@ -111,13 +112,12 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
return true; return true;
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); logger.warn("Failed to process response", e);
} }
return false; return false;
} }
@SneakyThrows
private Optional<ProcessedDocument> process(WarcResponse response) { private Optional<ProcessedDocument> process(WarcResponse response) {
Optional<String> body = getBody(response); Optional<String> body = getBody(response);
String url = response.target(); String url = response.target();
@ -132,33 +132,46 @@ public class WarcSideloader implements SideloadSource, AutoCloseable {
return Optional.empty(); return Optional.empty();
} }
return Optional.of(sideloaderProcessing try {
.processDocument(url, return Optional.of(sideloaderProcessing
body.get(), .processDocument(url,
List.of(), body.get(),
new DomainLinks(), List.of(),
GeneratorType.DOCS, new DomainLinks(),
DocumentClass.SIDELOAD, GeneratorType.DOCS,
new LinkTexts(), DocumentClass.SIDELOAD,
LocalDate.now().getYear(), // TODO: This should be the actual year of the document new LinkTexts(),
10_000)); LocalDate.now().getYear(), // TODO: This should be the actual year of the document
10_000));
}
catch (Exception e) {
logger.warn("Failed to process document", e);
return Optional.empty();
}
} }
@SneakyThrows
private Optional<String> getBody(WarcResponse response) { private Optional<String> getBody(WarcResponse response) {
var http = response.http();
// TODO: We should support additional encodings here try {
try (var body = http.body()) { var http = response.http();
String contentType = http.headers().first("Content-Type").orElse(null);
byte[] bytes = body.stream().readAllBytes();
var ct = ContentTypeParser.parseContentType(contentType, bytes);
return Optional.of(DocumentBodyToString.getStringData(ct, bytes)); // TODO: We should support additional encodings here
try (var body = http.body()) {
String contentType = http.headers().first("Content-Type").orElse(null);
byte[] bytes = body.stream().readAllBytes();
var ct = ContentTypeParser.parseContentType(contentType, bytes);
return Optional.of(DocumentBodyToString.getStringData(ct, bytes));
}
catch (Exception ex) {
logger.info("Failed to parse body", ex);
}
} }
catch (Exception ex) { catch (Exception e) {
logger.info("Failed to parse body", ex); logger.warn("Failed to process response", e);
} }
return Optional.empty(); return Optional.empty();
} }

View File

@ -1,6 +1,5 @@
package nu.marginalia.converting.writer; package nu.marginalia.converting.writer;
import lombok.SneakyThrows;
import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDocument;
import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.model.ProcessedDomain;
import nu.marginalia.converting.sideload.SideloadSource; import nu.marginalia.converting.sideload.SideloadSource;
@ -61,7 +60,6 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
} }
@Override @Override
@SneakyThrows
public void writeProcessedDomain(ProcessedDomain domain) { public void writeProcessedDomain(ProcessedDomain domain) {
try { try {
if (domain.documents != null) { if (domain.documents != null) {

View File

@ -1,6 +1,5 @@
package nu.marginalia.converting.writer; package nu.marginalia.converting.writer;
import lombok.SneakyThrows;
import nu.marginalia.worklog.BatchingWorkLog; import nu.marginalia.worklog.BatchingWorkLog;
import org.jetbrains.annotations.Nullable; import org.jetbrains.annotations.Nullable;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -40,48 +39,55 @@ public class ConverterWriter implements AutoCloseable {
workerThread.start(); workerThread.start();
} }
@SneakyThrows
public void accept(@Nullable ConverterBatchWritableIf domain) { public void accept(@Nullable ConverterBatchWritableIf domain) {
if (null == domain) if (null == domain)
return; return;
domainData.put(domain); try {
} domainData.put(domain);
}
@SneakyThrows catch (InterruptedException e) {
private void writerThread() { throw new RuntimeException(e);
IntervalAction switcher = new IntervalAction(this::switchBatch, switchInterval); }
}
currentWriter = new ConverterBatchWriter(basePath, workLog.getBatchNumber());
private void writerThread() {
while (running || !domainData.isEmpty()) { try {
// poll with a timeout so we have an IntervalAction switcher = new IntervalAction(this::switchBatch, switchInterval);
// opportunity to check the running condition
// ... we could interrupt the thread as well, but currentWriter = new ConverterBatchWriter(basePath, workLog.getBatchNumber());
// as we enter third party code it's difficult to guarantee it will deal
// well with being interrupted while (running || !domainData.isEmpty()) {
var data = domainData.poll(1, TimeUnit.SECONDS); // poll with a timeout so we have an
// opportunity to check the running condition
if (data == null) // ... we could interrupt the thread as well, but
continue; // as we enter third party code it's difficult to guarantee it will deal
// well with being interrupted
String id = data.id(); var data = domainData.poll(1, TimeUnit.SECONDS);
if (workLog.isItemCommitted(id) || workLog.isItemInCurrentBatch(id)) { if (data == null)
logger.warn("Skipping already logged item {}", id); continue;
data.close();
continue; String id = data.id();
}
if (workLog.isItemCommitted(id) || workLog.isItemInCurrentBatch(id)) {
currentWriter.write(data); logger.warn("Skipping already logged item {}", id);
data.close();
workLog.logItem(id); continue;
}
switcher.tick();
currentWriter.write(data);
workLog.logItem(id);
switcher.tick();
}
}
catch (Exception ex) {
logger.error("Writer thread failed", ex);
} }
} }
@SneakyThrows
public boolean switchBatch() { public boolean switchBatch() {
if (workLog.isCurrentBatchEmpty()) { if (workLog.isCurrentBatchEmpty()) {
// Nothing to commit // Nothing to commit
@ -89,13 +95,18 @@ public class ConverterWriter implements AutoCloseable {
} }
// order matters here try {
currentWriter.close(); // order matters here
workLog.logFinishedBatch(); currentWriter.close();
logger.info("Switching to batch {}", workLog.getBatchNumber()); workLog.logFinishedBatch();
currentWriter = new ConverterBatchWriter(basePath, workLog.getBatchNumber()); logger.info("Switching to batch {}", workLog.getBatchNumber());
currentWriter = new ConverterBatchWriter(basePath, workLog.getBatchNumber());
return true; return true;
}
catch (Exception ex) {
throw new RuntimeException(ex);
}
} }
@Override @Override

View File

@ -1,7 +1,6 @@
package nu.marginalia.integration.reddit.db; package nu.marginalia.integration.reddit.db;
import com.google.common.base.Strings; import com.google.common.base.Strings;
import lombok.SneakyThrows;
import nu.marginalia.integration.reddit.RedditEntryReader; import nu.marginalia.integration.reddit.RedditEntryReader;
import nu.marginalia.integration.reddit.model.ProcessableRedditComment; import nu.marginalia.integration.reddit.model.ProcessableRedditComment;
import nu.marginalia.integration.reddit.model.ProcessableRedditSubmission; import nu.marginalia.integration.reddit.model.ProcessableRedditSubmission;
@ -175,28 +174,35 @@ public class RedditDb {
stmt.close(); stmt.close();
} }
@SneakyThrows
@Override @Override
public boolean hasNext() { public boolean hasNext() {
if (hasNext != null) if (hasNext != null)
return hasNext; return hasNext;
hasNext = resultSet.next(); try {
hasNext = resultSet.next();
}
catch (SQLException ex) {
throw new RuntimeException(ex);
}
return hasNext; return hasNext;
} }
abstract T nextFromResultSet(ResultSet resultSet) throws SQLException; abstract T nextFromResultSet(ResultSet resultSet) throws SQLException;
@SneakyThrows
@Override @Override
public T next() { public T next() {
if (!hasNext()) if (!hasNext())
throw new IllegalStateException(); throw new IllegalStateException();
else hasNext = null; else hasNext = null;
return nextFromResultSet(resultSet); try {
return nextFromResultSet(resultSet);
}
catch (SQLException ex) {
throw new RuntimeException(ex);
}
} }
} }

View File

@ -1,12 +1,9 @@
package nu.marginalia.integration.reddit.model; package nu.marginalia.integration.reddit.model;
import lombok.AllArgsConstructor; /**
import lombok.ToString; * A projection of a Reddit comment joined with its top level submission
* that is ready for processing.
/** A projection of a Reddit comment joined with its top level submission */
* that is ready for processing. */
@AllArgsConstructor
@ToString
public class ProcessableRedditComment { public class ProcessableRedditComment {
public String subreddit; public String subreddit;
public String name; public String name;
@ -16,4 +13,19 @@ public class ProcessableRedditComment {
public int created_utc; public int created_utc;
public String permalink; public String permalink;
public int score; public int score;
/**
 * Creates an instance with every field set from the argument of the same name.
 * Values are stored as given; nothing is validated or copied.
 */
public ProcessableRedditComment(String subreddit, String name, String author, String title, String body, int created_utc, String permalink, int score) {
    // Numeric fields
    this.created_utc = created_utc;
    this.score = score;

    // Text fields
    this.subreddit = subreddit;
    this.name = name;
    this.author = author;
    this.title = title;
    this.body = body;
    this.permalink = permalink;
}
/**
 * Renders all fields as {@code ProcessableRedditComment(field=value, ...)}.
 *
 * @return the field dump, in declaration order
 */
@Override
public String toString() {
    return "ProcessableRedditComment(subreddit=" + this.subreddit
            + ", name=" + this.name
            + ", author=" + this.author
            + ", title=" + this.title
            + ", body=" + this.body
            + ", created_utc=" + this.created_utc
            + ", permalink=" + this.permalink
            + ", score=" + this.score + ")";
}
} }

View File

@ -1,10 +1,8 @@
package nu.marginalia.integration.reddit.model; package nu.marginalia.integration.reddit.model;
import lombok.AllArgsConstructor; /**
import lombok.ToString; * A projection of a Reddit top level submission that is appropriate for processing.
*/
/** A projection of a Reddit top level submission that is appropriate for processing. */
@AllArgsConstructor @ToString
public class ProcessableRedditSubmission { public class ProcessableRedditSubmission {
public String subreddit; public String subreddit;
public String name; public String name;
@ -14,4 +12,19 @@ public class ProcessableRedditSubmission {
public int created_utc; public int created_utc;
public String permalink; public String permalink;
public int score; public int score;
/**
 * Creates an instance with every field set from the argument of the same name.
 * Values are stored as given; nothing is validated or copied.
 */
public ProcessableRedditSubmission(String subreddit, String name, String author, String title, String selftext, int created_utc, String permalink, int score) {
    // Numeric fields
    this.created_utc = created_utc;
    this.score = score;

    // Text fields
    this.subreddit = subreddit;
    this.name = name;
    this.author = author;
    this.title = title;
    this.selftext = selftext;
    this.permalink = permalink;
}
/**
 * Renders all fields as {@code ProcessableRedditSubmission(field=value, ...)}.
 *
 * @return the field dump, in declaration order
 */
@Override
public String toString() {
    return "ProcessableRedditSubmission(subreddit=" + this.subreddit
            + ", name=" + this.name
            + ", author=" + this.author
            + ", title=" + this.title
            + ", selftext=" + this.selftext
            + ", created_utc=" + this.created_utc
            + ", permalink=" + this.permalink
            + ", score=" + this.score + ")";
}
} }

View File

@ -1,14 +1,9 @@
package nu.marginalia.integration.reddit.model; package nu.marginalia.integration.reddit.model;
import lombok.AllArgsConstructor; /**
import lombok.ToString; * Corresponds directly to the pushshift.io Reddit comment JSON format.
import lombok.With; */
/** Corresponds directly to the pushshift.io Reddit comment JSON format. */
@AllArgsConstructor
@ToString
@With
public class RawRedditComment { public class RawRedditComment {
public String parent_id; public String parent_id;
public String link_id; public String link_id;
@ -17,4 +12,46 @@ public class RawRedditComment {
public String body; public String body;
public String subreddit; public String subreddit;
public int score; public int score;
/**
 * Creates an instance with every field set from the argument of the same name.
 * Values are stored as given; nothing is validated or copied.
 */
public RawRedditComment(String parent_id, String link_id, String id, String author, String body, String subreddit, int score) {
    // Identifiers
    this.id = id;
    this.parent_id = parent_id;
    this.link_id = link_id;

    // Content
    this.subreddit = subreddit;
    this.author = author;
    this.body = body;
    this.score = score;
}
/**
 * Returns a comment whose {@code parent_id} is the given value.
 * If the argument is the very same reference as the current value
 * (compared with {@code ==}, not {@code equals}), this instance is returned;
 * otherwise a new instance is created with all other fields carried over.
 */
public RawRedditComment withParent_id(String parent_id) {
    if (this.parent_id == parent_id) {
        return this;
    }
    return new RawRedditComment(
            parent_id, this.link_id, this.id, this.author,
            this.body, this.subreddit, this.score);
}
/**
 * Returns a comment whose {@code link_id} is the given value.
 * If the argument is the very same reference as the current value
 * (compared with {@code ==}, not {@code equals}), this instance is returned;
 * otherwise a new instance is created with all other fields carried over.
 */
public RawRedditComment withLink_id(String link_id) {
    if (this.link_id == link_id) {
        return this;
    }
    return new RawRedditComment(
            this.parent_id, link_id, this.id, this.author,
            this.body, this.subreddit, this.score);
}
/**
 * Returns a comment whose {@code id} is the given value.
 * If the argument is the very same reference as the current value
 * (compared with {@code ==}, not {@code equals}), this instance is returned;
 * otherwise a new instance is created with all other fields carried over.
 */
public RawRedditComment withId(String id) {
    if (this.id == id) {
        return this;
    }
    return new RawRedditComment(
            this.parent_id, this.link_id, id, this.author,
            this.body, this.subreddit, this.score);
}
/**
 * Returns a comment whose {@code author} is the given value.
 * If the argument is the very same reference as the current value
 * (compared with {@code ==}, not {@code equals}), this instance is returned;
 * otherwise a new instance is created with all other fields carried over.
 */
public RawRedditComment withAuthor(String author) {
    if (this.author == author) {
        return this;
    }
    return new RawRedditComment(
            this.parent_id, this.link_id, this.id, author,
            this.body, this.subreddit, this.score);
}
/**
 * Returns a comment whose {@code body} is the given value.
 * If the argument is the very same reference as the current value
 * (compared with {@code ==}, not {@code equals}), this instance is returned;
 * otherwise a new instance is created with all other fields carried over.
 */
public RawRedditComment withBody(String body) {
    if (this.body == body) {
        return this;
    }
    return new RawRedditComment(
            this.parent_id, this.link_id, this.id, this.author,
            body, this.subreddit, this.score);
}
/**
 * Returns a comment whose {@code subreddit} is the given value.
 * If the argument is the very same reference as the current value
 * (compared with {@code ==}, not {@code equals}), this instance is returned;
 * otherwise a new instance is created with all other fields carried over.
 */
public RawRedditComment withSubreddit(String subreddit) {
    if (this.subreddit == subreddit) {
        return this;
    }
    return new RawRedditComment(
            this.parent_id, this.link_id, this.id, this.author,
            this.body, subreddit, this.score);
}
/**
 * Returns a comment whose {@code score} is the given value.
 * If the score is unchanged this instance is returned; otherwise a new
 * instance is created with all other fields carried over.
 */
public RawRedditComment withScore(int score) {
    if (this.score == score) {
        return this;
    }
    return new RawRedditComment(
            this.parent_id, this.link_id, this.id, this.author,
            this.body, this.subreddit, score);
}
/**
 * Renders all fields as {@code RawRedditComment(field=value, ...)}.
 *
 * @return the field dump, in declaration order
 */
@Override
public String toString() {
    return "RawRedditComment(parent_id=" + this.parent_id
            + ", link_id=" + this.link_id
            + ", id=" + this.id
            + ", author=" + this.author
            + ", body=" + this.body
            + ", subreddit=" + this.subreddit
            + ", score=" + this.score + ")";
}
} }

View File

@ -1,14 +1,9 @@
package nu.marginalia.integration.reddit.model; package nu.marginalia.integration.reddit.model;
import lombok.AllArgsConstructor; /**
import lombok.ToString; * Corresponds directly to the pushshift.io Reddit submission JSON format.
import lombok.With; */
/** Corresponds directly to the pushshift.io Reddit submission JSON format. */
@AllArgsConstructor
@With
@ToString
public class RawRedditSubmission { public class RawRedditSubmission {
public int score; public int score;
public String subreddit; public String subreddit;
@ -19,4 +14,56 @@ public class RawRedditSubmission {
public int num_comments; public int num_comments;
public int created_utc; public int created_utc;
public String permalink; public String permalink;
/**
 * Creates an instance with every field set from the argument of the same name.
 * Values are stored as given; nothing is validated or copied.
 */
public RawRedditSubmission(int score, String subreddit, String name, String author, String title, String selftext, int num_comments, int created_utc, String permalink) {
    // Numeric fields
    this.score = score;
    this.num_comments = num_comments;
    this.created_utc = created_utc;

    // Text fields
    this.subreddit = subreddit;
    this.name = name;
    this.author = author;
    this.title = title;
    this.selftext = selftext;
    this.permalink = permalink;
}
/**
 * Returns a submission whose {@code score} is the given value.
 * If the score is unchanged this instance is returned; otherwise a new
 * instance is created with all other fields carried over.
 */
public RawRedditSubmission withScore(int score) {
    if (this.score == score) {
        return this;
    }
    return new RawRedditSubmission(
            score, this.subreddit, this.name, this.author, this.title,
            this.selftext, this.num_comments, this.created_utc, this.permalink);
}
/**
 * Returns a submission whose {@code subreddit} is the given value.
 * If the argument is the very same reference as the current value
 * (compared with {@code ==}, not {@code equals}), this instance is returned;
 * otherwise a new instance is created with all other fields carried over.
 */
public RawRedditSubmission withSubreddit(String subreddit) {
    if (this.subreddit == subreddit) {
        return this;
    }
    return new RawRedditSubmission(
            this.score, subreddit, this.name, this.author, this.title,
            this.selftext, this.num_comments, this.created_utc, this.permalink);
}
/**
 * Returns a submission whose {@code name} is the given value.
 * If the argument is the very same reference as the current value
 * (compared with {@code ==}, not {@code equals}), this instance is returned;
 * otherwise a new instance is created with all other fields carried over.
 */
public RawRedditSubmission withName(String name) {
    if (this.name == name) {
        return this;
    }
    return new RawRedditSubmission(
            this.score, this.subreddit, name, this.author, this.title,
            this.selftext, this.num_comments, this.created_utc, this.permalink);
}
/**
 * Returns a submission whose {@code author} is the given value.
 * If the argument is the very same reference as the current value
 * (compared with {@code ==}, not {@code equals}), this instance is returned;
 * otherwise a new instance is created with all other fields carried over.
 */
public RawRedditSubmission withAuthor(String author) {
    if (this.author == author) {
        return this;
    }
    return new RawRedditSubmission(
            this.score, this.subreddit, this.name, author, this.title,
            this.selftext, this.num_comments, this.created_utc, this.permalink);
}
/**
 * Returns a submission whose {@code title} is the given value.
 * If the argument is the very same reference as the current value
 * (compared with {@code ==}, not {@code equals}), this instance is returned;
 * otherwise a new instance is created with all other fields carried over.
 */
public RawRedditSubmission withTitle(String title) {
    if (this.title == title) {
        return this;
    }
    return new RawRedditSubmission(
            this.score, this.subreddit, this.name, this.author, title,
            this.selftext, this.num_comments, this.created_utc, this.permalink);
}
/**
 * Returns a submission whose {@code selftext} is the given value.
 * If the argument is the very same reference as the current value
 * (compared with {@code ==}, not {@code equals}), this instance is returned;
 * otherwise a new instance is created with all other fields carried over.
 */
public RawRedditSubmission withSelftext(String selftext) {
    if (this.selftext == selftext) {
        return this;
    }
    return new RawRedditSubmission(
            this.score, this.subreddit, this.name, this.author, this.title,
            selftext, this.num_comments, this.created_utc, this.permalink);
}
/**
 * Returns a submission whose {@code num_comments} is the given value.
 * If the value is unchanged this instance is returned; otherwise a new
 * instance is created with all other fields carried over.
 */
public RawRedditSubmission withNum_comments(int num_comments) {
    if (this.num_comments == num_comments) {
        return this;
    }
    return new RawRedditSubmission(
            this.score, this.subreddit, this.name, this.author, this.title,
            this.selftext, num_comments, this.created_utc, this.permalink);
}
/**
 * Returns a submission whose {@code created_utc} is the given value.
 * If the value is unchanged this instance is returned; otherwise a new
 * instance is created with all other fields carried over.
 */
public RawRedditSubmission withCreated_utc(int created_utc) {
    if (this.created_utc == created_utc) {
        return this;
    }
    return new RawRedditSubmission(
            this.score, this.subreddit, this.name, this.author, this.title,
            this.selftext, this.num_comments, created_utc, this.permalink);
}
/**
 * Returns a submission whose {@code permalink} is the given value.
 * If the argument is the very same reference as the current value
 * (compared with {@code ==}, not {@code equals}), this instance is returned;
 * otherwise a new instance is created with all other fields carried over.
 */
public RawRedditSubmission withPermalink(String permalink) {
    if (this.permalink == permalink) {
        return this;
    }
    return new RawRedditSubmission(
            this.score, this.subreddit, this.name, this.author, this.title,
            this.selftext, this.num_comments, this.created_utc, permalink);
}
/**
 * Renders all fields as {@code RawRedditSubmission(field=value, ...)}.
 *
 * @return the field dump, in declaration order
 */
@Override
public String toString() {
    return "RawRedditSubmission(score=" + this.score
            + ", subreddit=" + this.subreddit
            + ", name=" + this.name
            + ", author=" + this.author
            + ", title=" + this.title
            + ", selftext=" + this.selftext
            + ", num_comments=" + this.num_comments
            + ", created_utc=" + this.created_utc
            + ", permalink=" + this.permalink + ")";
}
} }

View File

@ -3,7 +3,6 @@ package nu.marginalia.integration.stackexchange.sqlite;
import com.github.luben.zstd.Zstd; import com.github.luben.zstd.Zstd;
import gnu.trove.list.TIntList; import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList; import gnu.trove.list.array.TIntArrayList;
import lombok.SneakyThrows;
import nu.marginalia.integration.stackexchange.xml.StackExchangeXmlPostReader; import nu.marginalia.integration.stackexchange.xml.StackExchangeXmlPostReader;
import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamException;
@ -15,6 +14,7 @@ import java.sql.ResultSet;
import java.sql.SQLException; import java.sql.SQLException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ForkJoinPool; import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.Future; import java.util.concurrent.Future;
import java.util.function.Predicate; import java.util.function.Predicate;
@ -32,10 +32,9 @@ import java.util.function.Predicate;
public class StackExchangePostsDb { public class StackExchangePostsDb {
/** Construct a SQLIte file containing the Posts in the stack exchange-style 7z file */ /** Construct a SQLIte file containing the Posts in the stack exchange-style 7z file */
@SneakyThrows
public static void create(String domain, public static void create(String domain,
Path sqliteFile, Path sqliteFile,
Path stackExchange7zFile) { Path stackExchange7zFile) throws IOException {
Files.deleteIfExists(sqliteFile); Files.deleteIfExists(sqliteFile);
String connStr = "jdbc:sqlite:" + sqliteFile; String connStr = "jdbc:sqlite:" + sqliteFile;
@ -115,7 +114,6 @@ public class StackExchangePostsDb {
* necessary as stackexchange's entry count exceeds the ~67 million entries that UrlIdCodec can encode * necessary as stackexchange's entry count exceeds the ~67 million entries that UrlIdCodec can encode
* for a single domain, despite having less than 67 million 'threads'. * for a single domain, despite having less than 67 million 'threads'.
* */ * */
@SneakyThrows
public static void forEachPost( public static void forEachPost(
Path sqliteFile, Path sqliteFile,
Predicate<CombinedPostModel> consumer) { Predicate<CombinedPostModel> consumer) {
@ -189,8 +187,8 @@ public class StackExchangePostsDb {
} }
} }
catch (SQLException ex) { catch (SQLException | InterruptedException | ExecutionException ex) {
ex.printStackTrace(); throw new RuntimeException(ex);
} }
} }

View File

@ -1,7 +1,5 @@
package nu.marginalia.integration.stackexchange.xml; package nu.marginalia.integration.stackexchange.xml;
import lombok.SneakyThrows;
import javax.xml.stream.XMLEventReader; import javax.xml.stream.XMLEventReader;
import javax.xml.stream.events.XMLEvent; import javax.xml.stream.events.XMLEvent;
import java.util.Iterator; import java.util.Iterator;
@ -22,25 +20,29 @@ class StackExchangeXmlIterator<T> implements Iterator<T> {
this.parser = parser; this.parser = parser;
} }
@SneakyThrows
@Override @Override
public boolean hasNext() { public boolean hasNext() {
if (next != null) if (next != null)
return true; return true;
while (xmlReader.hasNext()) { try {
XMLEvent event = xmlReader.nextEvent(); while (xmlReader.hasNext()) {
XMLEvent event = xmlReader.nextEvent();
if (!event.isStartElement()) if (!event.isStartElement())
continue; continue;
next = parser.apply(event); next = parser.apply(event);
if (next != null) if (next != null)
return true; return true;
}
readerSource.close();
}
catch (Exception ex) {
throw new RuntimeException(ex);
} }
readerSource.close();
return false; return false;
} }

View File

@ -1,6 +1,5 @@
package nu.marginalia.model.processed; package nu.marginalia.model.processed;
import lombok.Builder;
import nu.marginalia.sequence.VarintCodedSequence; import nu.marginalia.sequence.VarintCodedSequence;
import nu.marginalia.sequence.slop.VarintCodedSequenceArrayColumn; import nu.marginalia.sequence.slop.VarintCodedSequenceArrayColumn;
import nu.marginalia.slop.SlopTable; import nu.marginalia.slop.SlopTable;
@ -52,7 +51,6 @@ public record SlopDocumentRecord(
throw new IllegalArgumentException("Metas, words and positions must have the same length"); throw new IllegalArgumentException("Metas, words and positions must have the same length");
} }
@Builder
public record KeywordsProjection( public record KeywordsProjection(
String domain, String domain,
int ordinal, int ordinal,
@ -63,8 +61,11 @@ public record SlopDocumentRecord(
byte[] metas, byte[] metas,
List<VarintCodedSequence> positions, List<VarintCodedSequence> positions,
byte[] spanCodes, byte[] spanCodes,
List<VarintCodedSequence> spans) List<VarintCodedSequence> spans) {
{ public static KeywordsProjectionBuilder builder() {
return new KeywordsProjectionBuilder();
}
// Override the equals method since records don't generate default equals that deal with array fields properly // Override the equals method since records don't generate default equals that deal with array fields properly
@Override @Override
public boolean equals(Object o) { public boolean equals(Object o) {
@ -88,6 +89,80 @@ public record SlopDocumentRecord(
result = 31 * result + Objects.hashCode(spans); result = 31 * result + Objects.hashCode(spans);
return result; return result;
} }
/**
 * Mutable fluent builder for {@code KeywordsProjection}.
 * <p>
 * Each setter stores its argument by reference (lists and byte arrays are
 * not copied) and returns this builder. Components that are never set keep
 * their Java default ({@code null} or {@code 0}) and are passed unchanged
 * to the record constructor by {@link #build()}.
 */
public static class KeywordsProjectionBuilder {
    private String domain;
    private int ordinal;
    private int htmlFeatures;
    private long documentMetadata;
    private int length;
    private List<String> words;
    private byte[] metas;
    private List<VarintCodedSequence> positions;
    private byte[] spanCodes;
    private List<VarintCodedSequence> spans;

    KeywordsProjectionBuilder() {
    }

    public KeywordsProjectionBuilder domain(String domain) {
        this.domain = domain;
        return this;
    }

    public KeywordsProjectionBuilder ordinal(int ordinal) {
        this.ordinal = ordinal;
        return this;
    }

    public KeywordsProjectionBuilder htmlFeatures(int htmlFeatures) {
        this.htmlFeatures = htmlFeatures;
        return this;
    }

    public KeywordsProjectionBuilder documentMetadata(long documentMetadata) {
        this.documentMetadata = documentMetadata;
        return this;
    }

    public KeywordsProjectionBuilder length(int length) {
        this.length = length;
        return this;
    }

    public KeywordsProjectionBuilder words(List<String> words) {
        this.words = words;
        return this;
    }

    public KeywordsProjectionBuilder metas(byte[] metas) {
        this.metas = metas;
        return this;
    }

    public KeywordsProjectionBuilder positions(List<VarintCodedSequence> positions) {
        this.positions = positions;
        return this;
    }

    public KeywordsProjectionBuilder spanCodes(byte[] spanCodes) {
        this.spanCodes = spanCodes;
        return this;
    }

    public KeywordsProjectionBuilder spans(List<VarintCodedSequence> spans) {
        this.spans = spans;
        return this;
    }

    /**
     * Creates the record from the values collected so far.
     *
     * @return a new {@code KeywordsProjection}
     */
    public KeywordsProjection build() {
        return new KeywordsProjection(this.domain, this.ordinal, this.htmlFeatures, this.documentMetadata, this.length, this.words, this.metas, this.positions, this.spanCodes, this.spans);
    }

    /** Dumps the builder's current state; byte arrays are rendered with {@code Arrays.toString}. */
    @Override
    public String toString() {
        return "SlopDocumentRecord.KeywordsProjection.KeywordsProjectionBuilder(domain=" + this.domain + ", ordinal=" + this.ordinal + ", htmlFeatures=" + this.htmlFeatures + ", documentMetadata=" + this.documentMetadata + ", length=" + this.length + ", words=" + this.words + ", metas=" + Arrays.toString(this.metas) + ", positions=" + this.positions + ", spanCodes=" + Arrays.toString(this.spanCodes) + ", spans=" + this.spans + ")";
    }
}
} }
public record MetadataProjection( public record MetadataProjection(

View File

@ -2,7 +2,6 @@ package nu.marginalia.converting;
import com.google.inject.Guice; import com.google.inject.Guice;
import com.google.inject.Injector; import com.google.inject.Injector;
import lombok.SneakyThrows;
import nu.marginalia.UserAgent; import nu.marginalia.UserAgent;
import nu.marginalia.WmsaHome; import nu.marginalia.WmsaHome;
import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.model.ProcessedDomain;
@ -48,16 +47,14 @@ public class CrawlingThenConvertingIntegrationTest {
private Path fileName; private Path fileName;
private Path fileName2; private Path fileName2;
@SneakyThrows
@BeforeAll @BeforeAll
public static void setUpAll() { public static void setUpAll() {
// this must be done to avoid java inserting its own user agent for the sitemap requests // this must be done to avoid java inserting its own user agent for the sitemap requests
System.setProperty("http.agent", WmsaHome.getUserAgent().uaString()); System.setProperty("http.agent", WmsaHome.getUserAgent().uaString());
} }
@SneakyThrows
@BeforeEach @BeforeEach
public void setUp() { public void setUp() throws IOException {
Injector injector = Guice.createInjector( Injector injector = Guice.createInjector(
new ConvertingIntegrationTestModule() new ConvertingIntegrationTestModule()
); );

View File

@ -1,6 +1,5 @@
package nu.marginalia.converting.processor.summary; package nu.marginalia.converting.processor.summary;
import lombok.SneakyThrows;
import nu.marginalia.WmsaHome; import nu.marginalia.WmsaHome;
import nu.marginalia.converting.processor.summary.heuristic.*; import nu.marginalia.converting.processor.summary.heuristic.*;
import nu.marginalia.keyword.DocumentKeywordExtractor; import nu.marginalia.keyword.DocumentKeywordExtractor;
@ -36,8 +35,7 @@ class SummaryExtractorTest {
new FallbackHeuristic()); new FallbackHeuristic());
} }
@SneakyThrows Set<String> getImportantWords(Document doc) throws URISyntaxException {
Set<String> getImportantWords(Document doc) {
var dld = setenceExtractor.extractSentences(doc); var dld = setenceExtractor.extractSentences(doc);
var keywords = keywordExtractor.extractKeywords(dld, new LinkTexts(), new EdgeUrl( var keywords = keywordExtractor.extractKeywords(dld, new LinkTexts(), new EdgeUrl(
"https://www.marginalia.nu/" "https://www.marginalia.nu/"
@ -92,7 +90,7 @@ class SummaryExtractorTest {
} }
@Test @Test
void extractSurrey() throws IOException { void extractSurrey() throws IOException, URISyntaxException {
String html = readClassPathFile("html/summarization/surrey.html"); String html = readClassPathFile("html/summarization/surrey.html");
var doc = Jsoup.parse(html); var doc = Jsoup.parse(html);
String summary = summaryExtractor.extractSummary(doc, getImportantWords(doc)); String summary = summaryExtractor.extractSummary(doc, getImportantWords(doc));
@ -104,7 +102,7 @@ class SummaryExtractorTest {
} }
@Test @Test
void extractSurrey1() throws IOException { void extractSurrey1() throws IOException, URISyntaxException {
String html = readClassPathFile("html/summarization/surrey.html.1"); String html = readClassPathFile("html/summarization/surrey.html.1");
var doc = Jsoup.parse(html); var doc = Jsoup.parse(html);
String summary = summaryExtractor.extractSummary(doc, getImportantWords(doc)); String summary = summaryExtractor.extractSummary(doc, getImportantWords(doc));
@ -115,7 +113,7 @@ class SummaryExtractorTest {
} }
@Test @Test
void extract187() throws IOException { void extract187() throws IOException, URISyntaxException {
String html = readClassPathFile("html/summarization/187.shtml"); String html = readClassPathFile("html/summarization/187.shtml");
var doc = Jsoup.parse(html); var doc = Jsoup.parse(html);
String summary = summaryExtractor.extractSummary(doc, getImportantWords(doc)); String summary = summaryExtractor.extractSummary(doc, getImportantWords(doc));
@ -126,7 +124,7 @@ class SummaryExtractorTest {
} }
@Test @Test
void extractMonadnock() throws IOException { void extractMonadnock() throws IOException, URISyntaxException {
String html = readClassPathFile("html/monadnock.html"); String html = readClassPathFile("html/monadnock.html");
var doc = Jsoup.parse(html); var doc = Jsoup.parse(html);
@ -138,13 +136,16 @@ class SummaryExtractorTest {
} }
@Test @Test
public void testWorkSet() throws IOException { public void testWorkSet() throws IOException, URISyntaxException {
var workSet = readWorkSet(); var workSet = readWorkSet();
workSet.forEach((path, str) -> { for (Map.Entry<Path, String> entry : workSet.entrySet()) {
final Path path = entry.getKey();
final String str = entry.getValue();
var doc = Jsoup.parse(str); var doc = Jsoup.parse(str);
String summary = summaryExtractor.extractSummary(doc, getImportantWords(doc)); String summary = summaryExtractor.extractSummary(doc, getImportantWords(doc));
System.out.println(path + ": " + summary); System.out.println(path + ": " + summary);
}); }
} }
private String readClassPathFile(String s) throws IOException { private String readClassPathFile(String s) throws IOException {
return new String(Objects.requireNonNull(ClassLoader.getSystemResourceAsStream(s)).readAllBytes()); return new String(Objects.requireNonNull(ClassLoader.getSystemResourceAsStream(s)).readAllBytes());

View File

@ -2,7 +2,6 @@ package nu.marginalia.link_parser;
import com.google.common.base.CharMatcher; import com.google.common.base.CharMatcher;
import com.google.common.base.Strings; import com.google.common.base.Strings;
import lombok.SneakyThrows;
import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.EdgeUrl;
import nu.marginalia.util.QueryParams; import nu.marginalia.util.QueryParams;
import org.jetbrains.annotations.Contract; import org.jetbrains.annotations.Contract;
@ -122,14 +121,19 @@ public class LinkParser {
return Optional.ofNullable(matcher.group(1)); return Optional.ofNullable(matcher.group(1));
} }
@SneakyThrows
private URI renormalize(URI uri) { private URI renormalize(URI uri) {
if (uri.getPath() == null) { try {
return renormalize(new URI(uri.getScheme(), uri.getHost(), "/", uri.getQuery(), uri.getFragment())); if (uri.getPath() == null) {
return renormalize(new URI(uri.getScheme(), uri.getHost(), "/", uri.getQuery(), uri.getFragment()));
}
if (uri.getPath().startsWith("/../")) {
return renormalize(new URI(uri.getScheme(), uri.getHost(), uri.getPath().substring(3), uri.getQuery(), uri.getFragment()));
}
} }
if (uri.getPath().startsWith("/../")) { catch (URISyntaxException e) {
return renormalize(new URI(uri.getScheme(), uri.getHost(), uri.getPath().substring(3), uri.getQuery(), uri.getFragment())); logger.warn("Bad URI {}", uri);
} }
return uri; return uri;
} }
@ -146,7 +150,6 @@ public class LinkParser {
private static final Pattern spaceRegex = Pattern.compile(" "); private static final Pattern spaceRegex = Pattern.compile(" ");
private static final Pattern paramSeparatorPattern = Pattern.compile("\\?"); private static final Pattern paramSeparatorPattern = Pattern.compile("\\?");
@SneakyThrows
private String resolveRelativeUrl(EdgeUrl baseUrl, String s) { private String resolveRelativeUrl(EdgeUrl baseUrl, String s) {
// url looks like http://www.marginalia.nu/ // url looks like http://www.marginalia.nu/

View File

@ -5,7 +5,6 @@ import com.google.inject.Guice;
import com.google.inject.Inject; import com.google.inject.Inject;
import com.google.inject.Injector; import com.google.inject.Injector;
import com.zaxxer.hikari.HikariDataSource; import com.zaxxer.hikari.HikariDataSource;
import lombok.Builder;
import nu.marginalia.ProcessConfiguration; import nu.marginalia.ProcessConfiguration;
import nu.marginalia.ProcessConfigurationModule; import nu.marginalia.ProcessConfigurationModule;
import nu.marginalia.UserAgent; import nu.marginalia.UserAgent;
@ -475,7 +474,6 @@ public class CrawlerMain extends ProcessMainClass {
} }
} }
@Builder
public record CrawlSpecRecord(@NotNull String domain, int crawlDepth, @NotNull List<String> urls) { public record CrawlSpecRecord(@NotNull String domain, int crawlDepth, @NotNull List<String> urls) {
public CrawlSpecRecord(String domain, int crawlDepth) { public CrawlSpecRecord(String domain, int crawlDepth) {
@ -487,13 +485,48 @@ public class CrawlerMain extends ProcessMainClass {
// already fetched, and a growth factor that gets a bonus for small domains // already fetched, and a growth factor that gets a bonus for small domains
return new CrawlSpecRecord(domain, return new CrawlSpecRecord(domain,
(int) Math.clamp( (int) Math.clamp(
(visitedUrls * (visitedUrls < MID_URLS_PER_DOMAIN (visitedUrls * (visitedUrls < MID_URLS_PER_DOMAIN
? Math.max(2.5, URL_GROWTH_FACTOR) ? Math.max(2.5, URL_GROWTH_FACTOR)
: URL_GROWTH_FACTOR) : URL_GROWTH_FACTOR)
), ),
MIN_URLS_PER_DOMAIN, MIN_URLS_PER_DOMAIN,
MAX_URLS_PER_DOMAIN)); MAX_URLS_PER_DOMAIN));
} }
/** Returns a new, empty {@code CrawlSpecRecordBuilder}. */
public static CrawlSpecRecordBuilder builder() {
return new CrawlSpecRecordBuilder();
}
/**
 * Mutable fluent builder for {@code CrawlSpecRecord}.
 * <p>
 * {@code urls} starts out as an empty immutable list so that a record built
 * without calling {@link #urls(List)} still has a non-null URL list.
 * {@code domain} has no sensible default and stays {@code null} until
 * {@link #domain(String)} is called; {@code crawlDepth} defaults to 0.
 */
public static class CrawlSpecRecordBuilder {
    private @NotNull String domain;
    private int crawlDepth;
    // Default to an empty list rather than null, matching the @NotNull declaration.
    private @NotNull List<String> urls = List.of();

    CrawlSpecRecordBuilder() {
    }

    public CrawlSpecRecordBuilder domain(@NotNull String domain) {
        this.domain = domain;
        return this;
    }

    public CrawlSpecRecordBuilder crawlDepth(int crawlDepth) {
        this.crawlDepth = crawlDepth;
        return this;
    }

    public CrawlSpecRecordBuilder urls(@NotNull List<String> urls) {
        this.urls = urls;
        return this;
    }

    /**
     * Creates the record from the values collected so far.
     *
     * @return a new {@code CrawlSpecRecord}
     */
    public CrawlSpecRecord build() {
        return new CrawlSpecRecord(this.domain, this.crawlDepth, this.urls);
    }

    /** Dumps the builder's current state. */
    @Override
    public String toString() {
        return "CrawlerMain.CrawlSpecRecord.CrawlSpecRecordBuilder(domain=" + this.domain + ", crawlDepth=" + this.crawlDepth + ", urls=" + this.urls + ")";
    }
}
} }
} }

View File

@ -2,13 +2,12 @@ package nu.marginalia.crawl;
import com.google.gson.Gson; import com.google.gson.Gson;
import com.google.inject.AbstractModule; import com.google.inject.AbstractModule;
import lombok.SneakyThrows;
import nu.marginalia.UserAgent; import nu.marginalia.UserAgent;
import nu.marginalia.WmsaHome; import nu.marginalia.WmsaHome;
import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.model.gson.GsonFactory;
public class CrawlerModule extends AbstractModule { public class CrawlerModule extends AbstractModule {
@SneakyThrows
public void configure() { public void configure() {
bind(Gson.class).toInstance(createGson()); bind(Gson.class).toInstance(createGson());
bind(UserAgent.class).toInstance(WmsaHome.getUserAgent()); bind(UserAgent.class).toInstance(WmsaHome.getUserAgent());

View File

@ -27,7 +27,7 @@ public interface HttpFetcher {
HttpFetchResult fetchContent(EdgeUrl url, HttpFetchResult fetchContent(EdgeUrl url,
WarcRecorder recorder, WarcRecorder recorder,
ContentTags tags, ContentTags tags,
ProbeType probeType) throws HttpFetcherImpl.RateLimitException; ProbeType probeType) throws HttpFetcherImpl.RateLimitException, Exception;
SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder); SimpleRobotRules fetchRobotRules(EdgeDomain domain, WarcRecorder recorder);

View File

@ -3,7 +3,6 @@ package nu.marginalia.crawl.fetcher;
import com.google.inject.Inject; import com.google.inject.Inject;
import crawlercommons.robots.SimpleRobotRules; import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser; import crawlercommons.robots.SimpleRobotRulesParser;
import lombok.SneakyThrows;
import nu.marginalia.UserAgent; import nu.marginalia.UserAgent;
import nu.marginalia.crawl.fetcher.socket.FastTerminatingSocketFactory; import nu.marginalia.crawl.fetcher.socket.FastTerminatingSocketFactory;
import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor; import nu.marginalia.crawl.fetcher.socket.IpInterceptingNetworkInterceptor;
@ -50,7 +49,6 @@ public class HttpFetcherImpl implements HttpFetcher {
private static final FastTerminatingSocketFactory ftSocketFactory = new FastTerminatingSocketFactory(); private static final FastTerminatingSocketFactory ftSocketFactory = new FastTerminatingSocketFactory();
@SneakyThrows
private OkHttpClient createClient(Dispatcher dispatcher, ConnectionPool pool) { private OkHttpClient createClient(Dispatcher dispatcher, ConnectionPool pool) {
var builder = new OkHttpClient.Builder(); var builder = new OkHttpClient.Builder();
if (dispatcher != null) { if (dispatcher != null) {
@ -111,7 +109,6 @@ public class HttpFetcherImpl implements HttpFetcher {
* @return The result of the probe, indicating the state and the URL. * @return The result of the probe, indicating the state and the URL.
*/ */
@Override @Override
@SneakyThrows
public DomainProbeResult probeDomain(EdgeUrl url) { public DomainProbeResult probeDomain(EdgeUrl url) {
var head = new Request.Builder().head().addHeader("User-agent", userAgentString) var head = new Request.Builder().head().addHeader("User-agent", userAgentString)
.url(url.toString()) .url(url.toString())
@ -207,11 +204,11 @@ public class HttpFetcherImpl implements HttpFetcher {
* the outcome of the fetch. * the outcome of the fetch.
*/ */
@Override @Override
@SneakyThrows
public HttpFetchResult fetchContent(EdgeUrl url, public HttpFetchResult fetchContent(EdgeUrl url,
WarcRecorder warcRecorder, WarcRecorder warcRecorder,
ContentTags contentTags, ContentTags contentTags,
ProbeType probeType) ProbeType probeType)
throws Exception
{ {
var getBuilder = new Request.Builder().get(); var getBuilder = new Request.Builder().get();

View File

@ -1,7 +1,5 @@
package nu.marginalia.crawl.fetcher.socket; package nu.marginalia.crawl.fetcher.socket;
import lombok.SneakyThrows;
import javax.net.ssl.*; import javax.net.ssl.*;
import java.security.cert.X509Certificate; import java.security.cert.X509Certificate;
@ -29,20 +27,24 @@ public class NoSecuritySSL {
} }
}; };
@SneakyThrows
public static SSLSocketFactory buildSocketFactory() { public static SSLSocketFactory buildSocketFactory() {
// Install the all-trusting trust manager try {
final SSLContext sslContext = SSLContext.getInstance("TLS"); // Install the all-trusting trust manager
sslContext.init(null, trustAllCerts, new java.security.SecureRandom()); final SSLContext sslContext = SSLContext.getInstance("TLS");
sslContext.init(null, trustAllCerts, new java.security.SecureRandom());
var clientSessionContext = sslContext.getClientSessionContext(); var clientSessionContext = sslContext.getClientSessionContext();
// The default value for this is very high and will use a crapload of memory // The default value for this is very high and will use a crapload of memory
// since the crawler will be making a lot of requests to various hosts // since the crawler will be making a lot of requests to various hosts
clientSessionContext.setSessionCacheSize(2048); clientSessionContext.setSessionCacheSize(2048);
// Create a ssl socket factory with our all-trusting manager // Create a ssl socket factory with our all-trusting manager
return sslContext.getSocketFactory(); return sslContext.getSocketFactory();
}
catch (Exception e) {
throw new RuntimeException(e);
}
} }
public static HostnameVerifier buildHostnameVerifyer() { public static HostnameVerifier buildHostnameVerifyer() {

Some files were not shown because too many files have changed in this diff Show More