mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
Merge pull request #124 from MarginaliaSearch/jdk-23+delombok
Friendship with lombok over, now JDK 23 is my best friend
This commit is contained in:
commit
34258b92d1
@ -1,7 +1,6 @@
|
||||
plugins {
|
||||
id 'java'
|
||||
id("org.jetbrains.gradle.plugin.idea-ext") version "1.0"
|
||||
id "io.freefair.lombok" version "8.3"
|
||||
id "me.champeau.jmh" version "0.6.6"
|
||||
|
||||
// This is a workaround for a bug in the Jib plugin that causes it to stall randomly
|
||||
@ -44,8 +43,8 @@ subprojects.forEach {it ->
|
||||
}
|
||||
|
||||
ext {
|
||||
jvmVersion=22
|
||||
dockerImageBase='container-registry.oracle.com/graalvm/jdk:22'
|
||||
jvmVersion=23
|
||||
dockerImageBase='container-registry.oracle.com/graalvm/jdk:23'
|
||||
dockerImageTag='latest'
|
||||
dockerImageRegistry='marginalia'
|
||||
jibVersion = '3.4.3'
|
||||
|
@ -1,10 +1,7 @@
|
||||
package nu.marginalia;
|
||||
|
||||
import lombok.Builder;
|
||||
|
||||
import java.nio.file.Path;
|
||||
|
||||
@Builder
|
||||
public class LanguageModels {
|
||||
public final Path termFrequencies;
|
||||
|
||||
@ -30,4 +27,64 @@ public class LanguageModels {
|
||||
this.fasttextLanguageModel = fasttextLanguageModel;
|
||||
this.segments = segments;
|
||||
}
|
||||
|
||||
public static LanguageModelsBuilder builder() {
|
||||
return new LanguageModelsBuilder();
|
||||
}
|
||||
|
||||
public static class LanguageModelsBuilder {
|
||||
private Path termFrequencies;
|
||||
private Path openNLPSentenceDetectionData;
|
||||
private Path posRules;
|
||||
private Path posDict;
|
||||
private Path openNLPTokenData;
|
||||
private Path fasttextLanguageModel;
|
||||
private Path segments;
|
||||
|
||||
LanguageModelsBuilder() {
|
||||
}
|
||||
|
||||
public LanguageModelsBuilder termFrequencies(Path termFrequencies) {
|
||||
this.termFrequencies = termFrequencies;
|
||||
return this;
|
||||
}
|
||||
|
||||
public LanguageModelsBuilder openNLPSentenceDetectionData(Path openNLPSentenceDetectionData) {
|
||||
this.openNLPSentenceDetectionData = openNLPSentenceDetectionData;
|
||||
return this;
|
||||
}
|
||||
|
||||
public LanguageModelsBuilder posRules(Path posRules) {
|
||||
this.posRules = posRules;
|
||||
return this;
|
||||
}
|
||||
|
||||
public LanguageModelsBuilder posDict(Path posDict) {
|
||||
this.posDict = posDict;
|
||||
return this;
|
||||
}
|
||||
|
||||
public LanguageModelsBuilder openNLPTokenData(Path openNLPTokenData) {
|
||||
this.openNLPTokenData = openNLPTokenData;
|
||||
return this;
|
||||
}
|
||||
|
||||
public LanguageModelsBuilder fasttextLanguageModel(Path fasttextLanguageModel) {
|
||||
this.fasttextLanguageModel = fasttextLanguageModel;
|
||||
return this;
|
||||
}
|
||||
|
||||
public LanguageModelsBuilder segments(Path segments) {
|
||||
this.segments = segments;
|
||||
return this;
|
||||
}
|
||||
|
||||
public LanguageModels build() {
|
||||
return new LanguageModels(this.termFrequencies, this.openNLPSentenceDetectionData, this.posRules, this.posDict, this.openNLPTokenData, this.fasttextLanguageModel, this.segments);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "LanguageModels.LanguageModelsBuilder(termFrequencies=" + this.termFrequencies + ", openNLPSentenceDetectionData=" + this.openNLPSentenceDetectionData + ", posRules=" + this.posRules + ", posDict=" + this.posDict + ", openNLPTokenData=" + this.openNLPTokenData + ", fasttextLanguageModel=" + this.fasttextLanguageModel + ", segments=" + this.segments + ")";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -7,12 +7,13 @@ import com.google.common.util.concurrent.UncheckedExecutionException;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
|
||||
import java.sql.SQLException;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.Optional;
|
||||
import java.util.OptionalInt;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
|
||||
@Singleton
|
||||
public class DbDomainQueries {
|
||||
@ -27,7 +28,6 @@ public class DbDomainQueries {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public Integer getDomainId(EdgeDomain domain) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
@ -42,12 +42,14 @@ public class DbDomainQueries {
|
||||
throw new NoSuchElementException();
|
||||
});
|
||||
}
|
||||
catch (UncheckedExecutionException ex) {
|
||||
throw ex.getCause();
|
||||
catch (ExecutionException ex) {
|
||||
throw new RuntimeException(ex.getCause());
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public OptionalInt tryGetDomainId(EdgeDomain domain) {
|
||||
|
||||
Integer maybeId = domainIdCache.getIfPresent(domain);
|
||||
@ -70,11 +72,13 @@ public class DbDomainQueries {
|
||||
return OptionalInt.empty();
|
||||
}
|
||||
catch (UncheckedExecutionException ex) {
|
||||
return OptionalInt.empty();
|
||||
throw new RuntimeException(ex.getCause());
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public Optional<EdgeDomain> getDomain(int id) {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
@ -87,5 +91,11 @@ public class DbDomainQueries {
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
catch (UncheckedExecutionException ex) {
|
||||
throw new RuntimeException(ex.getCause());
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.db;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.With;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -115,23 +114,23 @@ public class DomainRankingSetsService {
|
||||
}
|
||||
}
|
||||
|
||||
/** Defines a domain ranking set, parameters for the ranking algorithms.
|
||||
/**
|
||||
* Defines a domain ranking set, parameters for the ranking algorithms.
|
||||
*
|
||||
* @param name Key and name of the set
|
||||
* @param name Key and name of the set
|
||||
* @param description Human-readable description
|
||||
* @param depth Depth of the algorithm
|
||||
* @param definition Definition of the set, typically a list of domains or globs for domain-names
|
||||
* */
|
||||
@With
|
||||
* @param depth Depth of the algorithm
|
||||
* @param definition Definition of the set, typically a list of domains or globs for domain-names
|
||||
*/
|
||||
public record DomainRankingSet(String name,
|
||||
String description,
|
||||
int depth,
|
||||
String definition)
|
||||
{
|
||||
String definition) {
|
||||
|
||||
public Path fileName(Path base) {
|
||||
return base.resolve(name().toLowerCase() + ".dat");
|
||||
}
|
||||
|
||||
public String[] domains() {
|
||||
return Arrays.stream(definition().split("\n+"))
|
||||
.map(String::trim)
|
||||
@ -144,5 +143,20 @@ public class DomainRankingSetsService {
|
||||
return name().equals("BLOGS") || name().equals("NONE") || name().equals("RANK");
|
||||
}
|
||||
|
||||
public DomainRankingSet withName(String name) {
|
||||
return this.name == name ? this : new DomainRankingSet(name, description, depth, definition);
|
||||
}
|
||||
|
||||
public DomainRankingSet withDescription(String description) {
|
||||
return this.description == description ? this : new DomainRankingSet(name, description, depth, definition);
|
||||
}
|
||||
|
||||
public DomainRankingSet withDepth(int depth) {
|
||||
return this.depth == depth ? this : new DomainRankingSet(name, description, depth, definition);
|
||||
}
|
||||
|
||||
public DomainRankingSet withDefinition(String definition) {
|
||||
return this.definition == definition ? this : new DomainRankingSet(name, description, depth, definition);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,15 +1,11 @@
|
||||
package nu.marginalia.model;
|
||||
|
||||
import lombok.*;
|
||||
|
||||
import javax.annotation.Nonnull;
|
||||
import java.io.Serializable;
|
||||
import java.util.Objects;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@AllArgsConstructor
|
||||
@Getter @Setter @Builder
|
||||
public class EdgeDomain implements Serializable {
|
||||
|
||||
@Nonnull
|
||||
@ -17,7 +13,6 @@ public class EdgeDomain implements Serializable {
|
||||
@Nonnull
|
||||
public final String topDomain;
|
||||
|
||||
@SneakyThrows
|
||||
public EdgeDomain(String host) {
|
||||
Objects.requireNonNull(host, "domain name must not be null");
|
||||
|
||||
@ -34,28 +29,23 @@ public class EdgeDomain implements Serializable {
|
||||
if (dot < 0 || looksLikeAnIp(host)) { // IPV6 >.>
|
||||
subDomain = "";
|
||||
topDomain = host;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
int dot2 = host.substring(0, dot).lastIndexOf('.');
|
||||
if (dot2 < 0) {
|
||||
subDomain = "";
|
||||
topDomain = host;
|
||||
}
|
||||
else {
|
||||
if (looksLikeGovTld(host))
|
||||
{ // Capture .ac.jp, .co.uk
|
||||
} else {
|
||||
if (looksLikeGovTld(host)) { // Capture .ac.jp, .co.uk
|
||||
int dot3 = host.substring(0, dot2).lastIndexOf('.');
|
||||
if (dot3 >= 0) {
|
||||
dot2 = dot3;
|
||||
subDomain = host.substring(0, dot2);
|
||||
topDomain = host.substring(dot2 + 1);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
subDomain = "";
|
||||
topDomain = host;
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
subDomain = host.substring(0, dot2);
|
||||
topDomain = host.substring(dot2 + 1);
|
||||
}
|
||||
@ -64,6 +54,12 @@ public class EdgeDomain implements Serializable {
|
||||
}
|
||||
|
||||
private static final Predicate<String> govListTest = Pattern.compile(".*\\.(id|ac|co|org|gov|edu|com)\\.[a-z]{2}").asMatchPredicate();
|
||||
|
||||
/**
 * Creates a domain from an already separated sub-domain and top-domain.
 * Both values are stored exactly as given; no parsing or validation happens here.
 *
 * @param subDomain the sub-domain part
 * @param topDomain the top-domain part
 */
public EdgeDomain(@Nonnull String subDomain, @Nonnull String topDomain) {
    this.topDomain = topDomain;
    this.subDomain = subDomain;
}
|
||||
|
||||
private boolean looksLikeGovTld(String host) {
|
||||
if (host.length() < 8)
|
||||
return false;
|
||||
@ -91,11 +87,11 @@ public class EdgeDomain implements Serializable {
|
||||
}
|
||||
|
||||
|
||||
|
||||
public EdgeUrl toRootUrlHttp() {
|
||||
// Set default protocol to http, as most https websites redirect http->https, but few http websites redirect https->http
|
||||
return new EdgeUrl("http", this, null, "/", null);
|
||||
}
|
||||
|
||||
public EdgeUrl toRootUrlHttps() {
|
||||
return new EdgeUrl("https", this, null, "/", null);
|
||||
}
|
||||
@ -125,8 +121,7 @@ public class EdgeDomain implements Serializable {
|
||||
int cutPoint = topDomain.indexOf('.');
|
||||
if (cutPoint < 0) {
|
||||
ret.append(topDomain);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
ret.append(topDomain, 0, cutPoint);
|
||||
}
|
||||
|
||||
@ -155,16 +150,14 @@ public class EdgeDomain implements Serializable {
|
||||
|
||||
if (govListTest.test(topDomain)) {
|
||||
dot = topDomain.indexOf('.', Math.max(0, length - ".edu.uk".length()));
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
dot = topDomain.lastIndexOf('.');
|
||||
}
|
||||
|
||||
|
||||
if (dot < 0 || dot == topDomain.length() - 1) {
|
||||
return "-";
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
return topDomain.substring(dot + 1);
|
||||
}
|
||||
}
|
||||
@ -174,10 +167,10 @@ public class EdgeDomain implements Serializable {
|
||||
if (!(o instanceof EdgeDomain other)) return false;
|
||||
final String this$subDomain = this.getSubDomain();
|
||||
final String other$subDomain = other.getSubDomain();
|
||||
if (!Objects.equals(this$subDomain,other$subDomain)) return false;
|
||||
if (!Objects.equals(this$subDomain, other$subDomain)) return false;
|
||||
final String this$domain = this.getTopDomain();
|
||||
final String other$domain = other.getTopDomain();
|
||||
if (!Objects.equals(this$domain,other$domain)) return false;
|
||||
if (!Objects.equals(this$domain, other$domain)) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -191,4 +184,13 @@ public class EdgeDomain implements Serializable {
|
||||
return result;
|
||||
}
|
||||
|
||||
@Nonnull
|
||||
public String getSubDomain() {
|
||||
return this.subDomain;
|
||||
}
|
||||
|
||||
@Nonnull
|
||||
public String getTopDomain() {
|
||||
return this.topDomain;
|
||||
}
|
||||
}
|
||||
|
@ -1,8 +1,5 @@
|
||||
package nu.marginalia.model;
|
||||
|
||||
import lombok.Builder;
|
||||
import lombok.Getter;
|
||||
import lombok.Setter;
|
||||
import nu.marginalia.util.QueryParams;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
@ -15,7 +12,6 @@ import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@Getter @Setter @Builder
|
||||
public class EdgeUrl implements Serializable {
|
||||
public final String proto;
|
||||
public final EdgeDomain domain;
|
||||
@ -38,9 +34,8 @@ public class EdgeUrl implements Serializable {
|
||||
private static URI parseURI(String url) throws URISyntaxException {
|
||||
try {
|
||||
return new URI(urlencodeFixer(url));
|
||||
}
|
||||
catch (URISyntaxException ex) {
|
||||
throw new URISyntaxException(STR."Failed to parse URI '\{url}'", ex.getMessage());
|
||||
} catch (URISyntaxException ex) {
|
||||
throw new URISyntaxException("Failed to parse URI '" + url + "'", ex.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
@ -83,20 +78,17 @@ public class EdgeUrl implements Serializable {
|
||||
for (int i = pathIdx; i < end; i++) {
|
||||
int c = url.charAt(i);
|
||||
|
||||
if (goodChars.indexOf(c) >= 0 || (c >= 'A' && c <='Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
|
||||
if (goodChars.indexOf(c) >= 0 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
|
||||
s.appendCodePoint(c);
|
||||
}
|
||||
else if (c == '%' && i+2<end) {
|
||||
int cn = url.charAt(i+1);
|
||||
int cnn = url.charAt(i+2);
|
||||
} else if (c == '%' && i + 2 < end) {
|
||||
int cn = url.charAt(i + 1);
|
||||
int cnn = url.charAt(i + 2);
|
||||
if (hexChars.indexOf(cn) >= 0 && hexChars.indexOf(cnn) >= 0) {
|
||||
s.appendCodePoint(c);
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
s.append("%25");
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
s.append(String.format("%%%02X", c));
|
||||
}
|
||||
}
|
||||
@ -109,7 +101,7 @@ public class EdgeUrl implements Serializable {
|
||||
if (colonIdx < 0 || colonIdx + 2 >= url.length()) {
|
||||
throw new URISyntaxException(url, "Lacking protocol");
|
||||
}
|
||||
return url.indexOf('/', colonIdx+2);
|
||||
return url.indexOf('/', colonIdx + 2);
|
||||
}
|
||||
|
||||
public EdgeUrl(URI URI) {
|
||||
@ -125,8 +117,7 @@ public class EdgeUrl implements Serializable {
|
||||
this.proto = URI.getScheme().toLowerCase();
|
||||
this.port = port(URI.getPort(), proto);
|
||||
this.param = QueryParams.queryParamsSanitizer(this.path, URI.getQuery());
|
||||
}
|
||||
catch (Exception ex) {
|
||||
} catch (Exception ex) {
|
||||
System.err.println("Failed to parse " + URI);
|
||||
throw ex;
|
||||
}
|
||||
@ -145,8 +136,7 @@ public class EdgeUrl implements Serializable {
|
||||
this.proto = URL.getProtocol().toLowerCase();
|
||||
this.port = port(URL.getPort(), proto);
|
||||
this.param = QueryParams.queryParamsSanitizer(this.path, URL.getQuery());
|
||||
}
|
||||
catch (Exception ex) {
|
||||
} catch (Exception ex) {
|
||||
System.err.println("Failed to parse " + URL);
|
||||
throw ex;
|
||||
}
|
||||
@ -158,8 +148,7 @@ public class EdgeUrl implements Serializable {
|
||||
}
|
||||
if (protocol.equals("http") && port == 80) {
|
||||
return null;
|
||||
}
|
||||
else if (protocol.equals("https") && port == 443) {
|
||||
} else if (protocol.equals("https") && port == 443) {
|
||||
return null;
|
||||
}
|
||||
return port;
|
||||
@ -190,12 +179,13 @@ public class EdgeUrl implements Serializable {
|
||||
public String dir() {
|
||||
return path.replaceAll("/[^/]+$", "/");
|
||||
}
|
||||
|
||||
public String fileName() {
|
||||
return path.replaceAll(".*/", "");
|
||||
}
|
||||
|
||||
public int depth() {
|
||||
return (int) path.chars().filter(c -> c=='/').count();
|
||||
return (int) path.chars().filter(c -> c == '/').count();
|
||||
}
|
||||
|
||||
public EdgeUrl withPathAndParam(String path, String param) {
|
||||
@ -207,8 +197,8 @@ public class EdgeUrl implements Serializable {
|
||||
if (other == this) return true;
|
||||
if (other instanceof EdgeUrl e) {
|
||||
return Objects.equals(e.domain, domain)
|
||||
&& Objects.equals(e.path, path)
|
||||
&& Objects.equals(e.param, param);
|
||||
&& Objects.equals(e.path, path)
|
||||
&& Objects.equals(e.param, param);
|
||||
}
|
||||
|
||||
return true;
|
||||
@ -235,8 +225,7 @@ public class EdgeUrl implements Serializable {
|
||||
public URL asURL() throws MalformedURLException {
|
||||
try {
|
||||
return asURI().toURL();
|
||||
}
|
||||
catch (URISyntaxException e) {
|
||||
} catch (URISyntaxException e) {
|
||||
throw new MalformedURLException(e.getMessage());
|
||||
}
|
||||
}
|
||||
@ -248,4 +237,9 @@ public class EdgeUrl implements Serializable {
|
||||
|
||||
return new URI(this.proto, this.domain.toString(), this.path, this.param, null);
|
||||
}
|
||||
|
||||
public EdgeDomain getDomain() {
|
||||
return this.domain;
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,8 +1,8 @@
|
||||
package nu.marginalia.process.log;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.Iterator;
|
||||
@ -21,32 +21,36 @@ class WorkLoadIterable<T> implements Iterable<T> {
|
||||
|
||||
@NotNull
|
||||
@Override
|
||||
@SneakyThrows
|
||||
public Iterator<T> iterator() {
|
||||
var stream = Files.lines(logFile);
|
||||
return new Iterator<>() {
|
||||
final Iterator<T> iter = stream
|
||||
.filter(WorkLogEntry::isJobId)
|
||||
.map(WorkLogEntry::parse)
|
||||
.map(mapper)
|
||||
.filter(Optional::isPresent)
|
||||
.map(Optional::get)
|
||||
.iterator();
|
||||
try {
|
||||
var stream = Files.lines(logFile);
|
||||
return new Iterator<>() {
|
||||
final Iterator<T> iter = stream
|
||||
.filter(WorkLogEntry::isJobId)
|
||||
.map(WorkLogEntry::parse)
|
||||
.map(mapper)
|
||||
.filter(Optional::isPresent)
|
||||
.map(Optional::get)
|
||||
.iterator();
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
if (iter.hasNext()) {
|
||||
return true;
|
||||
} else {
|
||||
stream.close();
|
||||
return false;
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
if (iter.hasNext()) {
|
||||
return true;
|
||||
} else {
|
||||
stream.close();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public T next() {
|
||||
return iter.next();
|
||||
}
|
||||
};
|
||||
@Override
|
||||
public T next() {
|
||||
return iter.next();
|
||||
}
|
||||
};
|
||||
}
|
||||
catch (IOException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -33,6 +33,6 @@ public record WorkLogEntry(String id, String ts, String path, int cnt) {
|
||||
|
||||
String relPath = fileName();
|
||||
|
||||
return STR."\{relPath.substring(0, 2)}/\{relPath.substring(2, 4)}/\{relPath}";
|
||||
return relPath.substring(0, 2) + "/" + relPath.substring(2, 4) + "/" + relPath;
|
||||
}
|
||||
}
|
||||
|
@ -4,12 +4,12 @@ import com.github.jknack.handlebars.*;
|
||||
import com.github.jknack.handlebars.helper.ConditionalHelpers;
|
||||
import com.github.jknack.handlebars.io.ClassPathTemplateLoader;
|
||||
import com.github.jknack.handlebars.io.TemplateLoader;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.renderer.config.HandlebarsConfigurator;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.*;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
@ -42,22 +42,35 @@ public class MustacheRenderer<T> {
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public String render(T model) {
|
||||
return template.apply(model);
|
||||
try {
|
||||
return template.apply(model);
|
||||
}
|
||||
catch (IOException ex) {
|
||||
throw new RuntimeException("Failed to render template", ex);
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public <T2> String render(T model, String name, List<T2> children) {
|
||||
Context ctx = Context.newBuilder(model).combine(name, children).build();
|
||||
|
||||
return template.apply(ctx);
|
||||
try {
|
||||
return template.apply(ctx);
|
||||
}
|
||||
catch (IOException ex) {
|
||||
throw new RuntimeException("Failed to render template", ex);
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public String render(T model, Map<String, ?> children) {
|
||||
Context ctx = Context.newBuilder(model).combine(children).build();
|
||||
return template.apply(ctx);
|
||||
|
||||
try {
|
||||
return template.apply(ctx);
|
||||
}
|
||||
catch (IOException ex) {
|
||||
throw new RuntimeException("Failed to render template", ex);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.service;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -26,7 +25,6 @@ public class NodeConfigurationWatcher {
|
||||
watcherThread.start();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private void pollConfiguration() {
|
||||
for (;;) {
|
||||
List<Integer> goodNodes = new ArrayList<>();
|
||||
@ -34,7 +32,7 @@ public class NodeConfigurationWatcher {
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT ID FROM NODE_CONFIGURATION
|
||||
WHERE ACCEPT_QUERIES AND NOT DISABLED
|
||||
WHERE ACCEPT_QUERIES AND NOT DISABLED
|
||||
""");
|
||||
var rs = stmt.executeQuery();
|
||||
while (rs.next()) {
|
||||
@ -47,7 +45,12 @@ public class NodeConfigurationWatcher {
|
||||
|
||||
queryNodes = goodNodes;
|
||||
|
||||
TimeUnit.SECONDS.sleep(10);
|
||||
try {
|
||||
TimeUnit.SECONDS.sleep(10);
|
||||
}
|
||||
catch (InterruptedException ex) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -4,13 +4,13 @@ import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import io.grpc.ManagedChannel;
|
||||
import io.grpc.ManagedChannelBuilder;
|
||||
import nu.marginalia.util.NamedExecutorFactory;
|
||||
import nu.marginalia.service.NodeConfigurationWatcher;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.discovery.property.PartitionTraits;
|
||||
import nu.marginalia.service.discovery.property.ServiceEndpoint.InstanceAddress;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import nu.marginalia.util.NamedExecutorFactory;
|
||||
|
||||
import java.util.concurrent.Executor;
|
||||
import java.util.function.Function;
|
||||
@ -48,7 +48,12 @@ public class GrpcChannelPoolFactory {
|
||||
public <STUB> GrpcSingleNodeChannelPool<STUB> createSingle(ServiceKey<? extends PartitionTraits.Unicast> key,
|
||||
Function<ManagedChannel, STUB> stubConstructor)
|
||||
{
|
||||
return new GrpcSingleNodeChannelPool<>(serviceRegistryIf, key, this::createChannel, stubConstructor);
|
||||
try {
|
||||
return new GrpcSingleNodeChannelPool<>(serviceRegistryIf, key, this::createChannel, stubConstructor);
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private ManagedChannel createChannel(InstanceAddress route) {
|
||||
|
@ -1,7 +1,6 @@
|
||||
package nu.marginalia.service.client;
|
||||
|
||||
import io.grpc.ManagedChannel;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.service.NodeConfigurationWatcher;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.discovery.property.PartitionTraits;
|
||||
@ -12,7 +11,10 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.concurrent.*;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.Executor;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.function.BiFunction;
|
||||
import java.util.function.Function;
|
||||
|
||||
@ -29,7 +31,6 @@ public class GrpcMultiNodeChannelPool<STUB> {
|
||||
private final Function<ManagedChannel, STUB> stubConstructor;
|
||||
private final NodeConfigurationWatcher nodeConfigurationWatcher;
|
||||
|
||||
@SneakyThrows
|
||||
public GrpcMultiNodeChannelPool(ServiceRegistryIf serviceRegistryIf,
|
||||
ServiceKey<ServicePartition.Multi> serviceKey,
|
||||
Function<ServiceEndpoint.InstanceAddress, ManagedChannel> channelConstructor,
|
||||
@ -52,11 +53,16 @@ public class GrpcMultiNodeChannelPool<STUB> {
|
||||
}
|
||||
|
||||
private GrpcSingleNodeChannelPool<STUB> newSingleChannelPool(int node) {
|
||||
return new GrpcSingleNodeChannelPool<>(
|
||||
serviceRegistryIf,
|
||||
serviceKey.forPartition(ServicePartition.partition(node)),
|
||||
channelConstructor,
|
||||
stubConstructor);
|
||||
try {
|
||||
return new GrpcSingleNodeChannelPool<>(
|
||||
serviceRegistryIf,
|
||||
serviceKey.forPartition(ServicePartition.partition(node)),
|
||||
channelConstructor,
|
||||
stubConstructor);
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/** Get the list of nodes that are eligible for broadcast-style requests */
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.service.client;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import io.grpc.ManagedChannel;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.discovery.monitor.ServiceChangeMonitor;
|
||||
import nu.marginalia.service.discovery.property.PartitionTraits;
|
||||
@ -14,7 +13,9 @@ import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.*;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.Executor;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
import java.util.function.BiFunction;
|
||||
import java.util.function.Function;
|
||||
@ -32,11 +33,12 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
|
||||
private final Function<ManagedChannel, STUB> stubConstructor;
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
public GrpcSingleNodeChannelPool(ServiceRegistryIf serviceRegistryIf,
|
||||
ServiceKey<? extends PartitionTraits.Unicast> serviceKey,
|
||||
Function<InstanceAddress, ManagedChannel> channelConstructor,
|
||||
Function<ManagedChannel, STUB> stubConstructor) {
|
||||
Function<ManagedChannel, STUB> stubConstructor)
|
||||
throws Exception
|
||||
{
|
||||
super(serviceKey);
|
||||
|
||||
this.serviceRegistryIf = serviceRegistryIf;
|
||||
@ -112,7 +114,7 @@ public class GrpcSingleNodeChannelPool<STUB> extends ServiceChangeMonitor {
|
||||
}
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error(STR."Failed to get channel for \{address}", e);
|
||||
logger.error("Failed to get channel for " + address, e);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
@ -4,6 +4,6 @@ import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
|
||||
public class ServiceNotAvailableException extends RuntimeException {
|
||||
public ServiceNotAvailableException(ServiceKey<?> key) {
|
||||
super(STR."Service \{key} not available");
|
||||
super("Service " + key + " not available");
|
||||
}
|
||||
}
|
||||
|
@ -2,11 +2,8 @@ package nu.marginalia.service.discovery;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.service.discovery.monitor.*;
|
||||
import nu.marginalia.service.discovery.monitor.ServiceMonitorIf;
|
||||
import nu.marginalia.service.discovery.property.ServiceEndpoint;
|
||||
import static nu.marginalia.service.discovery.property.ServiceEndpoint.*;
|
||||
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import org.apache.curator.framework.CuratorFramework;
|
||||
import org.apache.curator.utils.ZKPaths;
|
||||
@ -16,9 +13,14 @@ import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Random;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import static nu.marginalia.service.discovery.property.ServiceEndpoint.InstanceAddress;
|
||||
|
||||
/** A versatile service registry that uses ZooKeeper to store service endpoints.
|
||||
* It is used to register services and to look up the endpoints of other services.
|
||||
* <p></p>
|
||||
@ -37,18 +39,22 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
|
||||
private final List<String> livenessPaths = new ArrayList<>();
|
||||
|
||||
@Inject
|
||||
@SneakyThrows
|
||||
public ZkServiceRegistry(CuratorFramework curatorFramework) {
|
||||
this.curatorFramework = curatorFramework;
|
||||
try {
|
||||
this.curatorFramework = curatorFramework;
|
||||
|
||||
curatorFramework.start();
|
||||
if (!curatorFramework.blockUntilConnected(30, TimeUnit.SECONDS)) {
|
||||
throw new IllegalStateException("Failed to connect to zookeeper after 30s");
|
||||
curatorFramework.start();
|
||||
if (!curatorFramework.blockUntilConnected(30, TimeUnit.SECONDS)) {
|
||||
throw new IllegalStateException("Failed to connect to zookeeper after 30s");
|
||||
}
|
||||
|
||||
Runtime.getRuntime().addShutdownHook(
|
||||
new Thread(this::shutDown, "ZkServiceRegistry shutdown hook")
|
||||
);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException("Failed to start ZkServiceRegistry", ex);
|
||||
}
|
||||
|
||||
Runtime.getRuntime().addShutdownHook(
|
||||
new Thread(this::shutDown, "ZkServiceRegistry shutdown hook")
|
||||
);
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -59,8 +65,8 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
|
||||
{
|
||||
var endpoint = new ServiceEndpoint(externalAddress, requestPort(externalAddress, key));
|
||||
|
||||
String path = STR."\{key.toPath()}/\{instanceUUID.toString()}";
|
||||
byte[] payload = STR."\{endpoint.host()}:\{endpoint.port()}".getBytes(StandardCharsets.UTF_8);
|
||||
String path = key.toPath() + "/" + instanceUUID.toString();
|
||||
byte[] payload = (endpoint.host() + ":" + endpoint.port()).getBytes(StandardCharsets.UTF_8);
|
||||
|
||||
logger.info("Registering {} -> {}", path, endpoint);
|
||||
|
||||
@ -72,14 +78,18 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
|
||||
return endpoint;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public void declareFirstBoot() {
|
||||
if (!isFirstBoot()) {
|
||||
curatorFramework.create()
|
||||
.creatingParentsIfNeeded()
|
||||
.withMode(CreateMode.PERSISTENT)
|
||||
.forPath("/first-boot");
|
||||
try {
|
||||
curatorFramework.create()
|
||||
.creatingParentsIfNeeded()
|
||||
.withMode(CreateMode.PERSISTENT)
|
||||
.forPath("/first-boot");
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Failed to declare first-boot", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -109,7 +119,7 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
|
||||
@Override
|
||||
public void announceInstance(UUID instanceUUID) {
|
||||
try {
|
||||
String serviceRoot = STR."/running-instances/\{instanceUUID.toString()}";
|
||||
String serviceRoot = "/running-instances/" + instanceUUID.toString();
|
||||
|
||||
livenessPaths.add(serviceRoot);
|
||||
|
||||
@ -128,7 +138,7 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
|
||||
*/
|
||||
public boolean isInstanceRunning(UUID instanceUUID) {
|
||||
try {
|
||||
String serviceRoot = STR."/running-instances/\{instanceUUID.toString()}";
|
||||
String serviceRoot = "/running-instances/" + instanceUUID.toString();
|
||||
return null != curatorFramework.checkExists().forPath(serviceRoot);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
@ -165,11 +175,11 @@ public class ZkServiceRegistry implements ServiceRegistryIf {
|
||||
curatorFramework.create()
|
||||
.creatingParentsIfNeeded()
|
||||
.withMode(CreateMode.EPHEMERAL)
|
||||
.forPath(STR."/port-registry/\{externalHost}/\{port}", payload);
|
||||
.forPath("/port-registry/" + externalHost + "/" + port, payload);
|
||||
return port;
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error(STR."Still negotiating port for \{identifier}");
|
||||
logger.error("Still negotiating port for " + identifier);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -38,7 +38,7 @@ public sealed interface ServiceKey<P extends ServicePartition> {
|
||||
|
||||
record Rest(String name) implements ServiceKey<ServicePartition.None> {
|
||||
public String toPath() {
|
||||
return STR."/services/rest/\{name}";
|
||||
return "/services/rest/" + name;
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -51,10 +51,10 @@ public sealed interface ServiceKey<P extends ServicePartition> {
|
||||
}
|
||||
record Grpc<P extends ServicePartition>(String name, P partition) implements ServiceKey<P> {
|
||||
public String baseName() {
|
||||
return STR."/services/grpc/\{name}";
|
||||
return "/services/grpc/" + name;
|
||||
}
|
||||
public String toPath() {
|
||||
return STR."/services/grpc/\{name}/\{partition.identifier()}";
|
||||
return "/services/grpc/" + name + "/" + partition.identifier();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -5,14 +5,12 @@ import com.google.inject.Provides;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariConfig;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import org.flywaydb.core.Flyway;
|
||||
import org.mariadb.jdbc.Driver;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.sql.DataSource;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
@ -71,14 +69,12 @@ public class DatabaseModule extends AbstractModule {
|
||||
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Singleton
|
||||
@Provides
|
||||
public HikariDataSource provideConnection() {
|
||||
return getMariaDB();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private HikariDataSource getMariaDB() {
|
||||
var connStr = System.getProperty("db.overrideJdbc", dbProperties.getProperty(DB_CONN_KEY));
|
||||
|
||||
|
@ -1,7 +1,6 @@
|
||||
package nu.marginalia.service.server;
|
||||
|
||||
import com.google.inject.Singleton;
|
||||
import lombok.SneakyThrows;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -59,13 +58,17 @@ public class Initialization {
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public boolean waitReady() {
|
||||
synchronized (this) {
|
||||
while (!initialized) {
|
||||
wait();
|
||||
try {
|
||||
synchronized (this) {
|
||||
while (!initialized) {
|
||||
wait();
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
catch (InterruptedException ex) {
|
||||
throw new RuntimeException("Interrupted while waiting for initialization", ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.service.server;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import io.prometheus.client.exporter.MetricsServlet;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import org.eclipse.jetty.server.Server;
|
||||
import org.eclipse.jetty.servlet.ServletContextHandler;
|
||||
@ -10,9 +9,8 @@ import org.eclipse.jetty.servlet.ServletHolder;
|
||||
|
||||
public class MetricsServer {
|
||||
|
||||
@SneakyThrows
|
||||
@Inject
|
||||
public MetricsServer(ServiceConfiguration configuration) {
|
||||
public MetricsServer(ServiceConfiguration configuration) throws Exception {
|
||||
// If less than zero, we forego setting up a metrics server
|
||||
if (configuration.metricsPort() < 0)
|
||||
return;
|
||||
|
@ -1,8 +1,7 @@
|
||||
package nu.marginalia.service.server;
|
||||
|
||||
import com.google.inject.name.Named;
|
||||
import com.google.inject.Inject;
|
||||
import lombok.SneakyThrows;
|
||||
import com.google.inject.name.Named;
|
||||
import nu.marginalia.mq.persistence.MqPersistence;
|
||||
import nu.marginalia.nodecfg.NodeConfigurationService;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
@ -81,10 +80,14 @@ public class NodeStatusWatcher {
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private boolean isConfigured() {
|
||||
var configuration = configurationService.get(nodeId);
|
||||
return configuration != null;
|
||||
try {
|
||||
var configuration = configurationService.get(nodeId);
|
||||
return configuration != null;
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
/** Look for changes in the configuration and kill the service if the corresponding
|
||||
|
@ -1,7 +1,6 @@
|
||||
package nu.marginalia.service.server;
|
||||
|
||||
import io.prometheus.client.Counter;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.mq.inbox.MqInboxIf;
|
||||
import nu.marginalia.service.client.ServiceNotAvailableException;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
@ -44,11 +43,10 @@ public class Service {
|
||||
private final int node;
|
||||
private GrpcServer grpcServer;
|
||||
|
||||
@SneakyThrows
|
||||
public Service(BaseServiceParams params,
|
||||
Runnable configureStaticFiles,
|
||||
ServicePartition partition,
|
||||
List<DiscoverableService> grpcServices) {
|
||||
List<DiscoverableService> grpcServices) throws Exception {
|
||||
|
||||
this.initialization = params.initialization;
|
||||
var config = params.configuration;
|
||||
@ -130,14 +128,14 @@ public class Service {
|
||||
|
||||
public Service(BaseServiceParams params,
|
||||
ServicePartition partition,
|
||||
List<DiscoverableService> grpcServices) {
|
||||
List<DiscoverableService> grpcServices) throws Exception {
|
||||
this(params,
|
||||
Service::defaultSparkConfig,
|
||||
partition,
|
||||
grpcServices);
|
||||
}
|
||||
|
||||
public Service(BaseServiceParams params) {
|
||||
public Service(BaseServiceParams params) throws Exception {
|
||||
this(params,
|
||||
Service::defaultSparkConfig,
|
||||
ServicePartition.any(),
|
||||
|
@ -1,20 +1,18 @@
|
||||
package nu.marginalia.service.server;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
import spark.Spark;
|
||||
import spark.resource.ClassPathResource;
|
||||
import spark.staticfiles.MimeType;
|
||||
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.time.LocalDateTime;
|
||||
import java.time.ZoneOffset;
|
||||
|
||||
public class StaticResources {
|
||||
private final long startTime = LocalDateTime.now().toEpochSecond(ZoneOffset.UTC);
|
||||
|
||||
@SneakyThrows
|
||||
public void serveStatic(String domain, String path, Request req, Response rsp) {
|
||||
try {
|
||||
if (path.startsWith("..") || domain.startsWith("..")) {
|
||||
@ -28,7 +26,7 @@ public class StaticResources {
|
||||
|
||||
resource.getInputStream().transferTo(rsp.raw().getOutputStream());
|
||||
}
|
||||
catch (IllegalArgumentException | FileNotFoundException ex) {
|
||||
catch (IllegalArgumentException | IOException ex) {
|
||||
Spark.halt(404);
|
||||
}
|
||||
}
|
||||
@ -57,7 +55,6 @@ public class StaticResources {
|
||||
return "application/octet-stream";
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private void handleEtagStatic(ClassPathResource resource, Request req, Response rsp) {
|
||||
rsp.header("Cache-Control", "public,max-age=3600");
|
||||
rsp.type(MimeType.fromResource(resource));
|
||||
|
@ -24,7 +24,7 @@ public class NamedExecutorFactory {
|
||||
|
||||
@Override
|
||||
public Thread newThread(@NotNull Runnable r) {
|
||||
var thread = new Thread(r, STR."\{name}[\{threadNumber.getAndIncrement()}]");
|
||||
var thread = new Thread(r, name + "[" + threadNumber.getAndIncrement() + "]");
|
||||
thread.setDaemon(true);
|
||||
return thread;
|
||||
}
|
||||
|
@ -1,9 +1,9 @@
|
||||
package nu.marginalia.service.discovery;
|
||||
|
||||
import nu.marginalia.service.ServiceId;
|
||||
import nu.marginalia.service.discovery.monitor.ServiceMonitorIf;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import nu.marginalia.service.ServiceId;
|
||||
import nu.marginalia.test.TestApiGrpc;
|
||||
import org.apache.curator.framework.CuratorFrameworkFactory;
|
||||
import org.apache.curator.retry.ExponentialBackoffRetry;
|
||||
@ -33,7 +33,7 @@ class ZkServiceRegistryTest {
|
||||
@BeforeEach
|
||||
public void setUp() {
|
||||
zookeeper.start();
|
||||
connectString = STR."\{zookeeper.getHost()}:\{zookeeper.getMappedPort(ZOOKEEPER_PORT)}";
|
||||
connectString = zookeeper.getHost() + ":" + zookeeper.getMappedPort(ZOOKEEPER_PORT);
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.extractor;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import gnu.trove.set.hash.TLongHashSet;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.hash.MurmurHash3_128;
|
||||
import nu.marginalia.io.CrawledDomainReader;
|
||||
import nu.marginalia.io.SerializableCrawlDataStream;
|
||||
@ -101,10 +100,14 @@ public class AtagExporter implements ExporterIf {
|
||||
continue;
|
||||
}
|
||||
|
||||
var linkOpt = linkParser.parseLinkPermissive(baseUrl, atag);
|
||||
linkOpt
|
||||
.filter(url -> linkFilter.isEligible(url, baseUrl, linkText))
|
||||
.ifPresent(url -> exporter.accept(url, baseUrl.domain, linkText));
|
||||
var linkOpt = linkParser
|
||||
.parseLinkPermissive(baseUrl, atag)
|
||||
.filter(url -> linkFilter.isEligible(url, baseUrl, linkText));
|
||||
|
||||
if (linkOpt.isPresent()) {
|
||||
var url = linkOpt.get();
|
||||
exporter.accept(url, baseUrl.domain, linkText);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -167,8 +170,7 @@ public class AtagExporter implements ExporterIf {
|
||||
this.writer = writer;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void accept(EdgeUrl url, EdgeDomain sourceDomain, String linkText) {
|
||||
public void accept(EdgeUrl url, EdgeDomain sourceDomain, String linkText) throws IOException {
|
||||
final String urlString = urlWithNoSchema(url);
|
||||
|
||||
writer.write(String.format("\"%s\",\"%s\",\"%s\"\n",
|
||||
|
@ -1,7 +1,6 @@
|
||||
package nu.marginalia.extractor;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.io.CrawledDomainReader;
|
||||
import nu.marginalia.io.SerializableCrawlDataStream;
|
||||
import nu.marginalia.link_parser.FeedExtractor;
|
||||
@ -115,12 +114,16 @@ public class FeedExporter implements ExporterIf {
|
||||
this.writer = writer;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void accept(EdgeDomain domain, int size, EdgeUrl path) {
|
||||
writer.write(String.format("\"%s\",\"%s\",\"%s\"\n",
|
||||
csvify(domain),
|
||||
csvify(size),
|
||||
csvify(path)));
|
||||
try {
|
||||
writer.write(String.format("\"%s\",\"%s\",\"%s\"\n",
|
||||
csvify(domain),
|
||||
csvify(size),
|
||||
csvify(path)));
|
||||
}
|
||||
catch (IOException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
private static String csvify(Object field) {
|
||||
|
@ -6,6 +6,8 @@ import nu.marginalia.process.log.WorkLogEntry;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorage;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
|
||||
import org.apache.commons.compress.utils.IOUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
@ -18,9 +20,6 @@ import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
|
||||
import org.apache.commons.compress.utils.IOUtils;
|
||||
|
||||
public class SampleDataExporter {
|
||||
private final FileStorageService storageService;
|
||||
|
||||
@ -57,16 +56,13 @@ public class SampleDataExporter {
|
||||
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
|
||||
try (var bw = Files.newBufferedWriter(newCrawlerLogFile, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING)) {
|
||||
for (var item : entriesAll) {
|
||||
bw.write(STR."\{item.id()} \{item.ts()} \{item.relPath()} \{item.cnt()}\n");
|
||||
bw.write(item.id() + " " + item.ts() + " " + item.relPath() + " " + item.cnt() + "\n");
|
||||
}
|
||||
}
|
||||
|
||||
Path newManifestJsonFile = Files.createTempFile(destStorage.asPath(), "manifest", ".json",
|
||||
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
|
||||
Files.writeString(newManifestJsonFile, STR."""
|
||||
{ "description": "\{name.replace("[\"\\]", "_")}",
|
||||
"type": "CRAWL_DATA" }
|
||||
""");
|
||||
Files.writeString(newManifestJsonFile, " { \"description\": \"" + name.replace("[\"\\]", "_") + "\",\n \"type\": \"CRAWL_DATA\" }\n");
|
||||
|
||||
var tmpTarFile = Files.createTempFile(destStorage.asPath(), "data", ".tar",
|
||||
PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.actor;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.actor.monitor.FileStorageMonitorActor;
|
||||
import nu.marginalia.actor.proc.*;
|
||||
import nu.marginalia.actor.prototype.ActorPrototype;
|
||||
@ -13,6 +12,8 @@ import nu.marginalia.actor.task.*;
|
||||
import nu.marginalia.mq.MessageQueueFactory;
|
||||
import nu.marginalia.service.control.ServiceEventLog;
|
||||
import nu.marginalia.service.server.BaseServiceParams;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
@ -27,6 +28,8 @@ public class ExecutorActorControlService {
|
||||
public Map<ExecutorActor, ActorPrototype> actorDefinitions = new HashMap<>();
|
||||
private final int node;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@Inject
|
||||
public ExecutorActorControlService(MessageQueueFactory messageQueueFactory,
|
||||
BaseServiceParams baseServiceParams,
|
||||
@ -119,11 +122,15 @@ public class ExecutorActorControlService {
|
||||
stateMachines.startFromJSON(process, state, json);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void stop(ExecutorActor process) {
|
||||
eventLog.logEvent("FSM-STOP", process.id());
|
||||
|
||||
stateMachines.stop(process);
|
||||
try {
|
||||
stateMachines.stop(process);
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Failed to stop FSM", e);
|
||||
}
|
||||
}
|
||||
|
||||
public Map<ExecutorActor, ActorStateInstance> getActorStates() {
|
||||
|
@ -8,6 +8,9 @@ import nu.marginalia.actor.state.ActorResumeBehavior;
|
||||
import nu.marginalia.actor.state.ActorStep;
|
||||
import nu.marginalia.actor.state.Resume;
|
||||
import nu.marginalia.encyclopedia.EncyclopediaConverter;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mqapi.converting.ConvertRequest;
|
||||
import nu.marginalia.process.ProcessOutboxes;
|
||||
import nu.marginalia.process.ProcessService;
|
||||
import nu.marginalia.sideload.RedditSideloadHelper;
|
||||
@ -17,9 +20,6 @@ import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.storage.model.FileStorageId;
|
||||
import nu.marginalia.storage.model.FileStorageState;
|
||||
import nu.marginalia.storage.model.FileStorageType;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.mqapi.converting.ConvertRequest;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -140,7 +140,7 @@ public class ConvertActor extends RecordActorPrototype {
|
||||
// To avoid re-converting the same file, we'll assign the file a name based on its hash
|
||||
// and the original filename. This way, if we're fed the same file again, we'll be able to just
|
||||
// re-use the predigested database file.
|
||||
yield new PredigestEncyclopedia(source, STR."\{source}.\{hash}.db", baseUrl);
|
||||
yield new PredigestEncyclopedia(source, source + "." + hash + ".db", baseUrl);
|
||||
} else if (!source.endsWith(".db")) {
|
||||
yield new Error("Source path must be a ZIM or pre-digested sqlite database file (.db)");
|
||||
}
|
||||
|
@ -3,9 +3,6 @@ package nu.marginalia.actor.task;
|
||||
import com.google.gson.Gson;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.With;
|
||||
import nu.marginalia.IndexLocations;
|
||||
import nu.marginalia.actor.prototype.RecordActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorResumeBehavior;
|
||||
@ -40,7 +37,6 @@ import java.util.List;
|
||||
public class ConvertAndLoadActor extends RecordActorPrototype {
|
||||
|
||||
// STATES
|
||||
|
||||
public static final String RERANK = "RERANK";
|
||||
private final ActorProcessWatcher processWatcher;
|
||||
private final MqOutbox mqConverterOutbox;
|
||||
@ -54,15 +50,6 @@ public class ConvertAndLoadActor extends RecordActorPrototype {
|
||||
private final int nodeId;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
|
||||
@AllArgsConstructor @With @NoArgsConstructor
|
||||
public static class Message {
|
||||
public FileStorageId crawlStorageId = null;
|
||||
public List<FileStorageId> processedStorageId = null;
|
||||
public long converterMsgId = 0L;
|
||||
public long loaderMsgId = 0L;
|
||||
}
|
||||
|
||||
public record Initial(FileStorageId fid) implements ActorStep {}
|
||||
|
||||
@Resume(behavior = ActorResumeBehavior.RETRY)
|
||||
|
@ -29,7 +29,7 @@ public class ExportSampleDataActor extends RecordActorPrototype {
|
||||
case Export(FileStorageId crawlId, int size, String name) -> {
|
||||
var storage = storageService.allocateStorage(FileStorageType.EXPORT,
|
||||
"crawl-sample-export",
|
||||
STR."Crawl Data Sample \{name}/\{size} \{LocalDateTime.now()}"
|
||||
"Crawl Data Sample " + name + "/" + size + " " + LocalDateTime.now()
|
||||
);
|
||||
|
||||
if (storage == null) yield new Error("Bad storage id");
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.execution;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import io.grpc.stub.StreamObserver;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.actor.ActorApi;
|
||||
import nu.marginalia.actor.ExecutorActor;
|
||||
@ -228,13 +227,17 @@ public class ExecutorGrpcService
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private RpcFileStorageEntry createFileModel(Path path) {
|
||||
return RpcFileStorageEntry.newBuilder()
|
||||
.setName(path.toFile().getName())
|
||||
.setSize(Files.size(path))
|
||||
.setLastModifiedTime(Files.getLastModifiedTime(path).toInstant().toString())
|
||||
.build();
|
||||
try {
|
||||
return RpcFileStorageEntry.newBuilder()
|
||||
.setName(path.toFile().getName())
|
||||
.setSize(Files.size(path))
|
||||
.setLastModifiedTime(Files.getLastModifiedTime(path).toInstant().toString())
|
||||
.build();
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -9,7 +9,8 @@ import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.StandardCopyOption;
|
||||
import java.util.*;
|
||||
import java.util.Set;
|
||||
import java.util.TreeSet;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.ForkJoinPool;
|
||||
import java.util.stream.Collectors;
|
||||
@ -97,7 +98,7 @@ public class RedditSideloadHelper {
|
||||
|
||||
private static Path getRedditDbPath(RedditFilePair pair) throws IOException {
|
||||
String hash = SideloadHelper.getCrc32FileHash(pair.commentsPath());
|
||||
return pair.rootDir().resolve(STR."\{pair.fileNameBase}.\{hash}.db");
|
||||
return pair.rootDir().resolve(pair.fileNameBase + "." + hash + ".db");
|
||||
}
|
||||
|
||||
}
|
@ -83,7 +83,7 @@ public class StackExchangeSideloadHelper {
|
||||
String fileName = sourcePath.toFile().getName();
|
||||
String hash = SideloadHelper.getCrc32FileHash(sourcePath);
|
||||
|
||||
return sourcePath.getParent().resolve(STR."\{fileName}.\{hash}.db");
|
||||
return sourcePath.getParent().resolve(fileName + "." + hash + ".db");
|
||||
}
|
||||
|
||||
private static Optional<String> getStackexchangeDomainFromFilename(String fileName) {
|
||||
|
@ -3,7 +3,6 @@ package nu.marginalia.screenshot;
|
||||
import com.google.common.base.Strings;
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.slf4j.Logger;
|
||||
@ -11,6 +10,7 @@ import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
|
||||
import static java.lang.Integer.parseInt;
|
||||
@ -48,7 +48,6 @@ public class ScreenshotService {
|
||||
return false;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public Object serveScreenshotRequest(Request request, Response response) {
|
||||
if (Strings.isNullOrEmpty(request.params("id"))) {
|
||||
response.redirect("https://search.marginalia.nu/");
|
||||
@ -75,6 +74,9 @@ public class ScreenshotService {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
catch (IOException ex) {
|
||||
logger.warn("IO error", ex);
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.warn("SQL error", ex);
|
||||
}
|
||||
|
@ -1,9 +1,9 @@
|
||||
package nu.marginalia.api.domains;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.api.domains.model.DomainInformation;
|
||||
import nu.marginalia.api.domains.model.SimilarDomain;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.api.domains.model.*;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
@ -28,18 +28,22 @@ public class DomainsProtobufCodec {
|
||||
return ret;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private static SimilarDomain convertResponseEntry(RpcSimilarDomain sd) {
|
||||
return new SimilarDomain(
|
||||
new EdgeUrl(sd.getUrl()),
|
||||
sd.getDomainId(),
|
||||
sd.getRelatedness(),
|
||||
sd.getRank(),
|
||||
sd.getIndexed(),
|
||||
sd.getActive(),
|
||||
sd.getScreenshot(),
|
||||
SimilarDomain.LinkType.valueOf(sd.getLinkType().name())
|
||||
);
|
||||
try {
|
||||
return new SimilarDomain(
|
||||
new EdgeUrl(sd.getUrl()),
|
||||
sd.getDomainId(),
|
||||
sd.getRelatedness(),
|
||||
sd.getRank(),
|
||||
sd.getIndexed(),
|
||||
sd.getActive(),
|
||||
sd.getScreenshot(),
|
||||
SimilarDomain.LinkType.valueOf(sd.getLinkType().name())
|
||||
);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,10 +1,7 @@
|
||||
package nu.marginalia.api.domains.model;
|
||||
|
||||
import lombok.*;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
|
||||
@Getter @AllArgsConstructor @NoArgsConstructor @Builder
|
||||
@ToString
|
||||
public class DomainInformation {
|
||||
EdgeDomain domain;
|
||||
|
||||
@ -29,6 +26,34 @@ public class DomainInformation {
|
||||
String ipCountry;
|
||||
String state;
|
||||
|
||||
public DomainInformation(EdgeDomain domain, boolean blacklisted, int pagesKnown, int pagesFetched, int pagesIndexed, int incomingLinks, int outboundLinks, int nodeAffinity, double ranking, boolean suggestForCrawling, boolean inCrawlQueue, boolean unknownDomain, String ip, Integer asn, String asnOrg, String asnCountry, String ipCountry, String state) {
|
||||
this.domain = domain;
|
||||
this.blacklisted = blacklisted;
|
||||
this.pagesKnown = pagesKnown;
|
||||
this.pagesFetched = pagesFetched;
|
||||
this.pagesIndexed = pagesIndexed;
|
||||
this.incomingLinks = incomingLinks;
|
||||
this.outboundLinks = outboundLinks;
|
||||
this.nodeAffinity = nodeAffinity;
|
||||
this.ranking = ranking;
|
||||
this.suggestForCrawling = suggestForCrawling;
|
||||
this.inCrawlQueue = inCrawlQueue;
|
||||
this.unknownDomain = unknownDomain;
|
||||
this.ip = ip;
|
||||
this.asn = asn;
|
||||
this.asnOrg = asnOrg;
|
||||
this.asnCountry = asnCountry;
|
||||
this.ipCountry = ipCountry;
|
||||
this.state = state;
|
||||
}
|
||||
|
||||
public DomainInformation() {
|
||||
}
|
||||
|
||||
public static DomainInformationBuilder builder() {
|
||||
return new DomainInformationBuilder();
|
||||
}
|
||||
|
||||
public String getIpFlag() {
|
||||
if (ipCountry == null || ipCountry.codePointCount(0, ipCountry.length()) != 2) {
|
||||
return "";
|
||||
@ -45,4 +70,202 @@ public class DomainInformation {
|
||||
int secondChar = Character.codePointAt(country, 1) - asciiOffset + offset;
|
||||
return new String(Character.toChars(firstChar)) + new String(Character.toChars(secondChar));
|
||||
}
|
||||
|
||||
public EdgeDomain getDomain() {
|
||||
return this.domain;
|
||||
}
|
||||
|
||||
public boolean isBlacklisted() {
|
||||
return this.blacklisted;
|
||||
}
|
||||
|
||||
public int getPagesKnown() {
|
||||
return this.pagesKnown;
|
||||
}
|
||||
|
||||
public int getPagesFetched() {
|
||||
return this.pagesFetched;
|
||||
}
|
||||
|
||||
public int getPagesIndexed() {
|
||||
return this.pagesIndexed;
|
||||
}
|
||||
|
||||
public int getIncomingLinks() {
|
||||
return this.incomingLinks;
|
||||
}
|
||||
|
||||
public int getOutboundLinks() {
|
||||
return this.outboundLinks;
|
||||
}
|
||||
|
||||
public int getNodeAffinity() {
|
||||
return this.nodeAffinity;
|
||||
}
|
||||
|
||||
public double getRanking() {
|
||||
return this.ranking;
|
||||
}
|
||||
|
||||
public boolean isSuggestForCrawling() {
|
||||
return this.suggestForCrawling;
|
||||
}
|
||||
|
||||
public boolean isInCrawlQueue() {
|
||||
return this.inCrawlQueue;
|
||||
}
|
||||
|
||||
public boolean isUnknownDomain() {
|
||||
return this.unknownDomain;
|
||||
}
|
||||
|
||||
public String getIp() {
|
||||
return this.ip;
|
||||
}
|
||||
|
||||
public Integer getAsn() {
|
||||
return this.asn;
|
||||
}
|
||||
|
||||
public String getAsnOrg() {
|
||||
return this.asnOrg;
|
||||
}
|
||||
|
||||
public String getAsnCountry() {
|
||||
return this.asnCountry;
|
||||
}
|
||||
|
||||
public String getIpCountry() {
|
||||
return this.ipCountry;
|
||||
}
|
||||
|
||||
public String getState() {
|
||||
return this.state;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "DomainInformation(domain=" + this.getDomain() + ", blacklisted=" + this.isBlacklisted() + ", pagesKnown=" + this.getPagesKnown() + ", pagesFetched=" + this.getPagesFetched() + ", pagesIndexed=" + this.getPagesIndexed() + ", incomingLinks=" + this.getIncomingLinks() + ", outboundLinks=" + this.getOutboundLinks() + ", nodeAffinity=" + this.getNodeAffinity() + ", ranking=" + this.getRanking() + ", suggestForCrawling=" + this.isSuggestForCrawling() + ", inCrawlQueue=" + this.isInCrawlQueue() + ", unknownDomain=" + this.isUnknownDomain() + ", ip=" + this.getIp() + ", asn=" + this.getAsn() + ", asnOrg=" + this.getAsnOrg() + ", asnCountry=" + this.getAsnCountry() + ", ipCountry=" + this.getIpCountry() + ", state=" + this.getState() + ")";
|
||||
}
|
||||
|
||||
public static class DomainInformationBuilder {
|
||||
private EdgeDomain domain;
|
||||
private boolean blacklisted;
|
||||
private int pagesKnown;
|
||||
private int pagesFetched;
|
||||
private int pagesIndexed;
|
||||
private int incomingLinks;
|
||||
private int outboundLinks;
|
||||
private int nodeAffinity;
|
||||
private double ranking;
|
||||
private boolean suggestForCrawling;
|
||||
private boolean inCrawlQueue;
|
||||
private boolean unknownDomain;
|
||||
private String ip;
|
||||
private Integer asn;
|
||||
private String asnOrg;
|
||||
private String asnCountry;
|
||||
private String ipCountry;
|
||||
private String state;
|
||||
|
||||
DomainInformationBuilder() {
|
||||
}
|
||||
|
||||
public DomainInformationBuilder domain(EdgeDomain domain) {
|
||||
this.domain = domain;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder blacklisted(boolean blacklisted) {
|
||||
this.blacklisted = blacklisted;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder pagesKnown(int pagesKnown) {
|
||||
this.pagesKnown = pagesKnown;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder pagesFetched(int pagesFetched) {
|
||||
this.pagesFetched = pagesFetched;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder pagesIndexed(int pagesIndexed) {
|
||||
this.pagesIndexed = pagesIndexed;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder incomingLinks(int incomingLinks) {
|
||||
this.incomingLinks = incomingLinks;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder outboundLinks(int outboundLinks) {
|
||||
this.outboundLinks = outboundLinks;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder nodeAffinity(int nodeAffinity) {
|
||||
this.nodeAffinity = nodeAffinity;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder ranking(double ranking) {
|
||||
this.ranking = ranking;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder suggestForCrawling(boolean suggestForCrawling) {
|
||||
this.suggestForCrawling = suggestForCrawling;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder inCrawlQueue(boolean inCrawlQueue) {
|
||||
this.inCrawlQueue = inCrawlQueue;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder unknownDomain(boolean unknownDomain) {
|
||||
this.unknownDomain = unknownDomain;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder ip(String ip) {
|
||||
this.ip = ip;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder asn(Integer asn) {
|
||||
this.asn = asn;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder asnOrg(String asnOrg) {
|
||||
this.asnOrg = asnOrg;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder asnCountry(String asnCountry) {
|
||||
this.asnCountry = asnCountry;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder ipCountry(String ipCountry) {
|
||||
this.ipCountry = ipCountry;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformationBuilder state(String state) {
|
||||
this.state = state;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DomainInformation build() {
|
||||
return new DomainInformation(this.domain, this.blacklisted, this.pagesKnown, this.pagesFetched, this.pagesIndexed, this.incomingLinks, this.outboundLinks, this.nodeAffinity, this.ranking, this.suggestForCrawling, this.inCrawlQueue, this.unknownDomain, this.ip, this.asn, this.asnOrg, this.asnCountry, this.ipCountry, this.state);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "DomainInformation.DomainInformationBuilder(domain=" + this.domain + ", blacklisted=" + this.blacklisted + ", pagesKnown=" + this.pagesKnown + ", pagesFetched=" + this.pagesFetched + ", pagesIndexed=" + this.pagesIndexed + ", incomingLinks=" + this.incomingLinks + ", outboundLinks=" + this.outboundLinks + ", nodeAffinity=" + this.nodeAffinity + ", ranking=" + this.ranking + ", suggestForCrawling=" + this.suggestForCrawling + ", inCrawlQueue=" + this.inCrawlQueue + ", unknownDomain=" + this.unknownDomain + ", ip=" + this.ip + ", asn=" + this.asn + ", asnOrg=" + this.asnOrg + ", asnCountry=" + this.asnCountry + ", ipCountry=" + this.ipCountry + ", state=" + this.state + ")";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -56,10 +56,7 @@ public class DomainInformationService {
|
||||
|
||||
ResultSet rs;
|
||||
|
||||
rs = stmt.executeQuery(STR."""
|
||||
SELECT IP, NODE_AFFINITY, DOMAIN_NAME, STATE, IFNULL(RANK, 1) AS RANK
|
||||
FROM EC_DOMAIN WHERE ID=\{domainId}
|
||||
""");
|
||||
rs = stmt.executeQuery("SELECT IP, NODE_AFFINITY, DOMAIN_NAME, STATE, IFNULL(RANK, 1) AS RANK\n FROM EC_DOMAIN WHERE ID=" + domainId + "\n ");
|
||||
if (rs.next()) {
|
||||
String ip = rs.getString("IP");
|
||||
|
||||
@ -77,20 +74,14 @@ public class DomainInformationService {
|
||||
builder.setState(rs.getString("STATE"));
|
||||
builder.setRanking(Math.round(100.0*(1.0-rs.getDouble("RANK"))));
|
||||
}
|
||||
rs = stmt.executeQuery(STR."""
|
||||
SELECT 1 FROM CRAWL_QUEUE
|
||||
INNER JOIN EC_DOMAIN ON CRAWL_QUEUE.DOMAIN_NAME = EC_DOMAIN.DOMAIN_NAME
|
||||
WHERE EC_DOMAIN.ID=\{domainId}
|
||||
""");
|
||||
rs = stmt.executeQuery("SELECT 1 FROM CRAWL_QUEUE\nINNER JOIN EC_DOMAIN ON CRAWL_QUEUE.DOMAIN_NAME = EC_DOMAIN.DOMAIN_NAME\nWHERE EC_DOMAIN.ID=" + domainId + "\n ");
|
||||
inCrawlQueue = rs.next();
|
||||
builder.setInCrawlQueue(inCrawlQueue);
|
||||
|
||||
builder.setIncomingLinks(linkGraphClient.countLinksToDomain(domainId));
|
||||
builder.setOutboundLinks(linkGraphClient.countLinksFromDomain(domainId));
|
||||
|
||||
rs = stmt.executeQuery(STR."""
|
||||
SELECT KNOWN_URLS, GOOD_URLS, VISITED_URLS FROM DOMAIN_METADATA WHERE ID=\{domainId}
|
||||
""");
|
||||
rs = stmt.executeQuery("SELECT KNOWN_URLS, GOOD_URLS, VISITED_URLS FROM DOMAIN_METADATA WHERE ID=" + domainId + "\n ");
|
||||
if (rs.next()) {
|
||||
pagesVisited = rs.getInt("VISITED_URLS");
|
||||
|
||||
|
@ -1,14 +1,3 @@
|
||||
package nu.marginalia.api.math.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.ToString;
|
||||
|
||||
@AllArgsConstructor
|
||||
@Getter
|
||||
@ToString
|
||||
public class DictionaryEntry {
|
||||
public final String type;
|
||||
public final String word;
|
||||
public final String definition;
|
||||
}
|
||||
/**
 * One dictionary entry, made up of three strings.
 *
 * @param type       the entry's type string
 * @param word       the word this entry belongs to
 * @param definition the definition text
 */
public record DictionaryEntry(
        String type,
        String word,
        String definition
) {
}
|
||||
|
@ -1,14 +1,10 @@
|
||||
package nu.marginalia.api.math.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.ToString;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@ToString @Getter @AllArgsConstructor @NoArgsConstructor
|
||||
public class DictionaryResponse {
|
||||
public String word;
|
||||
public List<DictionaryEntry> entries;
|
||||
public record DictionaryResponse(String word, List<DictionaryEntry> entries) {
|
||||
public DictionaryResponse(String word, List<DictionaryEntry> entries) {
|
||||
this.word = word;
|
||||
this.entries = entries.stream().toList(); // Make an immutable copy
|
||||
}
|
||||
}
|
||||
|
@ -39,12 +39,12 @@ public class MathGrpcService
|
||||
.newBuilder()
|
||||
.setWord(request.getWord());
|
||||
|
||||
for (var def : definition.entries) {
|
||||
for (var def : definition.entries()) {
|
||||
responseBuilder.addEntries(
|
||||
RpcDictionaryEntry.newBuilder()
|
||||
.setWord(def.word)
|
||||
.setDefinition(def.definition)
|
||||
.setType(def.type)
|
||||
.setWord(def.word())
|
||||
.setDefinition(def.definition())
|
||||
.setType(def.type())
|
||||
);
|
||||
}
|
||||
|
||||
|
@ -8,7 +8,8 @@ import nu.marginalia.api.math.model.DictionaryResponse;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
@Singleton
|
||||
public class DictionaryService {
|
||||
@ -25,8 +26,7 @@ public class DictionaryService {
|
||||
}
|
||||
|
||||
public DictionaryResponse define(String word) {
|
||||
DictionaryResponse response = new DictionaryResponse();
|
||||
response.entries = new ArrayList<>();
|
||||
List<DictionaryEntry> entries = new ArrayList<>();
|
||||
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
var stmt = connection.prepareStatement("SELECT TYPE,WORD,DEFINITION FROM REF_DICTIONARY WHERE WORD=?");
|
||||
@ -34,14 +34,14 @@ public class DictionaryService {
|
||||
|
||||
var rsp = stmt.executeQuery();
|
||||
while (rsp.next()) {
|
||||
response.entries.add(new DictionaryEntry(rsp.getString(1), rsp.getString(2), rsp.getString(3)));
|
||||
entries.add(new DictionaryEntry(rsp.getString(1), rsp.getString(2), rsp.getString(3)));
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
return response;
|
||||
return new DictionaryResponse(word, entries);
|
||||
}
|
||||
|
||||
public List<String> spellCheck(String word) {
|
||||
|
@ -1,10 +1,7 @@
|
||||
package nu.marginalia.functions.math.eval;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.SneakyThrows;
|
||||
import lombok.ToString;
|
||||
|
||||
import com.google.inject.Singleton;
|
||||
|
||||
import java.math.RoundingMode;
|
||||
import java.text.DecimalFormat;
|
||||
import java.text.NumberFormat;
|
||||
@ -44,7 +41,6 @@ public class MathParser {
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public double eval(String inputExpression) throws ParseException {
|
||||
if (isTrivial.test(inputExpression)) {
|
||||
return Double.parseDouble(inputExpression);
|
||||
@ -243,10 +239,13 @@ public class MathParser {
|
||||
}
|
||||
}
|
||||
|
||||
@AllArgsConstructor @ToString
|
||||
class Token {
|
||||
public final char tokenType;
|
||||
|
||||
/**
 * Creates a token that carries only a type character.
 *
 * @param tokenType the character stored in {@link #tokenType}
 */
public Token(char tokenType) {
    this.tokenType = tokenType;
}
|
||||
|
||||
public double evaluate() {
|
||||
throw new IllegalArgumentException("Can't evaluate" + this);
|
||||
}
|
||||
@ -254,9 +253,12 @@ class Token {
|
||||
public void transform(Function<List<Token>, List<Token>> mapper) {
|
||||
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "Token(tokenType=" + this.tokenType + ")";
|
||||
}
|
||||
}
|
||||
|
||||
@ToString
|
||||
class StringToken extends Token {
|
||||
public final String value;
|
||||
|
||||
@ -274,6 +276,10 @@ class StringToken extends Token {
|
||||
|
||||
return Double.parseDouble(value);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "StringToken(value=" + this.value + ")";
|
||||
}
|
||||
}
|
||||
|
||||
class UniExpression extends Token {
|
||||
@ -302,7 +308,6 @@ class UniExpression extends Token {
|
||||
}
|
||||
}
|
||||
|
||||
@ToString
|
||||
class GroupExpression extends Token {
|
||||
public List<Token> argument;
|
||||
|
||||
@ -323,6 +328,10 @@ class GroupExpression extends Token {
|
||||
public void transform(Function<List<Token>, List<Token>> mapper) {
|
||||
argument = mapper.apply(argument);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "GroupExpression(argument=" + this.argument + ")";
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -1,10 +1,9 @@
|
||||
package nu.marginalia.functions.math.eval;
|
||||
|
||||
import com.opencsv.CSVReader;
|
||||
import lombok.SneakyThrows;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.opencsv.CSVReader;
|
||||
|
||||
import java.io.InputStreamReader;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.DecimalFormat;
|
||||
@ -19,7 +18,6 @@ public class Units {
|
||||
private final Map<String, Unit> unitsByName = new HashMap<>();
|
||||
private final MathParser mathParser;
|
||||
|
||||
@SneakyThrows
|
||||
@Inject
|
||||
public Units(MathParser mathParser) {
|
||||
this.mathParser = mathParser;
|
||||
@ -41,6 +39,9 @@ public class Units {
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.api.searchquery;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.api.searchquery.model.query.ProcessedQuery;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryResponse;
|
||||
@ -128,24 +127,28 @@ public class QueryProtobufCodec {
|
||||
);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private static DecoratedSearchResultItem convertDecoratedResult(RpcDecoratedResultItem results) {
|
||||
return new DecoratedSearchResultItem(
|
||||
convertRawResult(results.getRawItem()),
|
||||
new EdgeUrl(results.getUrl()),
|
||||
results.getTitle(),
|
||||
results.getDescription(),
|
||||
results.getUrlQuality(),
|
||||
results.getFormat(),
|
||||
results.getFeatures(),
|
||||
results.getPubYear(), // ??,
|
||||
results.getDataHash(),
|
||||
results.getWordsTotal(),
|
||||
results.getBestPositions(),
|
||||
results.getRankingScore(),
|
||||
results.getResultsFromDomain(),
|
||||
convertRankingDetails(results.getRankingDetails())
|
||||
);
|
||||
try {
|
||||
return new DecoratedSearchResultItem(
|
||||
convertRawResult(results.getRawItem()),
|
||||
new EdgeUrl(results.getUrl()),
|
||||
results.getTitle(),
|
||||
results.getDescription(),
|
||||
results.getUrlQuality(),
|
||||
results.getFormat(),
|
||||
results.getFeatures(),
|
||||
results.getPubYear(), // ??,
|
||||
results.getDataHash(),
|
||||
results.getWordsTotal(),
|
||||
results.getBestPositions(),
|
||||
results.getRankingScore(),
|
||||
results.getResultsFromDomain(),
|
||||
convertRankingDetails(results.getRankingDetails())
|
||||
);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException("Failed to convert result", ex);
|
||||
}
|
||||
}
|
||||
|
||||
private static ResultRankingDetails convertRankingDetails(RpcResultRankingDetails rankingDetails) {
|
||||
@ -325,24 +328,28 @@ public class QueryProtobufCodec {
|
||||
return builder.build();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public static DecoratedSearchResultItem convertQueryResult(RpcDecoratedResultItem rpcDecoratedResultItem) {
|
||||
return new DecoratedSearchResultItem(
|
||||
convertRawResult(rpcDecoratedResultItem.getRawItem()),
|
||||
new EdgeUrl(rpcDecoratedResultItem.getUrl()),
|
||||
rpcDecoratedResultItem.getTitle(),
|
||||
rpcDecoratedResultItem.getDescription(),
|
||||
rpcDecoratedResultItem.getUrlQuality(),
|
||||
rpcDecoratedResultItem.getFormat(),
|
||||
rpcDecoratedResultItem.getFeatures(),
|
||||
rpcDecoratedResultItem.getPubYear(),
|
||||
rpcDecoratedResultItem.getDataHash(),
|
||||
rpcDecoratedResultItem.getWordsTotal(),
|
||||
rpcDecoratedResultItem.getBestPositions(),
|
||||
rpcDecoratedResultItem.getRankingScore(),
|
||||
rpcDecoratedResultItem.getResultsFromDomain(),
|
||||
convertRankingDetails(rpcDecoratedResultItem.getRankingDetails())
|
||||
);
|
||||
try {
|
||||
return new DecoratedSearchResultItem(
|
||||
convertRawResult(rpcDecoratedResultItem.getRawItem()),
|
||||
new EdgeUrl(rpcDecoratedResultItem.getUrl()),
|
||||
rpcDecoratedResultItem.getTitle(),
|
||||
rpcDecoratedResultItem.getDescription(),
|
||||
rpcDecoratedResultItem.getUrlQuality(),
|
||||
rpcDecoratedResultItem.getFormat(),
|
||||
rpcDecoratedResultItem.getFeatures(),
|
||||
rpcDecoratedResultItem.getPubYear(),
|
||||
rpcDecoratedResultItem.getDataHash(),
|
||||
rpcDecoratedResultItem.getWordsTotal(),
|
||||
rpcDecoratedResultItem.getBestPositions(),
|
||||
rpcDecoratedResultItem.getRankingScore(),
|
||||
rpcDecoratedResultItem.getResultsFromDomain(),
|
||||
convertRankingDetails(rpcDecoratedResultItem.getRankingDetails())
|
||||
);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException("Failed to convert result", ex);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -1,40 +1,49 @@
|
||||
package nu.marginalia.api.searchquery.model.query;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.Getter;
|
||||
import lombok.With;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@Getter
|
||||
@AllArgsConstructor
|
||||
@With
|
||||
@EqualsAndHashCode
|
||||
public class SearchQuery {
|
||||
|
||||
/** An infix style expression that encodes the required terms in the query */
|
||||
/**
|
||||
* An infix style expression that encodes the required terms in the query
|
||||
*/
|
||||
public final String compiledQuery;
|
||||
|
||||
/** All terms that appear in {@see compiledQuery} */
|
||||
/**
|
||||
* All terms that appear in {@link #compiledQuery}
|
||||
*/
|
||||
public final List<String> searchTermsInclude;
|
||||
|
||||
/** These terms must be absent from the document */
|
||||
/**
|
||||
* These terms must be absent from the document
|
||||
*/
|
||||
public final List<String> searchTermsExclude;
|
||||
|
||||
/** These terms must be present in the document, but are not used in ranking */
|
||||
/**
|
||||
* These terms must be present in the document, but are not used in ranking
|
||||
*/
|
||||
public final List<String> searchTermsAdvice;
|
||||
|
||||
/** If these optional terms are present in the document, rank it highly */
|
||||
/**
|
||||
* If these optional terms are present in the document, rank it highly
|
||||
*/
|
||||
public final List<String> searchTermsPriority;
|
||||
|
||||
/** Terms that we require to be in the same sentence */
|
||||
/**
|
||||
* Terms that we require to be in the same sentence
|
||||
*/
|
||||
public final List<SearchPhraseConstraint> phraseConstraints;
|
||||
|
||||
@Deprecated // why does this exist?
|
||||
private double value = 0;
|
||||
/**
 * Creates a query from its compiled form and term lists.
 * Each argument is stored as given; the lists are not copied.
 *
 * @param compiledQuery       stored in {@code compiledQuery}
 * @param searchTermsInclude  stored in {@code searchTermsInclude}
 * @param searchTermsExclude  stored in {@code searchTermsExclude}
 * @param searchTermsAdvice   stored in {@code searchTermsAdvice}
 * @param searchTermsPriority stored in {@code searchTermsPriority}
 * @param phraseConstraints   stored in {@code phraseConstraints}
 */
public SearchQuery(
        String compiledQuery,
        List<String> searchTermsInclude,
        List<String> searchTermsExclude,
        List<String> searchTermsAdvice,
        List<String> searchTermsPriority,
        List<SearchPhraseConstraint> phraseConstraints
) {
    this.compiledQuery = compiledQuery;
    this.searchTermsInclude = searchTermsInclude;
    this.searchTermsExclude = searchTermsExclude;
    this.searchTermsAdvice = searchTermsAdvice;
    this.searchTermsPriority = searchTermsPriority;
    this.phraseConstraints = phraseConstraints;
}
|
||||
|
||||
public static SearchQueryBuilder builder() {
|
||||
return new SearchQueryBuilder();
|
||||
@ -49,42 +58,70 @@ public class SearchQuery {
|
||||
this.phraseConstraints = new ArrayList<>();
|
||||
}
|
||||
|
||||
public SearchQuery(String compiledQuery,
|
||||
List<String> searchTermsInclude,
|
||||
List<String> searchTermsExclude,
|
||||
List<String> searchTermsAdvice,
|
||||
List<String> searchTermsPriority,
|
||||
List<SearchPhraseConstraint> phraseConstraints) {
|
||||
this.compiledQuery = compiledQuery;
|
||||
this.searchTermsInclude = searchTermsInclude;
|
||||
this.searchTermsExclude = searchTermsExclude;
|
||||
this.searchTermsAdvice = searchTermsAdvice;
|
||||
this.searchTermsPriority = searchTermsPriority;
|
||||
this.phraseConstraints = phraseConstraints;
|
||||
}
|
||||
|
||||
@Deprecated // why does this exist?
|
||||
public SearchQuery setValue(double value) {
|
||||
if (Double.isInfinite(value) || Double.isNaN(value)) {
|
||||
this.value = Double.MAX_VALUE;
|
||||
} else {
|
||||
this.value = value;
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
if (!compiledQuery.isEmpty()) sb.append("compiledQuery=").append(compiledQuery).append(", ");
|
||||
if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] ")));
|
||||
if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] ")));
|
||||
if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] ")));
|
||||
if (!phraseConstraints.isEmpty()) sb.append("phraseConstraints=").append(phraseConstraints.stream().map(coh->coh.terms().stream().collect(Collectors.joining(",", "[", "] "))).collect(Collectors.joining(", ")));
|
||||
if (!searchTermsExclude.isEmpty())
|
||||
sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] ")));
|
||||
if (!searchTermsAdvice.isEmpty())
|
||||
sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] ")));
|
||||
if (!searchTermsPriority.isEmpty())
|
||||
sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] ")));
|
||||
if (!phraseConstraints.isEmpty())
|
||||
sb.append("phraseConstraints=").append(phraseConstraints.stream().map(coh -> coh.terms().stream().collect(Collectors.joining(",", "[", "] "))).collect(Collectors.joining(", ")));
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
public String getCompiledQuery() {
|
||||
return this.compiledQuery;
|
||||
}
|
||||
|
||||
public List<String> getSearchTermsInclude() {
|
||||
return this.searchTermsInclude;
|
||||
}
|
||||
|
||||
public List<String> getSearchTermsExclude() {
|
||||
return this.searchTermsExclude;
|
||||
}
|
||||
|
||||
public List<String> getSearchTermsAdvice() {
|
||||
return this.searchTermsAdvice;
|
||||
}
|
||||
|
||||
public List<String> getSearchTermsPriority() {
|
||||
return this.searchTermsPriority;
|
||||
}
|
||||
|
||||
public List<SearchPhraseConstraint> getPhraseConstraints() {
|
||||
return this.phraseConstraints;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public final boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (!(o instanceof SearchQuery that)) return false;
|
||||
|
||||
return Objects.equals(compiledQuery, that.compiledQuery)
|
||||
&& Objects.equals(searchTermsInclude, that.searchTermsInclude)
|
||||
&& Objects.equals(searchTermsExclude, that.searchTermsExclude)
|
||||
&& Objects.equals(searchTermsAdvice, that.searchTermsAdvice)
|
||||
&& Objects.equals(searchTermsPriority, that.searchTermsPriority)
|
||||
&& Objects.equals(phraseConstraints, that.phraseConstraints);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(compiledQuery,
|
||||
searchTermsInclude,
|
||||
searchTermsExclude,
|
||||
searchTermsAdvice,
|
||||
searchTermsPriority,
|
||||
phraseConstraints);
|
||||
}
|
||||
|
||||
public static class SearchQueryBuilder {
|
||||
private String compiledQuery;
|
||||
public final List<String> searchTermsInclude = new ArrayList<>();
|
||||
@ -130,7 +167,9 @@ public class SearchQuery {
|
||||
return new SearchQuery(compiledQuery, searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchPhraseConstraints);
|
||||
}
|
||||
|
||||
/** If there are no ranking terms, promote the advice terms to ranking terms */
|
||||
/**
|
||||
* If there are no ranking terms, promote the advice terms to ranking terms
|
||||
*/
|
||||
public void promoteNonRankingTerms() {
|
||||
if (searchTermsInclude.isEmpty() && !searchTermsAdvice.isEmpty()) {
|
||||
searchTermsInclude.addAll(searchTermsAdvice);
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.api.searchquery.model.query;
|
||||
|
||||
import lombok.*;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
@ -8,29 +7,207 @@ import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@ToString @Getter @Builder @With @AllArgsConstructor
|
||||
public class SearchSpecification {
|
||||
public SearchQuery query;
|
||||
|
||||
/** If present and not empty, limit the search to these domain IDs */
|
||||
/**
|
||||
* If present and not empty, limit the search to these domain IDs
|
||||
*/
|
||||
public List<Integer> domains;
|
||||
|
||||
public String searchSetIdentifier;
|
||||
|
||||
public final String humanQuery;
|
||||
|
||||
@Builder.Default
|
||||
public final SpecificationLimit quality = SpecificationLimit.none();
|
||||
@Builder.Default
|
||||
public final SpecificationLimit year = SpecificationLimit.none();
|
||||
@Builder.Default
|
||||
public final SpecificationLimit size = SpecificationLimit.none();
|
||||
@Builder.Default
|
||||
public final SpecificationLimit rank = SpecificationLimit.none();
|
||||
public SpecificationLimit quality;
|
||||
public SpecificationLimit year;
|
||||
public SpecificationLimit size;
|
||||
public SpecificationLimit rank;
|
||||
|
||||
public final QueryLimits queryLimits;
|
||||
|
||||
public final QueryStrategy queryStrategy;
|
||||
|
||||
public final ResultRankingParameters rankingParams;
|
||||
|
||||
/**
 * Creates a specification from all eleven of its components.
 * Each argument is stored in the field of the same name, as given.
 *
 * @param query               the search query
 * @param domains             domain IDs to limit the search to
 * @param searchSetIdentifier the search set identifier
 * @param humanQuery          the human query string
 * @param quality             the quality limit
 * @param year                the year limit
 * @param size                the size limit
 * @param rank                the rank limit
 * @param queryLimits         the query limits
 * @param queryStrategy       the query strategy
 * @param rankingParams       the result ranking parameters
 */
public SearchSpecification(
        SearchQuery query,
        List<Integer> domains,
        String searchSetIdentifier,
        String humanQuery,
        SpecificationLimit quality,
        SpecificationLimit year,
        SpecificationLimit size,
        SpecificationLimit rank,
        QueryLimits queryLimits,
        QueryStrategy queryStrategy,
        ResultRankingParameters rankingParams
) {
    this.query = query;
    this.domains = domains;
    this.searchSetIdentifier = searchSetIdentifier;
    this.humanQuery = humanQuery;

    this.quality = quality;
    this.year = year;
    this.size = size;
    this.rank = rank;

    this.queryLimits = queryLimits;
    this.queryStrategy = queryStrategy;
    this.rankingParams = rankingParams;
}
|
||||
|
||||
public static SearchSpecificationBuilder builder() {
|
||||
return new SearchSpecificationBuilder();
|
||||
}
|
||||
|
||||
public SearchQuery getQuery() {
|
||||
return this.query;
|
||||
}
|
||||
|
||||
public List<Integer> getDomains() {
|
||||
return this.domains;
|
||||
}
|
||||
|
||||
public String getSearchSetIdentifier() {
|
||||
return this.searchSetIdentifier;
|
||||
}
|
||||
|
||||
public String getHumanQuery() {
|
||||
return this.humanQuery;
|
||||
}
|
||||
|
||||
public SpecificationLimit getQuality() {
|
||||
return this.quality;
|
||||
}
|
||||
|
||||
public SpecificationLimit getYear() {
|
||||
return this.year;
|
||||
}
|
||||
|
||||
public SpecificationLimit getSize() {
|
||||
return this.size;
|
||||
}
|
||||
|
||||
public SpecificationLimit getRank() {
|
||||
return this.rank;
|
||||
}
|
||||
|
||||
public QueryLimits getQueryLimits() {
|
||||
return this.queryLimits;
|
||||
}
|
||||
|
||||
public QueryStrategy getQueryStrategy() {
|
||||
return this.queryStrategy;
|
||||
}
|
||||
|
||||
public ResultRankingParameters getRankingParams() {
|
||||
return this.rankingParams;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "SearchSpecification(query=" + this.getQuery() + ", domains=" + this.getDomains() + ", searchSetIdentifier=" + this.getSearchSetIdentifier() + ", humanQuery=" + this.getHumanQuery() + ", quality=" + this.getQuality() + ", year=" + this.getYear() + ", size=" + this.getSize() + ", rank=" + this.getRank() + ", queryLimits=" + this.getQueryLimits() + ", queryStrategy=" + this.getQueryStrategy() + ", rankingParams=" + this.getRankingParams() + ")";
|
||||
}
|
||||
|
||||
public static class SearchSpecificationBuilder {
|
||||
private SearchQuery query;
|
||||
private List<Integer> domains;
|
||||
private String searchSetIdentifier;
|
||||
private String humanQuery;
|
||||
private SpecificationLimit quality$value;
|
||||
private boolean quality$set;
|
||||
private SpecificationLimit year$value;
|
||||
private boolean year$set;
|
||||
private SpecificationLimit size$value;
|
||||
private boolean size$set;
|
||||
private SpecificationLimit rank$value;
|
||||
private boolean rank$set;
|
||||
private QueryLimits queryLimits;
|
||||
private QueryStrategy queryStrategy;
|
||||
private ResultRankingParameters rankingParams;
|
||||
|
||||
SearchSpecificationBuilder() {
|
||||
}
|
||||
|
||||
public SearchSpecificationBuilder query(SearchQuery query) {
|
||||
this.query = query;
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchSpecificationBuilder domains(List<Integer> domains) {
|
||||
this.domains = domains;
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchSpecificationBuilder searchSetIdentifier(String searchSetIdentifier) {
|
||||
this.searchSetIdentifier = searchSetIdentifier;
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchSpecificationBuilder humanQuery(String humanQuery) {
|
||||
this.humanQuery = humanQuery;
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchSpecificationBuilder quality(SpecificationLimit quality) {
|
||||
this.quality$value = quality;
|
||||
this.quality$set = true;
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchSpecificationBuilder year(SpecificationLimit year) {
|
||||
this.year$value = year;
|
||||
this.year$set = true;
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchSpecificationBuilder size(SpecificationLimit size) {
|
||||
this.size$value = size;
|
||||
this.size$set = true;
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchSpecificationBuilder rank(SpecificationLimit rank) {
|
||||
this.rank$value = rank;
|
||||
this.rank$set = true;
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchSpecificationBuilder queryLimits(QueryLimits queryLimits) {
|
||||
this.queryLimits = queryLimits;
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchSpecificationBuilder queryStrategy(QueryStrategy queryStrategy) {
|
||||
this.queryStrategy = queryStrategy;
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchSpecificationBuilder rankingParams(ResultRankingParameters rankingParams) {
|
||||
this.rankingParams = rankingParams;
|
||||
return this;
|
||||
}
|
||||
|
||||
public SearchSpecification build() {
|
||||
SpecificationLimit quality$value = this.quality$value;
|
||||
if (!this.quality$set) {
|
||||
quality$value = SpecificationLimit.none();
|
||||
}
|
||||
SpecificationLimit year$value = this.year$value;
|
||||
if (!this.year$set) {
|
||||
year$value = SpecificationLimit.none();
|
||||
}
|
||||
SpecificationLimit size$value = this.size$value;
|
||||
if (!this.size$set) {
|
||||
size$value = SpecificationLimit.none();
|
||||
}
|
||||
SpecificationLimit rank$value = this.rank$value;
|
||||
if (!this.rank$set) {
|
||||
rank$value = SpecificationLimit.none();
|
||||
}
|
||||
return new SearchSpecification(this.query, this.domains, this.searchSetIdentifier, this.humanQuery, quality$value, year$value, size$value, rank$value, this.queryLimits, this.queryStrategy, this.rankingParams);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "SearchSpecification.SearchSpecificationBuilder(query=" + this.query + ", domains=" + this.domains + ", searchSetIdentifier=" + this.searchSetIdentifier + ", humanQuery=" + this.humanQuery + ", quality$value=" + this.quality$value + ", year$value=" + this.year$value + ", size$value=" + this.size$value + ", rank$value=" + this.rank$value + ", queryLimits=" + this.queryLimits + ", queryStrategy=" + this.queryStrategy + ", rankingParams=" + this.rankingParams + ")";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,7 +1,5 @@
|
||||
package nu.marginalia.api.searchquery.model.results;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
@ -9,8 +7,6 @@ import org.jetbrains.annotations.NotNull;
|
||||
import javax.annotation.Nullable;
|
||||
import java.util.List;
|
||||
|
||||
@Getter
|
||||
@ToString
|
||||
public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResultItem> {
|
||||
public final SearchResultItem rawIndexResult;
|
||||
|
||||
@ -24,7 +20,9 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
|
||||
@NotNull
|
||||
public final String format;
|
||||
|
||||
/** Document features bitmask, see HtmlFeature */
|
||||
/**
|
||||
* Document features bitmask, see HtmlFeature
|
||||
*/
|
||||
public final int features;
|
||||
|
||||
@Nullable
|
||||
@ -42,6 +40,7 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
|
||||
public long documentId() {
|
||||
return rawIndexResult.getDocumentId();
|
||||
}
|
||||
|
||||
public int domainId() {
|
||||
return rawIndexResult.getDomainId();
|
||||
}
|
||||
@ -74,8 +73,7 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
|
||||
int resultsFromDomain,
|
||||
@Nullable
|
||||
ResultRankingDetails rankingDetails
|
||||
)
|
||||
{
|
||||
) {
|
||||
this.rawIndexResult = rawIndexResult;
|
||||
this.url = url;
|
||||
this.title = title;
|
||||
@ -94,11 +92,73 @@ public class DecoratedSearchResultItem implements Comparable<DecoratedSearchResu
|
||||
|
||||
@Override
|
||||
public int compareTo(@NotNull DecoratedSearchResultItem o) {
|
||||
int diff = Double.compare(rankingScore, o.rankingScore);
|
||||
int diff = Double.compare(rankingScore, o.rankingScore);
|
||||
|
||||
if (diff == 0)
|
||||
diff = Long.compare(documentId(), o.documentId());
|
||||
|
||||
return diff;
|
||||
}
|
||||
|
||||
public SearchResultItem getRawIndexResult() {
|
||||
return this.rawIndexResult;
|
||||
}
|
||||
|
||||
public @NotNull EdgeUrl getUrl() {
|
||||
return this.url;
|
||||
}
|
||||
|
||||
public @NotNull String getTitle() {
|
||||
return this.title;
|
||||
}
|
||||
|
||||
public @NotNull String getDescription() {
|
||||
return this.description;
|
||||
}
|
||||
|
||||
public double getUrlQuality() {
|
||||
return this.urlQuality;
|
||||
}
|
||||
|
||||
public @NotNull String getFormat() {
|
||||
return this.format;
|
||||
}
|
||||
|
||||
public int getFeatures() {
|
||||
return this.features;
|
||||
}
|
||||
|
||||
@Nullable
|
||||
public Integer getPubYear() {
|
||||
return this.pubYear;
|
||||
}
|
||||
|
||||
public long getDataHash() {
|
||||
return this.dataHash;
|
||||
}
|
||||
|
||||
public int getWordsTotal() {
|
||||
return this.wordsTotal;
|
||||
}
|
||||
|
||||
public long getBestPositions() {
|
||||
return this.bestPositions;
|
||||
}
|
||||
|
||||
public double getRankingScore() {
|
||||
return this.rankingScore;
|
||||
}
|
||||
|
||||
public int getResultsFromDomain() {
|
||||
return this.resultsFromDomain;
|
||||
}
|
||||
|
||||
@Nullable
|
||||
public ResultRankingDetails getRankingDetails() {
|
||||
return this.rankingDetails;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "DecoratedSearchResultItem(rawIndexResult=" + this.getRawIndexResult() + ", url=" + this.getUrl() + ", title=" + this.getTitle() + ", description=" + this.getDescription() + ", urlQuality=" + this.getUrlQuality() + ", format=" + this.getFormat() + ", features=" + this.getFeatures() + ", pubYear=" + this.getPubYear() + ", dataHash=" + this.getDataHash() + ", wordsTotal=" + this.getWordsTotal() + ", bestPositions=" + this.getBestPositions() + ", rankingScore=" + this.getRankingScore() + ", resultsFromDomain=" + this.getResultsFromDomain() + ", rankingDetails=" + this.getRankingDetails() + ")";
|
||||
}
|
||||
}
|
||||
|
@ -1,11 +1,9 @@
|
||||
package nu.marginalia.api.searchquery.model.results;
|
||||
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
|
||||
|
||||
import java.util.BitSet;
|
||||
|
||||
@ToString
|
||||
public class ResultRankingContext {
|
||||
private final int docCount;
|
||||
public final ResultRankingParameters params;
|
||||
@ -43,4 +41,15 @@ public class ResultRankingContext {
|
||||
return docCount;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "ResultRankingContext{" +
|
||||
"docCount=" + docCount +
|
||||
", params=" + params +
|
||||
", regularMask=" + regularMask +
|
||||
", ngramsMask=" + ngramsMask +
|
||||
", fullCounts=" + fullCounts +
|
||||
", priorityCounts=" + priorityCounts +
|
||||
'}';
|
||||
}
|
||||
}
|
||||
|
@ -1,33 +1,40 @@
|
||||
package nu.marginalia.api.searchquery.model.results;
|
||||
|
||||
import lombok.*;
|
||||
import java.util.Objects;
|
||||
|
||||
@Builder
|
||||
@AllArgsConstructor
|
||||
@ToString
|
||||
@EqualsAndHashCode
|
||||
@Getter // getter for the mustache template engine's behalf
|
||||
public class ResultRankingParameters {
|
||||
|
||||
/** Tuning for BM25 when applied to full document matches */
|
||||
/**
|
||||
* Tuning for BM25 when applied to full document matches
|
||||
*/
|
||||
public final Bm25Parameters bm25Params;
|
||||
|
||||
/** Documents below this length are penalized */
|
||||
/**
|
||||
* Documents below this length are penalized
|
||||
*/
|
||||
public int shortDocumentThreshold;
|
||||
|
||||
public double shortDocumentPenalty;
|
||||
|
||||
|
||||
/** Scaling factor associated with domain rank (unscaled rank value is 0-255; high is good) */
|
||||
/**
|
||||
* Scaling factor associated with domain rank (unscaled rank value is 0-255; high is good)
|
||||
*/
|
||||
public double domainRankBonus;
|
||||
|
||||
/** Scaling factor associated with document quality (unscaled rank value is 0-15; high is bad) */
|
||||
/**
|
||||
* Scaling factor associated with document quality (unscaled rank value is 0-15; high is bad)
|
||||
*/
|
||||
public double qualityPenalty;
|
||||
|
||||
/** Average sentence length values below this threshold are penalized, range [0-4), 2 or 3 is probably what you want */
|
||||
/**
|
||||
* Average sentence length values below this threshold are penalized, range [0-4), 2 or 3 is probably what you want
|
||||
*/
|
||||
public int shortSentenceThreshold;
|
||||
|
||||
/** Magnitude of penalty for documents with low average sentence length */
|
||||
/**
|
||||
* Magnitude of penalty for documents with low average sentence length
|
||||
*/
|
||||
public double shortSentencePenalty;
|
||||
|
||||
public double bm25Weight;
|
||||
@ -40,13 +47,30 @@ public class ResultRankingParameters {
|
||||
|
||||
public boolean exportDebugData;
|
||||
|
||||
public ResultRankingParameters(Bm25Parameters bm25Params, int shortDocumentThreshold, double shortDocumentPenalty, double domainRankBonus, double qualityPenalty, int shortSentenceThreshold, double shortSentencePenalty, double bm25Weight, double tcfFirstPosition, double tcfVerbatim, double tcfProximity, TemporalBias temporalBias, double temporalBiasWeight, boolean exportDebugData) {
|
||||
this.bm25Params = bm25Params;
|
||||
this.shortDocumentThreshold = shortDocumentThreshold;
|
||||
this.shortDocumentPenalty = shortDocumentPenalty;
|
||||
this.domainRankBonus = domainRankBonus;
|
||||
this.qualityPenalty = qualityPenalty;
|
||||
this.shortSentenceThreshold = shortSentenceThreshold;
|
||||
this.shortSentencePenalty = shortSentencePenalty;
|
||||
this.bm25Weight = bm25Weight;
|
||||
this.tcfFirstPosition = tcfFirstPosition;
|
||||
this.tcfVerbatim = tcfVerbatim;
|
||||
this.tcfProximity = tcfProximity;
|
||||
this.temporalBias = temporalBias;
|
||||
this.temporalBiasWeight = temporalBiasWeight;
|
||||
this.exportDebugData = exportDebugData;
|
||||
}
|
||||
|
||||
public static ResultRankingParameters sensibleDefaults() {
|
||||
return builder()
|
||||
.bm25Params(new Bm25Parameters(1.2, 0.5))
|
||||
.shortDocumentThreshold(2000)
|
||||
.shortDocumentPenalty(2.)
|
||||
.domainRankBonus(1/25.)
|
||||
.qualityPenalty(1/15.)
|
||||
.domainRankBonus(1 / 25.)
|
||||
.qualityPenalty(1 / 15.)
|
||||
.shortSentenceThreshold(2)
|
||||
.shortSentencePenalty(5)
|
||||
.bm25Weight(1.)
|
||||
@ -59,7 +83,196 @@ public class ResultRankingParameters {
|
||||
.build();
|
||||
}
|
||||
|
||||
public static ResultRankingParametersBuilder builder() {
|
||||
return new ResultRankingParametersBuilder();
|
||||
}
|
||||
|
||||
public Bm25Parameters getBm25Params() {
|
||||
return this.bm25Params;
|
||||
}
|
||||
|
||||
public int getShortDocumentThreshold() {
|
||||
return this.shortDocumentThreshold;
|
||||
}
|
||||
|
||||
public double getShortDocumentPenalty() {
|
||||
return this.shortDocumentPenalty;
|
||||
}
|
||||
|
||||
public double getDomainRankBonus() {
|
||||
return this.domainRankBonus;
|
||||
}
|
||||
|
||||
public double getQualityPenalty() {
|
||||
return this.qualityPenalty;
|
||||
}
|
||||
|
||||
public int getShortSentenceThreshold() {
|
||||
return this.shortSentenceThreshold;
|
||||
}
|
||||
|
||||
public double getShortSentencePenalty() {
|
||||
return this.shortSentencePenalty;
|
||||
}
|
||||
|
||||
public double getBm25Weight() {
|
||||
return this.bm25Weight;
|
||||
}
|
||||
|
||||
public double getTcfFirstPosition() {
|
||||
return this.tcfFirstPosition;
|
||||
}
|
||||
|
||||
public double getTcfVerbatim() {
|
||||
return this.tcfVerbatim;
|
||||
}
|
||||
|
||||
public double getTcfProximity() {
|
||||
return this.tcfProximity;
|
||||
}
|
||||
|
||||
public TemporalBias getTemporalBias() {
|
||||
return this.temporalBias;
|
||||
}
|
||||
|
||||
public double getTemporalBiasWeight() {
|
||||
return this.temporalBiasWeight;
|
||||
}
|
||||
|
||||
public boolean isExportDebugData() {
|
||||
return this.exportDebugData;
|
||||
}
|
||||
|
||||
@Override
|
||||
public final boolean equals(Object o) {
|
||||
if (this == o) return true;
|
||||
if (!(o instanceof ResultRankingParameters that)) return false;
|
||||
|
||||
return shortDocumentThreshold == that.shortDocumentThreshold && Double.compare(shortDocumentPenalty, that.shortDocumentPenalty) == 0 && Double.compare(domainRankBonus, that.domainRankBonus) == 0 && Double.compare(qualityPenalty, that.qualityPenalty) == 0 && shortSentenceThreshold == that.shortSentenceThreshold && Double.compare(shortSentencePenalty, that.shortSentencePenalty) == 0 && Double.compare(bm25Weight, that.bm25Weight) == 0 && Double.compare(tcfFirstPosition, that.tcfFirstPosition) == 0 && Double.compare(tcfVerbatim, that.tcfVerbatim) == 0 && Double.compare(tcfProximity, that.tcfProximity) == 0 && Double.compare(temporalBiasWeight, that.temporalBiasWeight) == 0 && exportDebugData == that.exportDebugData && Objects.equals(bm25Params, that.bm25Params) && temporalBias == that.temporalBias;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
int result = Objects.hashCode(bm25Params);
|
||||
result = 31 * result + shortDocumentThreshold;
|
||||
result = 31 * result + Double.hashCode(shortDocumentPenalty);
|
||||
result = 31 * result + Double.hashCode(domainRankBonus);
|
||||
result = 31 * result + Double.hashCode(qualityPenalty);
|
||||
result = 31 * result + shortSentenceThreshold;
|
||||
result = 31 * result + Double.hashCode(shortSentencePenalty);
|
||||
result = 31 * result + Double.hashCode(bm25Weight);
|
||||
result = 31 * result + Double.hashCode(tcfFirstPosition);
|
||||
result = 31 * result + Double.hashCode(tcfVerbatim);
|
||||
result = 31 * result + Double.hashCode(tcfProximity);
|
||||
result = 31 * result + Objects.hashCode(temporalBias);
|
||||
result = 31 * result + Double.hashCode(temporalBiasWeight);
|
||||
result = 31 * result + Boolean.hashCode(exportDebugData);
|
||||
return result;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "ResultRankingParameters(bm25Params=" + this.getBm25Params() + ", shortDocumentThreshold=" + this.getShortDocumentThreshold() + ", shortDocumentPenalty=" + this.getShortDocumentPenalty() + ", domainRankBonus=" + this.getDomainRankBonus() + ", qualityPenalty=" + this.getQualityPenalty() + ", shortSentenceThreshold=" + this.getShortSentenceThreshold() + ", shortSentencePenalty=" + this.getShortSentencePenalty() + ", bm25Weight=" + this.getBm25Weight() + ", tcfFirstPosition=" + this.getTcfFirstPosition() + ", tcfVerbatim=" + this.getTcfVerbatim() + ", tcfProximity=" + this.getTcfProximity() + ", temporalBias=" + this.getTemporalBias() + ", temporalBiasWeight=" + this.getTemporalBiasWeight() + ", exportDebugData=" + this.isExportDebugData() + ")";
|
||||
}
|
||||
|
||||
public enum TemporalBias {
|
||||
RECENT, OLD, NONE
|
||||
}
|
||||
|
||||
public static class ResultRankingParametersBuilder {
|
||||
private Bm25Parameters bm25Params;
|
||||
private int shortDocumentThreshold;
|
||||
private double shortDocumentPenalty;
|
||||
private double domainRankBonus;
|
||||
private double qualityPenalty;
|
||||
private int shortSentenceThreshold;
|
||||
private double shortSentencePenalty;
|
||||
private double bm25Weight;
|
||||
private double tcfFirstPosition;
|
||||
private double tcfVerbatim;
|
||||
private double tcfProximity;
|
||||
private TemporalBias temporalBias;
|
||||
private double temporalBiasWeight;
|
||||
private boolean exportDebugData;
|
||||
|
||||
ResultRankingParametersBuilder() {
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder bm25Params(Bm25Parameters bm25Params) {
|
||||
this.bm25Params = bm25Params;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder shortDocumentThreshold(int shortDocumentThreshold) {
|
||||
this.shortDocumentThreshold = shortDocumentThreshold;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder shortDocumentPenalty(double shortDocumentPenalty) {
|
||||
this.shortDocumentPenalty = shortDocumentPenalty;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder domainRankBonus(double domainRankBonus) {
|
||||
this.domainRankBonus = domainRankBonus;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder qualityPenalty(double qualityPenalty) {
|
||||
this.qualityPenalty = qualityPenalty;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder shortSentenceThreshold(int shortSentenceThreshold) {
|
||||
this.shortSentenceThreshold = shortSentenceThreshold;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder shortSentencePenalty(double shortSentencePenalty) {
|
||||
this.shortSentencePenalty = shortSentencePenalty;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder bm25Weight(double bm25Weight) {
|
||||
this.bm25Weight = bm25Weight;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder tcfFirstPosition(double tcfFirstPosition) {
|
||||
this.tcfFirstPosition = tcfFirstPosition;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder tcfVerbatim(double tcfVerbatim) {
|
||||
this.tcfVerbatim = tcfVerbatim;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder tcfProximity(double tcfProximity) {
|
||||
this.tcfProximity = tcfProximity;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder temporalBias(TemporalBias temporalBias) {
|
||||
this.temporalBias = temporalBias;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder temporalBiasWeight(double temporalBiasWeight) {
|
||||
this.temporalBiasWeight = temporalBiasWeight;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParametersBuilder exportDebugData(boolean exportDebugData) {
|
||||
this.exportDebugData = exportDebugData;
|
||||
return this;
|
||||
}
|
||||
|
||||
public ResultRankingParameters build() {
|
||||
return new ResultRankingParameters(this.bm25Params, this.shortDocumentThreshold, this.shortDocumentPenalty, this.domainRankBonus, this.qualityPenalty, this.shortSentenceThreshold, this.shortSentencePenalty, this.bm25Weight, this.tcfFirstPosition, this.tcfVerbatim, this.tcfProximity, this.temporalBias, this.temporalBiasWeight, this.exportDebugData);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "ResultRankingParameters.ResultRankingParametersBuilder(bm25Params=" + this.bm25Params + ", shortDocumentThreshold=" + this.shortDocumentThreshold + ", shortDocumentPenalty=" + this.shortDocumentPenalty + ", domainRankBonus=" + this.domainRankBonus + ", qualityPenalty=" + this.qualityPenalty + ", shortSentenceThreshold=" + this.shortSentenceThreshold + ", shortSentencePenalty=" + this.shortSentencePenalty + ", bm25Weight=" + this.bm25Weight + ", tcfFirstPosition=" + this.tcfFirstPosition + ", tcfVerbatim=" + this.tcfVerbatim + ", tcfProximity=" + this.tcfProximity + ", temporalBias=" + this.temporalBias + ", temporalBiasWeight=" + this.temporalBiasWeight + ", exportDebugData=" + this.exportDebugData + ")";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1,7 +1,5 @@
|
||||
package nu.marginalia.api.searchquery.model.results;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
@ -9,21 +7,30 @@ import org.jetbrains.annotations.NotNull;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/** Represents a document matching a search query */
|
||||
@AllArgsConstructor @Getter
|
||||
/**
|
||||
* Represents a document matching a search query
|
||||
*/
|
||||
public class SearchResultItem implements Comparable<SearchResultItem> {
|
||||
/** Encoded ID that contains both the URL id and its ranking. This is
|
||||
* probably not what you want, use getDocumentId() instead */
|
||||
/**
|
||||
* Encoded ID that contains both the URL id and its ranking. This is
|
||||
* probably not what you want, use getDocumentId() instead
|
||||
*/
|
||||
public final long combinedId;
|
||||
|
||||
/** Encoded document metadata */
|
||||
/**
|
||||
* Encoded document metadata
|
||||
*/
|
||||
public final long encodedDocMetadata;
|
||||
|
||||
/** Encoded html features of document */
|
||||
/**
|
||||
* Encoded html features of document
|
||||
*/
|
||||
|
||||
public final int htmlFeatures;
|
||||
|
||||
/** How did the subqueries match against the document ? */
|
||||
/**
|
||||
* How did the subqueries match against the document ?
|
||||
*/
|
||||
public final List<SearchResultKeywordScore> keywordScores;
|
||||
|
||||
public boolean hasPrioTerm;
|
||||
@ -45,6 +52,17 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
|
||||
this.scoreValue = score;
|
||||
}
|
||||
|
||||
public SearchResultItem(long combinedId, long encodedDocMetadata, int htmlFeatures, List<SearchResultKeywordScore> keywordScores, boolean hasPrioTerm, long bestPositions, DebugRankingFactors debugRankingFactors, double scoreValue) {
|
||||
this.combinedId = combinedId;
|
||||
this.encodedDocMetadata = encodedDocMetadata;
|
||||
this.htmlFeatures = htmlFeatures;
|
||||
this.keywordScores = keywordScores;
|
||||
this.hasPrioTerm = hasPrioTerm;
|
||||
this.bestPositions = bestPositions;
|
||||
this.debugRankingFactors = debugRankingFactors;
|
||||
this.scoreValue = scoreValue;
|
||||
}
|
||||
|
||||
|
||||
public long getDocumentId() {
|
||||
return UrlIdCodec.removeRank(combinedId);
|
||||
@ -56,9 +74,11 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
|
||||
|
||||
/* Used for evaluation */
|
||||
private transient double scoreValue = Double.MAX_VALUE;
|
||||
|
||||
public void setScore(double score) {
|
||||
scoreValue = score;
|
||||
}
|
||||
|
||||
public double getScore() {
|
||||
return scoreValue;
|
||||
}
|
||||
@ -81,7 +101,7 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
|
||||
if (other == this)
|
||||
return true;
|
||||
if (other instanceof SearchResultItem o) {
|
||||
return o.getDocumentId() == getDocumentId();
|
||||
return o.getDocumentId() == getDocumentId();
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@ -96,4 +116,35 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
|
||||
}
|
||||
|
||||
|
||||
public long getCombinedId() {
|
||||
return this.combinedId;
|
||||
}
|
||||
|
||||
public long getEncodedDocMetadata() {
|
||||
return this.encodedDocMetadata;
|
||||
}
|
||||
|
||||
public int getHtmlFeatures() {
|
||||
return this.htmlFeatures;
|
||||
}
|
||||
|
||||
public List<SearchResultKeywordScore> getKeywordScores() {
|
||||
return this.keywordScores;
|
||||
}
|
||||
|
||||
public boolean isHasPrioTerm() {
|
||||
return this.hasPrioTerm;
|
||||
}
|
||||
|
||||
public long getBestPositions() {
|
||||
return this.bestPositions;
|
||||
}
|
||||
|
||||
public DebugRankingFactors getDebugRankingFactors() {
|
||||
return this.debugRankingFactors;
|
||||
}
|
||||
|
||||
public double getScoreValue() {
|
||||
return this.scoreValue;
|
||||
}
|
||||
}
|
||||
|
@ -48,7 +48,7 @@ public record QWord(
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return STR."q{\{word}}";
|
||||
return "q{" + word + "}";
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -238,7 +238,7 @@ public class QWordGraph implements Iterable<QWord> {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("digraph {\n");
|
||||
for (var link : links) {
|
||||
sb.append(STR."\"\{link.from().word()}\" -> \"\{link.to.word()}\";\n");
|
||||
sb.append("\"" + link.from().word() + "\" -> \"" + link.to.word() + "\";\n");
|
||||
}
|
||||
sb.append("}\n");
|
||||
return sb.toString();
|
||||
|
@ -63,6 +63,6 @@ public class QWordPath {
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return STR."WordPath{nodes=\{nodes}\{'}'}";
|
||||
return "WordPath{nodes=" + nodes + '}';
|
||||
}
|
||||
}
|
||||
|
@ -127,7 +127,7 @@ public class QWordPathsRenderer {
|
||||
// Recurse into the branches:
|
||||
String branchPart = render(e.getValue(), reachability);
|
||||
|
||||
return STR."\{commonWord} \{branchPart}";
|
||||
return commonWord + " " + branchPart;
|
||||
})
|
||||
.collect(Collectors.joining(" | ", " ( ", " ) "));
|
||||
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.index.api;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.api.searchquery.IndexApiGrpc;
|
||||
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
|
||||
import nu.marginalia.api.searchquery.RpcIndexQuery;
|
||||
@ -51,7 +50,6 @@ public class IndexClient {
|
||||
) {}
|
||||
|
||||
/** Execute a query on the index partitions and return the combined results. */
|
||||
@SneakyThrows
|
||||
public AggregateQueryResponse executeQueries(RpcIndexQuery indexRequest, Pagination pagination) {
|
||||
List<CompletableFuture<Iterator<RpcDecoratedResultItem>>> futures =
|
||||
channelPool.call(IndexApiGrpc.IndexApiBlockingStub::query)
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.index.forward;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.index.domainrankings.DomainRankings;
|
||||
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
|
||||
import nu.marginalia.index.journal.IndexJournal;
|
||||
@ -40,9 +39,9 @@ class ForwardIndexConverterTest {
|
||||
private Path docsSpanData;
|
||||
|
||||
int workSetSize = 512;
|
||||
|
||||
@BeforeEach
|
||||
@SneakyThrows
|
||||
void setUp() {
|
||||
void setUp() throws Exception {
|
||||
|
||||
workDir = Files.createTempDirectory(getClass().getSimpleName());
|
||||
|
||||
@ -75,7 +74,7 @@ class ForwardIndexConverterTest {
|
||||
return UrlIdCodec.encodeId((int) domain, (int) url);
|
||||
}
|
||||
|
||||
public void createEntry(IndexJournalSlopWriter writer, int id) {
|
||||
public void createEntry(IndexJournalSlopWriter writer, int id) throws IOException {
|
||||
writer.put(
|
||||
createId(id, id/20),
|
||||
new SlopDocumentRecord.KeywordsProjection(
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.index.journal;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.hash.MurmurHash3_128;
|
||||
import nu.marginalia.model.processed.SlopDocumentRecord;
|
||||
import nu.marginalia.sequence.slop.VarintCodedSequenceArrayColumn;
|
||||
@ -53,8 +52,7 @@ public class IndexJournalSlopWriter extends SlopTable {
|
||||
spansWriter = IndexJournalPage.spans.create(this);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void put(long combinedId, SlopDocumentRecord.KeywordsProjection keywordsProjection) {
|
||||
public void put(long combinedId, SlopDocumentRecord.KeywordsProjection keywordsProjection) throws IOException {
|
||||
|
||||
combinedIdWriter.put(combinedId);
|
||||
featuresWriter.put(keywordsProjection.htmlFeatures());
|
||||
|
@ -1,10 +1,9 @@
|
||||
package nu.marginalia.index;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.array.page.LongQueryBuffer;
|
||||
import nu.marginalia.index.query.EntrySource;
|
||||
import nu.marginalia.sequence.io.BitReader;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import nu.marginalia.sequence.io.BitReader;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.ByteBuffer;
|
||||
@ -62,7 +61,6 @@ public class PrioIndexEntrySource implements EntrySource {
|
||||
}
|
||||
|
||||
@Override
|
||||
@SneakyThrows
|
||||
@SuppressWarnings("preview")
|
||||
public void read(LongQueryBuffer buffer) {
|
||||
var outputBuffer = buffer.asByteBuffer().order(ByteOrder.LITTLE_ENDIAN);
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.index.construction.full;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.index.construction.DocIdRewriter;
|
||||
import nu.marginalia.index.construction.PositionsFileConstructor;
|
||||
import nu.marginalia.index.journal.IndexJournal;
|
||||
@ -78,35 +77,48 @@ public class FullIndexConstructor {
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private FullPreindexReference construct(IndexJournalPage journalInstance, PositionsFileConstructor positionsFileConstructor) {
|
||||
return FullPreindex
|
||||
.constructPreindex(journalInstance, positionsFileConstructor, docIdRewriter, tmpDir)
|
||||
.closeToReference();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private FullPreindexReference merge(FullPreindexReference leftR, FullPreindexReference rightR) {
|
||||
|
||||
var left = leftR.open();
|
||||
var right = rightR.open();
|
||||
|
||||
try {
|
||||
return FullPreindex.merge(tmpDir, left, right).closeToReference();
|
||||
return FullPreindex
|
||||
.constructPreindex(journalInstance, positionsFileConstructor, docIdRewriter, tmpDir)
|
||||
.closeToReference();
|
||||
}
|
||||
finally {
|
||||
left.delete();
|
||||
right.delete();
|
||||
catch (IOException e) {
|
||||
logger.error("Error constructing preindex", e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private FullPreindexReference merge(FullPreindexReference leftR, FullPreindexReference rightR) {
|
||||
try {
|
||||
var left = leftR.open();
|
||||
var right = rightR.open();
|
||||
|
||||
try {
|
||||
return FullPreindex.merge(tmpDir, left, right).closeToReference();
|
||||
} finally {
|
||||
left.delete();
|
||||
right.delete();
|
||||
}
|
||||
}
|
||||
catch (IOException e) {
|
||||
logger.error("Error merging preindex", e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private void finalizeIndex(FullPreindexReference finalPR) {
|
||||
var finalP = finalPR.open();
|
||||
finalP.finalizeIndex(outputFileDocs, outputFileWords);
|
||||
finalP.delete();
|
||||
try {
|
||||
var finalP = finalPR.open();
|
||||
finalP.finalizeIndex(outputFileDocs, outputFileWords);
|
||||
finalP.delete();
|
||||
}
|
||||
catch (IOException e) {
|
||||
logger.error("Error finalizing index", e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.index.construction.full;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import nu.marginalia.index.construction.DocIdRewriter;
|
||||
@ -113,7 +112,6 @@ public class FullPreindexDocuments {
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private static void sortDocsFile(LongArray docsFileMap, FullPreindexWordSegments segments) {
|
||||
|
||||
var iter = segments.iterator(RECORD_SIZE_LONGS);
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.index.construction.prio;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.index.construction.DocIdRewriter;
|
||||
import nu.marginalia.index.journal.IndexJournal;
|
||||
import nu.marginalia.index.journal.IndexJournalPage;
|
||||
@ -73,35 +72,47 @@ public class PrioIndexConstructor {
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private PrioPreindexReference construct(IndexJournalPage journalInstance) {
|
||||
return PrioPreindex
|
||||
.constructPreindex(journalInstance, docIdRewriter, tmpDir)
|
||||
.closeToReference();
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private PrioPreindexReference merge(PrioPreindexReference leftR, PrioPreindexReference rightR) {
|
||||
|
||||
var left = leftR.open();
|
||||
var right = rightR.open();
|
||||
|
||||
try {
|
||||
return PrioPreindex.merge(tmpDir, left, right).closeToReference();
|
||||
return PrioPreindex
|
||||
.constructPreindex(journalInstance, docIdRewriter, tmpDir)
|
||||
.closeToReference();
|
||||
}
|
||||
finally {
|
||||
left.delete();
|
||||
right.delete();
|
||||
catch (IOException ex) {
|
||||
logger.error("Failed to construct preindex", ex);
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
private PrioPreindexReference merge(PrioPreindexReference leftR, PrioPreindexReference rightR) {
|
||||
try {
|
||||
var left = leftR.open();
|
||||
var right = rightR.open();
|
||||
|
||||
try {
|
||||
return PrioPreindex.merge(tmpDir, left, right).closeToReference();
|
||||
} finally {
|
||||
left.delete();
|
||||
right.delete();
|
||||
}
|
||||
}
|
||||
catch (IOException ex) {
|
||||
logger.error("Failed to merge preindex", ex);
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private void finalizeIndex(PrioPreindexReference finalPR) {
|
||||
var finalP = finalPR.open();
|
||||
finalP.finalizeIndex(outputFileDocs, outputFileWords);
|
||||
finalP.delete();
|
||||
try {
|
||||
var finalP = finalPR.open();
|
||||
finalP.finalizeIndex(outputFileDocs, outputFileWords);
|
||||
finalP.delete();
|
||||
}
|
||||
catch (IOException ex) {
|
||||
logger.error("Failed to finalize preindex", ex);
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.index.construction.prio;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import nu.marginalia.index.construction.DocIdRewriter;
|
||||
@ -97,7 +96,6 @@ public class PrioPreindexDocuments {
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private static void sortDocsFile(LongArray docsFileMap, PrioPreindexWordSegments segments) {
|
||||
|
||||
var iter = segments.iterator(RECORD_SIZE_LONGS);
|
||||
|
@ -7,7 +7,6 @@ import io.prometheus.client.Counter;
|
||||
import io.prometheus.client.Gauge;
|
||||
import io.prometheus.client.Histogram;
|
||||
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.api.searchquery.IndexApiGrpc;
|
||||
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
|
||||
import nu.marginalia.api.searchquery.RpcIndexQuery;
|
||||
@ -109,7 +108,6 @@ public class IndexGrpcService
|
||||
}
|
||||
|
||||
// GRPC endpoint
|
||||
@SneakyThrows
|
||||
public void query(RpcIndexQuery request,
|
||||
StreamObserver<RpcDecoratedResultItem> responseObserver) {
|
||||
|
||||
@ -157,9 +155,14 @@ public class IndexGrpcService
|
||||
|
||||
|
||||
// exists for test access
|
||||
@SneakyThrows
|
||||
List<RpcDecoratedResultItem> justQuery(SearchSpecification specsSet) {
|
||||
return executeSearch(new SearchParameters(specsSet, getSearchSet(specsSet)));
|
||||
try {
|
||||
return executeSearch(new SearchParameters(specsSet, getSearchSet(specsSet)));
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Error in handling request", ex);
|
||||
return List.of();
|
||||
}
|
||||
}
|
||||
|
||||
private SearchSet getSearchSet(SearchSpecification specsSet) {
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.ranking.domains.data;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||
@ -20,27 +19,32 @@ public class InvertedLinkGraphSource extends AbstractGraphSource {
|
||||
super(dataSource);
|
||||
this.graphClient = graphClient;
|
||||
}
|
||||
@SneakyThrows
|
||||
|
||||
@Override
|
||||
public Graph<Integer, ?> getGraph() {
|
||||
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
|
||||
try {
|
||||
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
|
||||
|
||||
addVertices(graph);
|
||||
addVertices(graph);
|
||||
|
||||
var allLinks = graphClient.getAllDomainLinks();
|
||||
var iter = allLinks.iterator();
|
||||
while (iter.advance()) {
|
||||
if (!graph.containsVertex(iter.dest())) {
|
||||
continue;
|
||||
}
|
||||
if (!graph.containsVertex(iter.source())) {
|
||||
continue;
|
||||
var allLinks = graphClient.getAllDomainLinks();
|
||||
var iter = allLinks.iterator();
|
||||
while (iter.advance()) {
|
||||
if (!graph.containsVertex(iter.dest())) {
|
||||
continue;
|
||||
}
|
||||
if (!graph.containsVertex(iter.source())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Invert the edge
|
||||
graph.addEdge(iter.dest(), iter.source());
|
||||
}
|
||||
|
||||
// Invert the edge
|
||||
graph.addEdge(iter.dest(), iter.source());
|
||||
return graph;
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
return graph;
|
||||
}
|
||||
}
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.ranking.domains.data;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.api.linkgraph.AggregateLinkGraphClient;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.graph.DefaultDirectedGraph;
|
||||
@ -18,26 +17,31 @@ public class LinkGraphSource extends AbstractGraphSource {
|
||||
this.graphClient = graphClient;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public Graph<Integer, ?> getGraph() {
|
||||
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
|
||||
try {
|
||||
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
|
||||
|
||||
addVertices(graph);
|
||||
addVertices(graph);
|
||||
|
||||
var allLinks = graphClient.getAllDomainLinks();
|
||||
var iter = allLinks.iterator();
|
||||
while (iter.advance()) {
|
||||
if (!graph.containsVertex(iter.dest())) {
|
||||
continue;
|
||||
}
|
||||
if (!graph.containsVertex(iter.source())) {
|
||||
continue;
|
||||
var allLinks = graphClient.getAllDomainLinks();
|
||||
var iter = allLinks.iterator();
|
||||
while (iter.advance()) {
|
||||
if (!graph.containsVertex(iter.dest())) {
|
||||
continue;
|
||||
}
|
||||
if (!graph.containsVertex(iter.source())) {
|
||||
continue;
|
||||
}
|
||||
|
||||
graph.addEdge(iter.source(), iter.dest());
|
||||
}
|
||||
|
||||
graph.addEdge(iter.source(), iter.dest());
|
||||
return graph;
|
||||
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
return graph;
|
||||
}
|
||||
}
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.ranking.domains.data;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import lombok.SneakyThrows;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.graph.DefaultUndirectedWeightedGraph;
|
||||
import org.jgrapht.graph.DefaultWeightedEdge;
|
||||
@ -35,14 +34,13 @@ public class SimilarityGraphSource extends AbstractGraphSource {
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public Graph<Integer, ?> getGraph() {
|
||||
Graph<Integer, ?> graph = new DefaultUndirectedWeightedGraph<>(DefaultWeightedEdge.class);
|
||||
|
||||
addVertices(graph);
|
||||
|
||||
try (var conn = dataSource.getConnection()) {
|
||||
addVertices(graph);
|
||||
|
||||
try (var stmt = conn.prepareStatement("""
|
||||
SELECT DOMAIN_ID, NEIGHBOR_ID, RELATEDNESS
|
||||
FROM EC_DOMAIN_NEIGHBORS_2
|
||||
@ -67,6 +65,9 @@ public class SimilarityGraphSource extends AbstractGraphSource {
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
return graph;
|
||||
}
|
||||
|
@ -311,7 +311,9 @@ public class CombinedIndexReaderTest {
|
||||
}
|
||||
|
||||
void load() throws IOException, SQLException, URISyntaxException {
|
||||
allData.forEach((doc, words) -> {
|
||||
for (Map.Entry<Long, List<MockDataKeyword>> entry : allData.entrySet()) {
|
||||
final Long doc = entry.getKey();
|
||||
final List<MockDataKeyword> words = entry.getValue();
|
||||
|
||||
var meta = metaByDoc.get(doc);
|
||||
|
||||
@ -320,7 +322,7 @@ public class CombinedIndexReaderTest {
|
||||
for (int i = 0; i < words.size(); i++) {
|
||||
metadata[i] = words.get(i).termMetadata;
|
||||
}
|
||||
var positions = words.stream().map(w -> w.positions).map(pos -> VarintCodedSequence.generate(pos.toIntArray())).toList();
|
||||
var positions = words.stream().map(w -> w.positions).map(pos -> VarintCodedSequence.generate(pos.toIntArray())).toList();
|
||||
|
||||
indexJournalWriter.put(doc,
|
||||
new SlopDocumentRecord.KeywordsProjection(
|
||||
@ -335,7 +337,7 @@ public class CombinedIndexReaderTest {
|
||||
new byte[0],
|
||||
List.of()
|
||||
));
|
||||
});
|
||||
}
|
||||
|
||||
var linkdbWriter = new DocumentDbWriter(
|
||||
IndexLocations.getLinkdbLivePath(fileStorageService).resolve(DOCDB_FILE_NAME)
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.index;
|
||||
|
||||
import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.IndexLocations;
|
||||
import nu.marginalia.api.searchquery.RpcDecoratedResultItem;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint;
|
||||
@ -378,8 +377,7 @@ public class IndexQueryServiceIntegrationSmokeTest {
|
||||
return UrlIdCodec.encodeId((32 - (id % 32)), id);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void loadData(DocumentDbWriter ldbw, int id) {
|
||||
public void loadData(DocumentDbWriter ldbw, int id) throws Exception {
|
||||
int[] factors = IntStream
|
||||
.rangeClosed(1, id)
|
||||
.filter(v -> (id % v) == 0)
|
||||
@ -423,8 +421,7 @@ public class IndexQueryServiceIntegrationSmokeTest {
|
||||
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void loadDataWithDomain(DocumentDbWriter ldbw, int domain, int id) {
|
||||
public void loadDataWithDomain(DocumentDbWriter ldbw, int domain, int id) throws Exception {
|
||||
int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
|
||||
long fullId = UrlIdCodec.encodeId(domain, id);
|
||||
|
||||
|
@ -532,8 +532,9 @@ public class IndexQueryServiceIntegrationTest {
|
||||
}
|
||||
|
||||
void load() throws IOException, SQLException, URISyntaxException {
|
||||
allData.forEach((doc, words) -> {
|
||||
|
||||
for (Map.Entry<Long, List<MockDataKeyword>> entry : allData.entrySet()) {
|
||||
Long doc = entry.getKey();
|
||||
List<MockDataKeyword> words = entry.getValue();
|
||||
var meta = metaByDoc.get(doc);
|
||||
|
||||
List<String> keywords = words.stream().map(w -> w.keyword).toList();
|
||||
@ -561,7 +562,7 @@ public class IndexQueryServiceIntegrationTest {
|
||||
new byte[0],
|
||||
List.of()
|
||||
));
|
||||
});
|
||||
}
|
||||
|
||||
var linkdbWriter = new DocumentDbWriter(
|
||||
IndexLocations.getLinkdbLivePath(fileStorageService).resolve(DOCDB_FILE_NAME)
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.ranking.domains;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import nu.marginalia.ranking.domains.data.GraphSource;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
@ -43,14 +42,14 @@ public class TestGraphSourceForInvertedLinkData implements GraphSource {
|
||||
return idToName.get(id);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public Graph<Integer, ?> getGraph() {
|
||||
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
|
||||
idToName = new HashMap<>();
|
||||
|
||||
try (var stream = Files
|
||||
.lines(domainDataPath)) {
|
||||
.lines(domainDataPath))
|
||||
{
|
||||
|
||||
stream.skip(1)
|
||||
.mapMultiToInt((line, c) -> {
|
||||
@ -65,6 +64,9 @@ public class TestGraphSourceForInvertedLinkData implements GraphSource {
|
||||
})
|
||||
.forEach(graph::addVertex);
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
for (var path : linksDataPaths) {
|
||||
try (var data = LongArrayFactory.mmapForReadingConfined(path)) {
|
||||
@ -80,8 +82,12 @@ public class TestGraphSourceForInvertedLinkData implements GraphSource {
|
||||
}
|
||||
});
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return graph;
|
||||
}
|
||||
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.ranking.domains;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
import nu.marginalia.ranking.domains.data.GraphSource;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
@ -44,7 +43,6 @@ public class TestGraphSourceForLinkData implements GraphSource {
|
||||
return idToName.get(id);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public Graph<Integer, ?> getGraph() {
|
||||
Graph<Integer, ?> graph = new DefaultDirectedGraph<>(DefaultEdge.class);
|
||||
@ -66,6 +64,9 @@ public class TestGraphSourceForLinkData implements GraphSource {
|
||||
})
|
||||
.forEach(graph::addVertex);
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
for (var path : linksDataPaths) {
|
||||
try (var data = LongArrayFactory.mmapForReadingConfined(path)) {
|
||||
@ -81,8 +82,12 @@ public class TestGraphSourceForLinkData implements GraphSource {
|
||||
}
|
||||
});
|
||||
}
|
||||
catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
return graph;
|
||||
}
|
||||
|
||||
|
@ -1,12 +1,12 @@
|
||||
package nu.marginalia.ranking.domains;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.ranking.domains.data.GraphSource;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.jgrapht.Graph;
|
||||
import org.jgrapht.graph.DefaultUndirectedWeightedGraph;
|
||||
import org.jgrapht.graph.DefaultWeightedEdge;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
@ -33,7 +33,6 @@ public class TestGraphSourceForSimilarityData implements GraphSource {
|
||||
return idToName.get(id);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public Graph<Integer, ?> getGraph() {
|
||||
Graph<Integer, ?> graph = new DefaultUndirectedWeightedGraph<>(DefaultWeightedEdge.class);
|
||||
@ -55,6 +54,9 @@ public class TestGraphSourceForSimilarityData implements GraphSource {
|
||||
})
|
||||
.forEach(graph::addVertex);
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
try (var stream = Files
|
||||
.lines(similarityDataPath)) {
|
||||
@ -71,6 +73,9 @@ public class TestGraphSourceForSimilarityData implements GraphSource {
|
||||
}
|
||||
});
|
||||
}
|
||||
catch (IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
return graph;
|
||||
}
|
||||
|
@ -148,7 +148,7 @@ public class SegmentLongArray implements LongArray {
|
||||
final int stride = 1024*1024*128; // Copy 1 GB at a time 'cause byte buffers are 'a byte buffering
|
||||
|
||||
if (source.size() / 8 < sourceStart + (arrayEnd - arrayStart)) {
|
||||
throw new IndexOutOfBoundsException(STR."Source channel too small: \{source.size()} < \{sourceStart + (arrayEnd - arrayStart)}");
|
||||
throw new IndexOutOfBoundsException("Source channel too small: " + source.size() + " < " + (sourceStart + (arrayEnd - arrayStart)));
|
||||
}
|
||||
|
||||
long ss = sourceStart;
|
||||
|
@ -1,12 +1,12 @@
|
||||
package nu.marginalia.util;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
import java.util.concurrent.*;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
@ -23,12 +23,16 @@ public class ProcessingIterator<T> implements Iterator<T> {
|
||||
|
||||
private T next = null;
|
||||
|
||||
@SneakyThrows
|
||||
ProcessingIterator(SimpleBlockingThreadPool pool, int queueSize, ProcessingJob<T> task) {
|
||||
queue = new LinkedBlockingQueue<>(queueSize);
|
||||
this.pool = pool;
|
||||
|
||||
pool.submit(() -> executeJob(task));
|
||||
try {
|
||||
pool.submit(() -> executeJob(task));
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.warn("Exception while processing", e);
|
||||
}
|
||||
}
|
||||
|
||||
public static Factory factory(int queueSize, int parallelism) {
|
||||
@ -45,15 +49,19 @@ public class ProcessingIterator<T> implements Iterator<T> {
|
||||
}
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private void executeTask(Task<T> task) {
|
||||
pool.submit(() -> {
|
||||
try {
|
||||
queue.put(task.get());
|
||||
} catch (Exception e) {
|
||||
logger.warn("Exception while processing", e);
|
||||
}
|
||||
});
|
||||
try {
|
||||
pool.submit(() -> {
|
||||
try {
|
||||
queue.put(task.get());
|
||||
} catch (Exception e) {
|
||||
logger.warn("Exception while processing", e);
|
||||
}
|
||||
});
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.warn("Exception while processing", e);
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns true if there are more documents to be processed.
|
||||
@ -63,17 +71,21 @@ public class ProcessingIterator<T> implements Iterator<T> {
|
||||
* (or synchronize between the two)
|
||||
*/
|
||||
@Override
|
||||
@SneakyThrows
|
||||
public boolean hasNext() {
|
||||
if (next != null)
|
||||
return true;
|
||||
|
||||
do {
|
||||
next = queue.poll(50, TimeUnit.MILLISECONDS);
|
||||
if (next != null) {
|
||||
return true;
|
||||
}
|
||||
} while (expectMore());
|
||||
try {
|
||||
do {
|
||||
next = queue.poll(50, TimeUnit.MILLISECONDS);
|
||||
if (next != null) {
|
||||
return true;
|
||||
}
|
||||
} while (expectMore());
|
||||
}
|
||||
catch (InterruptedException e) {
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
@ -96,7 +108,6 @@ public class ProcessingIterator<T> implements Iterator<T> {
|
||||
* <p>
|
||||
* If this is run after hasNext() returns false, a NoSuchElementException is thrown.
|
||||
*/
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public T next() {
|
||||
if (!hasNext()) {
|
||||
|
@ -7,7 +7,7 @@ import org.junit.jupiter.api.Test;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.Random;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
class BitWriterTest {
|
||||
|
||||
@ -52,7 +52,7 @@ class BitWriterTest {
|
||||
byte actual = out.get(0);
|
||||
byte expected = (byte) 0b1011_1110;
|
||||
|
||||
assertEquals(expected, actual, STR."was \{Integer.toBinaryString(actual & 0xFF)}");
|
||||
assertEquals(expected, actual, "was " + Integer.toBinaryString(actual & 0xFF));
|
||||
assertEquals(1, out.limit());
|
||||
}
|
||||
|
||||
@ -84,8 +84,8 @@ class BitWriterTest {
|
||||
byte expected1 = (byte) 0b1011_1110;
|
||||
byte expected2 = (byte) 0b1100_0000;
|
||||
|
||||
assertEquals(expected1, actual1, STR."was \{Integer.toBinaryString(actual1 & 0xFF)}");
|
||||
assertEquals(expected2, actual2, STR."was \{Integer.toBinaryString(actual2 & 0xFF)}");
|
||||
assertEquals(expected1, actual1, "was " + Integer.toBinaryString(actual1 & 0xFF));
|
||||
assertEquals(expected2, actual2, "was " + Integer.toBinaryString(actual2 & 0xFF));
|
||||
|
||||
}
|
||||
|
||||
@ -118,13 +118,13 @@ class BitWriterTest {
|
||||
byte actual1 = out.get(i);
|
||||
byte expected1 = (byte) 0b1011_1110;
|
||||
|
||||
assertEquals(expected1, actual1, STR."was \{Integer.toBinaryString(actual1 & 0xFF)}");
|
||||
assertEquals(expected1, actual1, "was " + Integer.toBinaryString(actual1 & 0xFF));
|
||||
}
|
||||
|
||||
byte actual2 = out.get(4);
|
||||
byte expected2 = (byte) 0b1100_0000;
|
||||
|
||||
assertEquals(expected2, actual2, STR."was \{Integer.toBinaryString(actual2 & 0xFF)}");
|
||||
assertEquals(expected2, actual2, "was " + Integer.toBinaryString(actual2 & 0xFF));
|
||||
|
||||
}
|
||||
|
||||
|
@ -2,7 +2,6 @@ package nu.marginalia.language.filter;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.language.encoding.UnicodeRanges;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
@ -45,7 +44,6 @@ public class LanguageFilter {
|
||||
}
|
||||
|
||||
@Inject
|
||||
@SneakyThrows
|
||||
public LanguageFilter(LanguageModels lm) {
|
||||
try {
|
||||
languagePredictionModel1 = new UngaBungaLanguagePredictionModel();
|
||||
|
@ -1,12 +1,9 @@
|
||||
package nu.marginalia.language.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
@AllArgsConstructor @Getter
|
||||
public class WordRep implements Comparable<WordRep> {
|
||||
|
||||
public WordRep(DocumentSentence sent, WordSpan span) {
|
||||
@ -22,6 +19,13 @@ public class WordRep implements Comparable<WordRep> {
|
||||
public final String stemmed;
|
||||
private final int hashCode;
|
||||
|
||||
public WordRep(int length, String word, String stemmed, int hashCode) {
|
||||
this.length = length;
|
||||
this.word = word;
|
||||
this.stemmed = stemmed;
|
||||
this.hashCode = hashCode;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(@NotNull WordRep o) {
|
||||
return word.compareTo(o.word);
|
||||
@ -43,4 +47,20 @@ public class WordRep implements Comparable<WordRep> {
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public int getLength() {
|
||||
return this.length;
|
||||
}
|
||||
|
||||
public String getWord() {
|
||||
return this.word;
|
||||
}
|
||||
|
||||
public String getStemmed() {
|
||||
return this.stemmed;
|
||||
}
|
||||
|
||||
public int getHashCode() {
|
||||
return this.hashCode;
|
||||
}
|
||||
}
|
||||
|
@ -1,17 +1,20 @@
|
||||
package nu.marginalia.language.model;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
@AllArgsConstructor @EqualsAndHashCode
|
||||
public class WordSpan implements Comparable<WordSpan>{
|
||||
public class WordSpan implements Comparable<WordSpan> {
|
||||
public final int start;
|
||||
public final int end;
|
||||
|
||||
public WordSpan(int start, int end) {
|
||||
this.start = start;
|
||||
this.end = end;
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return end - start;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(@NotNull WordSpan o) {
|
||||
return start - o.start;
|
||||
@ -30,8 +33,7 @@ public class WordSpan implements Comparable<WordSpan>{
|
||||
}
|
||||
if (start < other.start) {
|
||||
return end - other.start;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
return other.end - start;
|
||||
}
|
||||
|
||||
@ -40,4 +42,26 @@ public class WordSpan implements Comparable<WordSpan>{
|
||||
public String toString() {
|
||||
return String.format("WordSpan[%s,%s]", start, end);
|
||||
}
|
||||
|
||||
public boolean equals(final Object o) {
|
||||
if (o == this) return true;
|
||||
if (!(o instanceof WordSpan)) return false;
|
||||
final WordSpan other = (WordSpan) o;
|
||||
if (!other.canEqual((Object) this)) return false;
|
||||
if (this.start != other.start) return false;
|
||||
if (this.end != other.end) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
protected boolean canEqual(final Object other) {
|
||||
return other instanceof WordSpan;
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
final int PRIME = 59;
|
||||
int result = 1;
|
||||
result = result * PRIME + this.start;
|
||||
result = result * PRIME + this.end;
|
||||
return result;
|
||||
}
|
||||
}
|
@ -2,7 +2,6 @@ package nu.marginalia.language.sentence;
|
||||
|
||||
import com.github.datquocnguyen.RDRPOSTagger;
|
||||
import com.google.inject.Inject;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.language.model.DocumentLanguageData;
|
||||
import nu.marginalia.language.model.DocumentSentence;
|
||||
@ -46,7 +45,7 @@ public class SentenceExtractor {
|
||||
static final int MAX_SENTENCE_LENGTH = 250;
|
||||
static final int MAX_SENTENCE_COUNT = 1000;
|
||||
|
||||
@SneakyThrows @Inject
|
||||
@Inject
|
||||
public SentenceExtractor(LanguageModels models)
|
||||
{
|
||||
try (InputStream modelIn = new FileInputStream(models.openNLPSentenceDetectionData.toFile())) {
|
||||
|
@ -1,6 +1,9 @@
|
||||
package nu.marginalia.actor;
|
||||
|
||||
import nu.marginalia.actor.prototype.ActorPrototype;
|
||||
import nu.marginalia.actor.state.ActorResumeBehavior;
|
||||
import nu.marginalia.actor.state.ActorStateInstance;
|
||||
import nu.marginalia.actor.state.ActorStateTransition;
|
||||
import nu.marginalia.mq.MessageQueueFactory;
|
||||
import nu.marginalia.mq.MqMessage;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
@ -8,7 +11,6 @@ import nu.marginalia.mq.inbox.MqInboxResponse;
|
||||
import nu.marginalia.mq.inbox.MqSubscription;
|
||||
import nu.marginalia.mq.inbox.MqSynchronousInbox;
|
||||
import nu.marginalia.mq.outbox.MqOutbox;
|
||||
import nu.marginalia.actor.state.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -305,33 +307,38 @@ public class ActorStateMachine {
|
||||
return state;
|
||||
}
|
||||
|
||||
public void abortExecution() throws Exception {
|
||||
// Create a fake message to abort the execution
|
||||
// This helps make sense of the queue when debugging
|
||||
// and also permits the real termination message to have an
|
||||
// unique expected ID
|
||||
public void abortExecution() {
|
||||
try {
|
||||
// Create a fake message to abort the execution
|
||||
// This helps make sense of the queue when debugging
|
||||
// and also permits the real termination message to have an
|
||||
// unique expected ID
|
||||
|
||||
long abortMsgId = smOutbox.sendNotice(expectedMessage.id, "ABORT", "Aborting execution");
|
||||
long abortMsgId = smOutbox.sendNotice(expectedMessage.id, "ABORT", "Aborting execution");
|
||||
|
||||
// Set it as dead to clean up the queue from mystery ACK messages
|
||||
smOutbox.flagAsDead(abortMsgId);
|
||||
// Set it as dead to clean up the queue from mystery ACK messages
|
||||
smOutbox.flagAsDead(abortMsgId);
|
||||
|
||||
// Set the expected message to the abort message,
|
||||
// technically there's a slight chance of a race condition here,
|
||||
// which will cause this message to be ERR'd and the process to
|
||||
// continue, but it's very unlikely and the worst that can happen
|
||||
// is you have to abort twice.
|
||||
// Set the expected message to the abort message,
|
||||
// technically there's a slight chance of a race condition here,
|
||||
// which will cause this message to be ERR'd and the process to
|
||||
// continue, but it's very unlikely and the worst that can happen
|
||||
// is you have to abort twice.
|
||||
|
||||
expectedMessage = ExpectedMessage.expectId(abortMsgId);
|
||||
expectedMessage = ExpectedMessage.expectId(abortMsgId);
|
||||
|
||||
// Add a state transition to the monitor state, causing it to reset the state machine to the initial state
|
||||
// (or if no monitor state is defined, set it to the final state)
|
||||
smOutbox.sendNotice(abortMsgId, finalState.name(), "");
|
||||
// Add a state transition to the monitor state, causing it to reset the state machine to the initial state
|
||||
// (or if no monitor state is defined, set it to the final state)
|
||||
smOutbox.sendNotice(abortMsgId, finalState.name(), "");
|
||||
|
||||
// Dislodge the current task with an interrupt.
|
||||
// It's actually fine if we accidentally interrupt the wrong thread
|
||||
// (i.e. the abort task), since it shouldn't be doing anything interruptable
|
||||
smInbox.abortCurrentTask();
|
||||
// Dislodge the current task with an interrupt.
|
||||
// It's actually fine if we accidentally interrupt the wrong thread
|
||||
// (i.e. the abort task), since it shouldn't be doing anything interruptable
|
||||
smInbox.abortCurrentTask();
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Failed to abort execution", e);
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns true if there is an INITIAL state that requires no parameters */
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.mq.inbox;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.mq.MqMessage;
|
||||
import nu.marginalia.mq.MqMessageState;
|
||||
import nu.marginalia.mq.persistence.MqMessageHandlerRegistry;
|
||||
@ -67,8 +66,7 @@ public class MqSingleShotInbox {
|
||||
* @param predicate A predicate that must be true for the message to be stolen
|
||||
* @return The stolen message, or empty if no message was stolen
|
||||
*/
|
||||
@SneakyThrows
|
||||
public Optional<MqMessage> stealMessage(Predicate<MqMessage> predicate) {
|
||||
public Optional<MqMessage> stealMessage(Predicate<MqMessage> predicate) throws SQLException {
|
||||
for (var message : persistence.eavesdrop(inboxName, 5)) {
|
||||
if (predicate.test(message)) {
|
||||
persistence.changeOwner(message.msgId(), instanceUUID, -1);
|
||||
|
@ -1,8 +1,9 @@
|
||||
package nu.marginalia.term_frequency_dict;
|
||||
|
||||
import ca.rmen.porterstemmer.PorterStemmer;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.array.LongArray;
|
||||
import nu.marginalia.array.LongArrayFactory;
|
||||
@ -11,9 +12,7 @@ import org.jetbrains.annotations.NotNull;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import java.io.*;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
|
||||
@ -29,13 +28,11 @@ public class TermFrequencyDict {
|
||||
public static final long DOC_COUNT_KEY = ~0L;
|
||||
|
||||
@Inject
|
||||
public TermFrequencyDict(@NotNull LanguageModels models) {
|
||||
public TermFrequencyDict(@NotNull LanguageModels models) throws IOException {
|
||||
this(models.termFrequencies);
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public TermFrequencyDict(Path file) {
|
||||
|
||||
public TermFrequencyDict(Path file) throws IOException {
|
||||
wordRates = load(file);
|
||||
logger.info("Read {} N-grams frequencies", wordRates.size());
|
||||
}
|
||||
|
@ -31,8 +31,13 @@ public class DocumentKeywordExtractor {
|
||||
|
||||
// for tests
|
||||
public DocumentKeywordExtractor() {
|
||||
this.dict = new TermFrequencyDict(WmsaHome.getLanguageModels());
|
||||
this.keywordExtractor = new KeywordExtractor();
|
||||
try {
|
||||
this.dict = new TermFrequencyDict(WmsaHome.getLanguageModels());
|
||||
this.keywordExtractor = new KeywordExtractor();
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.keyword;
|
||||
|
||||
import lombok.Builder;
|
||||
import nu.marginalia.keyword.extractors.NameLikeKeywords;
|
||||
import nu.marginalia.keyword.extractors.SubjectLikeKeywords;
|
||||
import nu.marginalia.keyword.extractors.TitleKeywords;
|
||||
@ -14,19 +13,21 @@ class KeywordMetadata {
|
||||
private final SubjectLikeKeywords subjectLikeKeywords;
|
||||
private final UrlKeywords urlKeywords;
|
||||
|
||||
@Builder
|
||||
public KeywordMetadata(
|
||||
TitleKeywords titleKeywords,
|
||||
NameLikeKeywords nameLikeKeywords,
|
||||
SubjectLikeKeywords subjectLikeKeywords,
|
||||
UrlKeywords urlKeywords)
|
||||
{
|
||||
UrlKeywords urlKeywords) {
|
||||
this.titleKeywords = titleKeywords;
|
||||
this.nameLikeKeywords = nameLikeKeywords;
|
||||
this.subjectLikeKeywords = subjectLikeKeywords;
|
||||
this.urlKeywords = urlKeywords;
|
||||
}
|
||||
|
||||
public static KeywordMetadataBuilder builder() {
|
||||
return new KeywordMetadataBuilder();
|
||||
}
|
||||
|
||||
public byte getMetadataForWord(String stemmed) {
|
||||
|
||||
byte flags = 0;
|
||||
@ -54,4 +55,41 @@ class KeywordMetadata {
|
||||
return flags;
|
||||
}
|
||||
|
||||
public static class KeywordMetadataBuilder {
|
||||
private TitleKeywords titleKeywords;
|
||||
private NameLikeKeywords nameLikeKeywords;
|
||||
private SubjectLikeKeywords subjectLikeKeywords;
|
||||
private UrlKeywords urlKeywords;
|
||||
|
||||
KeywordMetadataBuilder() {
|
||||
}
|
||||
|
||||
public KeywordMetadataBuilder titleKeywords(TitleKeywords titleKeywords) {
|
||||
this.titleKeywords = titleKeywords;
|
||||
return this;
|
||||
}
|
||||
|
||||
public KeywordMetadataBuilder nameLikeKeywords(NameLikeKeywords nameLikeKeywords) {
|
||||
this.nameLikeKeywords = nameLikeKeywords;
|
||||
return this;
|
||||
}
|
||||
|
||||
public KeywordMetadataBuilder subjectLikeKeywords(SubjectLikeKeywords subjectLikeKeywords) {
|
||||
this.subjectLikeKeywords = subjectLikeKeywords;
|
||||
return this;
|
||||
}
|
||||
|
||||
public KeywordMetadataBuilder urlKeywords(UrlKeywords urlKeywords) {
|
||||
this.urlKeywords = urlKeywords;
|
||||
return this;
|
||||
}
|
||||
|
||||
public KeywordMetadata build() {
|
||||
return new KeywordMetadata(this.titleKeywords, this.nameLikeKeywords, this.subjectLikeKeywords, this.urlKeywords);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "KeywordMetadata.KeywordMetadataBuilder(titleKeywords=" + this.titleKeywords + ", nameLikeKeywords=" + this.nameLikeKeywords + ", subjectLikeKeywords=" + this.subjectLikeKeywords + ", urlKeywords=" + this.urlKeywords + ")";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -4,7 +4,6 @@ import gnu.trove.list.array.TByteArrayList;
|
||||
import it.unimi.dsi.fastutil.ints.IntArrayList;
|
||||
import it.unimi.dsi.fastutil.ints.IntList;
|
||||
import it.unimi.dsi.fastutil.objects.Object2ByteOpenHashMap;
|
||||
import lombok.Getter;
|
||||
import nu.marginalia.language.sentence.tag.HtmlTag;
|
||||
import nu.marginalia.model.idx.CodedWordSpan;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
@ -15,13 +14,14 @@ import org.slf4j.LoggerFactory;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.util.*;
|
||||
|
||||
@Getter
|
||||
public class DocumentKeywordsBuilder {
|
||||
public final Object2ByteOpenHashMap<String> wordToMeta;
|
||||
public final HashMap<String, IntList> wordToPos;
|
||||
public final Map<Character, List<DocumentWordSpan>> wordSpans = new HashMap<>();
|
||||
|
||||
/** These ware keywords that had signals of high relevance */
|
||||
/**
|
||||
* These ware keywords that had signals of high relevance
|
||||
*/
|
||||
public final Set<String> importantWords = new HashSet<>();
|
||||
|
||||
// |------64 letters is this long-------------------------------|
|
||||
@ -64,7 +64,7 @@ public class DocumentKeywordsBuilder {
|
||||
wordSpans.forEach((tag, spansForTag) -> {
|
||||
spansForTag.sort(Comparator.comparingInt(DocumentWordSpan::start));
|
||||
|
||||
var positionsForTag = new IntArrayList(spansForTag.size()*2);
|
||||
var positionsForTag = new IntArrayList(spansForTag.size() * 2);
|
||||
for (var span : spansForTag) {
|
||||
positionsForTag.add(span.start());
|
||||
positionsForTag.add(span.end());
|
||||
@ -77,7 +77,7 @@ public class DocumentKeywordsBuilder {
|
||||
}
|
||||
|
||||
public DocumentKeywordsBuilder(int capacity) {
|
||||
wordToMeta = new Object2ByteOpenHashMap<>(capacity);
|
||||
wordToMeta = new Object2ByteOpenHashMap<>(capacity);
|
||||
wordToPos = new HashMap<>(capacity);
|
||||
}
|
||||
|
||||
@ -101,7 +101,7 @@ public class DocumentKeywordsBuilder {
|
||||
|
||||
public void setFlagOnMetadataForWords(WordFlags flag, Collection<String> flagWords) {
|
||||
flagWords.forEach(word ->
|
||||
wordToMeta.mergeByte(word, flag.asBit(), (a, b) -> (byte)(a|b))
|
||||
wordToMeta.mergeByte(word, flag.asBit(), (a, b) -> (byte) (a | b))
|
||||
);
|
||||
}
|
||||
|
||||
@ -116,7 +116,7 @@ public class DocumentKeywordsBuilder {
|
||||
public List<String> getWordsWithAnyFlag(long flags) {
|
||||
List<String> ret = new ArrayList<>();
|
||||
|
||||
for (var iter = wordToMeta.object2ByteEntrySet().fastIterator(); iter.hasNext();) {
|
||||
for (var iter = wordToMeta.object2ByteEntrySet().fastIterator(); iter.hasNext(); ) {
|
||||
var entry = iter.next();
|
||||
if ((flags & entry.getByteValue()) != 0) {
|
||||
ret.add(entry.getKey());
|
||||
@ -159,6 +159,30 @@ public class DocumentKeywordsBuilder {
|
||||
return sb.append(']').toString();
|
||||
}
|
||||
|
||||
public Object2ByteOpenHashMap<String> getWordToMeta() {
|
||||
return this.wordToMeta;
|
||||
}
|
||||
|
||||
public HashMap<String, IntList> getWordToPos() {
|
||||
return this.wordToPos;
|
||||
}
|
||||
|
||||
public Map<Character, List<DocumentWordSpan>> getWordSpans() {
|
||||
return this.wordSpans;
|
||||
}
|
||||
|
||||
public Set<String> getImportantWords() {
|
||||
return this.importantWords;
|
||||
}
|
||||
|
||||
public int getMAX_WORD_LENGTH() {
|
||||
return this.MAX_WORD_LENGTH;
|
||||
}
|
||||
|
||||
public int getMAX_POSITIONS_PER_WORD() {
|
||||
return this.MAX_POSITIONS_PER_WORD;
|
||||
}
|
||||
|
||||
public record DocumentWordSpan(HtmlTag tag, int start, int end) {
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.keyword;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.language.sentence.SentenceExtractor;
|
||||
@ -13,6 +12,7 @@ import org.junit.jupiter.api.Tag;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Files;
|
||||
import java.util.*;
|
||||
import java.util.regex.Pattern;
|
||||
@ -25,8 +25,7 @@ class SentenceExtractorTest {
|
||||
|
||||
static SentenceExtractor se = new SentenceExtractor(lm);
|
||||
|
||||
@SneakyThrows
|
||||
public static void main(String... args) throws IOException {
|
||||
public static void main(String... args) throws IOException, URISyntaxException {
|
||||
final LanguageModels lm = TestLanguageModels.getLanguageModels();
|
||||
|
||||
var data = WmsaHome.getHomePath().resolve("test-data/");
|
||||
|
@ -1,7 +1,6 @@
|
||||
package nu.marginalia.keyword.extractors;
|
||||
|
||||
import com.google.common.collect.Sets;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
|
||||
import nu.marginalia.keyword.KeywordExtractor;
|
||||
@ -10,13 +9,14 @@ import nu.marginalia.test.util.TestLanguageModels;
|
||||
import org.jsoup.Jsoup;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.Charset;
|
||||
import java.util.Collections;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
class NameLikeKeywordsTest {
|
||||
String text = """
|
||||
@ -58,8 +58,7 @@ class NameLikeKeywordsTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testWikiArticle() {
|
||||
public void testWikiArticle() throws IOException {
|
||||
var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/java.html"),
|
||||
"Could not load word frequency table");
|
||||
String html = new String(resource.readAllBytes(), Charset.defaultCharset());
|
||||
@ -75,7 +74,6 @@ class NameLikeKeywordsTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
@SneakyThrows
|
||||
public void testWikiArticleP1() {
|
||||
String html = """
|
||||
<p><b>Java</b> is a high-level, class-based, object-oriented programming language that is designed to have as few implementation dependencies as possible. It is a general-purpose programming language intended to let programmers <i>write once, run anywhere</i> (WORA), meaning that compiled Java code can run on all platforms that support Java without the need to recompile. Java applications are typically compiled to bytecode that can run on any Java virtual machine (JVM) regardless of the underlying computer architecture. The syntax of Java is similar to C and C++, but has fewer low-level facilities than either of them. The Java runtime provides dynamic capabilities (such as reflection and runtime code modification) that are typically not available in traditional compiled languages. As of 2019 , Java was one of the most popular programming languages in use according to GitHub, particularly for client–server web applications, with a reported 9 million developers.</p>
|
||||
|
@ -7,6 +7,7 @@ import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
import nu.marginalia.test.util.TestLanguageModels;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
@ -41,7 +42,7 @@ class SubjectLikeKeywordsTest {
|
||||
""";
|
||||
|
||||
@Test
|
||||
public void test() {
|
||||
public void test() throws IOException {
|
||||
var lm = TestLanguageModels.getLanguageModels();
|
||||
var dict = new TermFrequencyDict(lm);
|
||||
|
||||
|
@ -363,7 +363,7 @@ public class ConverterMain extends ProcessMainClass {
|
||||
};
|
||||
}
|
||||
catch (Exception ex) {
|
||||
inbox.sendResponse(msg, MqInboxResponse.err(STR."\{ex.getClass().getSimpleName()}: \{ex.getMessage()}"));
|
||||
inbox.sendResponse(msg, MqInboxResponse.err(ex.getClass().getSimpleName() + ": " + ex.getMessage()));
|
||||
|
||||
throw ex;
|
||||
}
|
||||
|
@ -1,15 +1,12 @@
|
||||
package nu.marginalia.converting.model;
|
||||
|
||||
import lombok.Getter;
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.keyword.model.DocumentKeywordsBuilder;
|
||||
import nu.marginalia.model.crawl.UrlIndexingState;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.UrlIndexingState;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.util.OptionalDouble;
|
||||
|
||||
@ToString @Getter
|
||||
public class ProcessedDocument {
|
||||
public EdgeUrl url;
|
||||
|
||||
@ -41,4 +38,30 @@ public class ProcessedDocument {
|
||||
}
|
||||
return OptionalDouble.empty();
|
||||
}
|
||||
|
||||
public EdgeUrl getUrl() {
|
||||
return this.url;
|
||||
}
|
||||
|
||||
@Nullable
|
||||
public ProcessedDocumentDetails getDetails() {
|
||||
return this.details;
|
||||
}
|
||||
|
||||
@Nullable
|
||||
public DocumentKeywordsBuilder getWords() {
|
||||
return this.words;
|
||||
}
|
||||
|
||||
public UrlIndexingState getState() {
|
||||
return this.state;
|
||||
}
|
||||
|
||||
public String getStateReason() {
|
||||
return this.stateReason;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "ProcessedDocument(url=" + this.getUrl() + ", details=" + this.getDetails() + ", words=" + this.getWords() + ", state=" + this.getState() + ", stateReason=" + this.getStateReason() + ")";
|
||||
}
|
||||
}
|
||||
|
@ -1,16 +1,14 @@
|
||||
package nu.marginalia.converting.model;
|
||||
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import nu.marginalia.model.html.HtmlStandard;
|
||||
import nu.marginalia.model.idx.DocumentMetadata;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
@ToString
|
||||
public class ProcessedDocumentDetails {
|
||||
public String title;
|
||||
public String description;
|
||||
@ -31,4 +29,8 @@ public class ProcessedDocumentDetails {
|
||||
|
||||
public DocumentMetadata metadata;
|
||||
public GeneratorType generator;
|
||||
|
||||
public String toString() {
|
||||
return "ProcessedDocumentDetails(title=" + this.title + ", description=" + this.description + ", pubYear=" + this.pubYear + ", length=" + this.length + ", quality=" + this.quality + ", hashCode=" + this.hashCode + ", features=" + this.features + ", standard=" + this.standard + ", linksInternal=" + this.linksInternal + ", linksExternal=" + this.linksExternal + ", feedLinks=" + this.feedLinks + ", metadata=" + this.metadata + ", generator=" + this.generator + ")";
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.converting.model;
|
||||
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.converting.writer.ConverterBatchWritableIf;
|
||||
import nu.marginalia.converting.writer.ConverterBatchWriter;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
@ -11,8 +10,7 @@ import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
|
||||
@ToString
|
||||
public class ProcessedDomain implements ConverterBatchWritableIf {
|
||||
public class ProcessedDomain implements ConverterBatchWritableIf {
|
||||
public EdgeDomain domain;
|
||||
|
||||
public List<ProcessedDocument> documents;
|
||||
@ -21,8 +19,10 @@ public class ProcessedDomain implements ConverterBatchWritableIf {
|
||||
public String ip;
|
||||
|
||||
|
||||
/** Used by the sideloader to give advice on how many documents are crawled
|
||||
* without actually having to count (which would take forever) */
|
||||
/**
|
||||
* Used by the sideloader to give advice on how many documents are crawled
|
||||
* without actually having to count (which would take forever)
|
||||
*/
|
||||
@Nullable
|
||||
public Integer sizeloadSizeAdvice;
|
||||
|
||||
@ -41,5 +41,10 @@ public class ProcessedDomain implements ConverterBatchWritableIf {
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {}
|
||||
public void close() {
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "ProcessedDomain(domain=" + this.domain + ", documents=" + this.documents + ", state=" + this.state + ", redirect=" + this.redirect + ", ip=" + this.ip + ", sizeloadSizeAdvice=" + this.sizeloadSizeAdvice + ")";
|
||||
}
|
||||
}
|
||||
|
@ -1,7 +1,6 @@
|
||||
package nu.marginalia.converting.processor;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.atags.AnchorTextKeywords;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.atags.source.AnchorTagsSource;
|
||||
@ -164,57 +163,62 @@ public class DomainProcessor {
|
||||
}
|
||||
|
||||
|
||||
@SneakyThrows
|
||||
@Nullable
|
||||
public ProcessedDomain fullProcessing(SerializableCrawlDataStream dataStream) {
|
||||
if (!dataStream.hasNext()) {
|
||||
return null;
|
||||
}
|
||||
try {
|
||||
if (!dataStream.hasNext()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
List<ProcessedDocument> docs = new ArrayList<>();
|
||||
Set<String> processedUrls = new HashSet<>();
|
||||
List<ProcessedDocument> docs = new ArrayList<>();
|
||||
Set<String> processedUrls = new HashSet<>();
|
||||
|
||||
if (!(dataStream.next() instanceof CrawledDomain crawledDomain)) {
|
||||
throw new IllegalStateException("First record must be a domain, was " + dataStream.next().getClass().getSimpleName());
|
||||
}
|
||||
if (!(dataStream.next() instanceof CrawledDomain crawledDomain)) {
|
||||
throw new IllegalStateException("First record must be a domain, was " + dataStream.next().getClass().getSimpleName());
|
||||
}
|
||||
|
||||
DomainLinks externalDomainLinks = anchorTagsSource.getAnchorTags(crawledDomain.getDomain());
|
||||
DocumentDecorator documentDecorator = new DocumentDecorator();
|
||||
DomainLinks externalDomainLinks = anchorTagsSource.getAnchorTags(crawledDomain.getDomain());
|
||||
DocumentDecorator documentDecorator = new DocumentDecorator();
|
||||
|
||||
// Process Domain Record
|
||||
// Process Domain Record
|
||||
|
||||
ProcessedDomain ret = new ProcessedDomain();
|
||||
processDomain(crawledDomain, ret, documentDecorator);
|
||||
ret.documents = docs;
|
||||
ProcessedDomain ret = new ProcessedDomain();
|
||||
processDomain(crawledDomain, ret, documentDecorator);
|
||||
ret.documents = docs;
|
||||
|
||||
// Process Documents
|
||||
// Process Documents
|
||||
|
||||
try (var deduplicator = new LshDocumentDeduplicator()) {
|
||||
while (dataStream.hasNext()) {
|
||||
if (!(dataStream.next() instanceof CrawledDocument doc))
|
||||
continue;
|
||||
if (doc.url == null)
|
||||
continue;
|
||||
if (doc.documentBody.isBlank())
|
||||
continue;
|
||||
if (!processedUrls.add(doc.url))
|
||||
continue;
|
||||
try (var deduplicator = new LshDocumentDeduplicator()) {
|
||||
while (dataStream.hasNext()) {
|
||||
if (!(dataStream.next() instanceof CrawledDocument doc))
|
||||
continue;
|
||||
if (doc.url == null)
|
||||
continue;
|
||||
if (doc.documentBody.isBlank())
|
||||
continue;
|
||||
if (!processedUrls.add(doc.url))
|
||||
continue;
|
||||
|
||||
try {
|
||||
var processedDoc = documentProcessor.process(doc, ret.domain, externalDomainLinks, documentDecorator);
|
||||
deduplicator.markIfDuplicate(processedDoc);
|
||||
docs.add(processedDoc);
|
||||
} catch (Exception ex) {
|
||||
logger.warn("Failed to process " + doc.url, ex);
|
||||
try {
|
||||
var processedDoc = documentProcessor.process(doc, ret.domain, externalDomainLinks, documentDecorator);
|
||||
deduplicator.markIfDuplicate(processedDoc);
|
||||
docs.add(processedDoc);
|
||||
} catch (Exception ex) {
|
||||
logger.warn("Failed to process " + doc.url, ex);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Add late keywords and features from domain-level information
|
||||
|
||||
calculateStatistics(ret, externalDomainLinks);
|
||||
|
||||
return ret;
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.warn("Failed to process domain", ex);
|
||||
return null;
|
||||
}
|
||||
|
||||
// Add late keywords and features from domain-level information
|
||||
|
||||
calculateStatistics(ret, externalDomainLinks);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
private void processDomain(CrawledDomain crawledDomain,
|
||||
|
@ -1,13 +1,11 @@
|
||||
package nu.marginalia.converting.processor.logic.links;
|
||||
|
||||
import lombok.Getter;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
|
||||
import java.util.*;
|
||||
import java.util.function.BiConsumer;
|
||||
|
||||
@Getter
|
||||
public class LinkGraph {
|
||||
private final Map<EdgeUrl, Set<EdgeUrl>> graph = new HashMap<>(1000);
|
||||
|
||||
|
@ -4,7 +4,6 @@ import com.google.gson.Gson;
|
||||
import com.google.gson.GsonBuilder;
|
||||
import com.google.gson.JsonSyntaxException;
|
||||
import com.google.gson.annotations.SerializedName;
|
||||
import lombok.ToString;
|
||||
import nu.marginalia.converting.model.DocumentHeaders;
|
||||
import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel;
|
||||
import nu.marginalia.converting.processor.pubdate.PubDateHeuristic;
|
||||
@ -77,7 +76,6 @@ class JsonModel {
|
||||
List<JsonModelGraphItem> graph;
|
||||
}
|
||||
|
||||
@ToString
|
||||
class JsonModelGraphItem {
|
||||
@SerializedName("@type")
|
||||
public String type;
|
||||
@ -88,5 +86,9 @@ class JsonModelGraphItem {
|
||||
return "NewsArticle".equalsIgnoreCase(type)
|
||||
|| "Article".equalsIgnoreCase(type);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "JsonModelGraphItem(type=" + this.type + ", datePublished=" + this.datePublished + ")";
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,20 +1,62 @@
|
||||
package nu.marginalia.converting.sideload.dirtree;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.Setter;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@AllArgsConstructor
|
||||
@NoArgsConstructor
|
||||
@Setter
|
||||
@Getter
|
||||
class DirtreeSideloadSpec {
|
||||
public String name;
|
||||
public String domainName;
|
||||
public String dir;
|
||||
public String baseUrl;
|
||||
public List<String> keywords;
|
||||
|
||||
public DirtreeSideloadSpec(String name, String domainName, String dir, String baseUrl, List<String> keywords) {
|
||||
this.name = name;
|
||||
this.domainName = domainName;
|
||||
this.dir = dir;
|
||||
this.baseUrl = baseUrl;
|
||||
this.keywords = keywords;
|
||||
}
|
||||
|
||||
public DirtreeSideloadSpec() {
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return this.name;
|
||||
}
|
||||
|
||||
public String getDomainName() {
|
||||
return this.domainName;
|
||||
}
|
||||
|
||||
public String getDir() {
|
||||
return this.dir;
|
||||
}
|
||||
|
||||
public String getBaseUrl() {
|
||||
return this.baseUrl;
|
||||
}
|
||||
|
||||
public List<String> getKeywords() {
|
||||
return this.keywords;
|
||||
}
|
||||
|
||||
public void setName(String name) {
|
||||
this.name = name;
|
||||
}
|
||||
|
||||
public void setDomainName(String domainName) {
|
||||
this.domainName = domainName;
|
||||
}
|
||||
|
||||
public void setDir(String dir) {
|
||||
this.dir = dir;
|
||||
}
|
||||
|
||||
public void setBaseUrl(String baseUrl) {
|
||||
this.baseUrl = baseUrl;
|
||||
}
|
||||
|
||||
public void setKeywords(List<String> keywords) {
|
||||
this.keywords = keywords;
|
||||
}
|
||||
}
|
||||
|
@ -1,14 +1,22 @@
|
||||
package nu.marginalia.converting.sideload.dirtree;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Getter;
|
||||
import lombok.NoArgsConstructor;
|
||||
import lombok.Setter;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
@AllArgsConstructor @NoArgsConstructor
|
||||
@Setter @Getter
|
||||
class DirtreeSideloadSpecList {
|
||||
public List<DirtreeSideloadSpec> sources;
|
||||
|
||||
public DirtreeSideloadSpecList(List<DirtreeSideloadSpec> sources) {
|
||||
this.sources = sources;
|
||||
}
|
||||
|
||||
public DirtreeSideloadSpecList() {
|
||||
}
|
||||
|
||||
public List<DirtreeSideloadSpec> getSources() {
|
||||
return this.sources;
|
||||
}
|
||||
|
||||
public void setSources(List<DirtreeSideloadSpec> sources) {
|
||||
this.sources = sources;
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,5 @@
|
||||
package nu.marginalia.converting.sideload.dirtree;
|
||||
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.converting.model.GeneratorType;
|
||||
import nu.marginalia.converting.model.ProcessedDocument;
|
||||
@ -13,6 +12,7 @@ import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.time.LocalDate;
|
||||
@ -72,24 +72,28 @@ public class DirtreeSideloader implements SideloadSource, AutoCloseable {
|
||||
return name.endsWith(".html") || name.endsWith(".htm");
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
private ProcessedDocument process(Path path) {
|
||||
String body = Files.readString(path);
|
||||
String url = urlBase + dirBase.relativize(path);
|
||||
try {
|
||||
String body = Files.readString(path);
|
||||
String url = urlBase + dirBase.relativize(path);
|
||||
|
||||
// We trim "/index.html"-suffixes from the index if they are present,
|
||||
// since this is typically an artifact from document retrieval
|
||||
if (url.endsWith("/index.html")) {
|
||||
url = url.substring(0, url.length() - "index.html".length());
|
||||
// We trim "/index.html"-suffixes from the index if they are present,
|
||||
// since this is typically an artifact from document retrieval
|
||||
if (url.endsWith("/index.html")) {
|
||||
url = url.substring(0, url.length() - "index.html".length());
|
||||
}
|
||||
|
||||
return sideloaderProcessing
|
||||
.processDocument(url, body, extraKeywords, new DomainLinks(),
|
||||
GeneratorType.DOCS,
|
||||
DocumentClass.NORMAL,
|
||||
new LinkTexts(),
|
||||
LocalDate.now().getYear(),
|
||||
10_000);
|
||||
}
|
||||
catch (IOException | URISyntaxException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
|
||||
return sideloaderProcessing
|
||||
.processDocument(url, body, extraKeywords, new DomainLinks(),
|
||||
GeneratorType.DOCS,
|
||||
DocumentClass.NORMAL,
|
||||
new LinkTexts(),
|
||||
LocalDate.now().getYear(),
|
||||
10_000);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -3,7 +3,6 @@ package nu.marginalia.converting.sideload.encyclopedia;
|
||||
import com.github.luben.zstd.ZstdInputStream;
|
||||
import com.google.common.base.Charsets;
|
||||
import com.google.gson.Gson;
|
||||
import lombok.SneakyThrows;
|
||||
import nu.marginalia.atags.AnchorTextKeywords;
|
||||
import nu.marginalia.atags.model.DomainLinks;
|
||||
import nu.marginalia.atags.source.AnchorTagsSourceFactory;
|
||||
@ -78,7 +77,6 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
|
||||
return ret;
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
@Override
|
||||
public Iterator<ProcessedDocument> getDocumentsStream() {
|
||||
// This leaks a thread pool, but it doesn't matter since this is a one-off process
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user