mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00
Merge pull request #127 from MarginaliaSearch/serp-redesign
Web UI redesign
This commit is contained in:
commit
be6382e0d0
1
.gitignore
vendored
1
.gitignore
vendored
@ -7,3 +7,4 @@ build/
|
||||
lombok.config
|
||||
Dockerfile
|
||||
run
|
||||
jte-classes
|
@ -48,6 +48,7 @@ ext {
|
||||
dockerImageTag='latest'
|
||||
dockerImageRegistry='marginalia'
|
||||
jibVersion = '3.4.3'
|
||||
|
||||
}
|
||||
|
||||
idea {
|
||||
|
@ -28,7 +28,7 @@ public class DbDomainQueries {
|
||||
}
|
||||
|
||||
|
||||
public Integer getDomainId(EdgeDomain domain) {
|
||||
public Integer getDomainId(EdgeDomain domain) throws NoSuchElementException {
|
||||
try (var connection = dataSource.getConnection()) {
|
||||
|
||||
return domainIdCache.get(domain, () -> {
|
||||
@ -42,6 +42,9 @@ public class DbDomainQueries {
|
||||
throw new NoSuchElementException();
|
||||
});
|
||||
}
|
||||
catch (UncheckedExecutionException ex) {
|
||||
throw new NoSuchElementException();
|
||||
}
|
||||
catch (ExecutionException ex) {
|
||||
throw new RuntimeException(ex.getCause());
|
||||
}
|
||||
|
@ -42,6 +42,12 @@ dependencies {
|
||||
implementation libs.bundles.curator
|
||||
implementation libs.bundles.flyway
|
||||
|
||||
libs.bundles.jooby.get().each {
|
||||
implementation dependencies.create(it) {
|
||||
exclude group: 'org.slf4j'
|
||||
}
|
||||
}
|
||||
|
||||
testImplementation libs.bundles.slf4j.test
|
||||
implementation libs.bundles.mariadb
|
||||
|
||||
|
@ -0,0 +1,178 @@
|
||||
package nu.marginalia.service.server;
|
||||
|
||||
import io.jooby.*;
|
||||
import io.prometheus.client.Counter;
|
||||
import nu.marginalia.mq.inbox.MqInboxIf;
|
||||
import nu.marginalia.service.client.ServiceNotAvailableException;
|
||||
import nu.marginalia.service.discovery.property.ServiceEndpoint;
|
||||
import nu.marginalia.service.discovery.property.ServiceKey;
|
||||
import nu.marginalia.service.discovery.property.ServicePartition;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import nu.marginalia.service.server.jte.JteModule;
|
||||
import nu.marginalia.service.server.mq.ServiceMqSubscription;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.slf4j.Marker;
|
||||
import org.slf4j.MarkerFactory;
|
||||
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.List;
|
||||
|
||||
public class JoobyService {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
// Marker for filtering out sensitive content from the persistent logs
|
||||
private final Marker httpMarker = MarkerFactory.getMarker("HTTP");
|
||||
|
||||
private final Initialization initialization;
|
||||
|
||||
private final static Counter request_counter = Counter.build("wmsa_request_counter", "Request Counter")
|
||||
.labelNames("service", "node")
|
||||
.register();
|
||||
private final static Counter request_counter_good = Counter.build("wmsa_request_counter_good", "Good Requests")
|
||||
.labelNames("service", "node")
|
||||
.register();
|
||||
private final static Counter request_counter_bad = Counter.build("wmsa_request_counter_bad", "Bad Requests")
|
||||
.labelNames("service", "node")
|
||||
.register();
|
||||
private final static Counter request_counter_err = Counter.build("wmsa_request_counter_err", "Error Requests")
|
||||
.labelNames("service", "node")
|
||||
.register();
|
||||
private final String serviceName;
|
||||
private static volatile boolean initialized = false;
|
||||
|
||||
protected final MqInboxIf messageQueueInbox;
|
||||
private final int node;
|
||||
private GrpcServer grpcServer;
|
||||
|
||||
private ServiceConfiguration config;
|
||||
private final List<MvcExtension> joobyServices;
|
||||
private final ServiceEndpoint restEndpoint;
|
||||
|
||||
public JoobyService(BaseServiceParams params,
|
||||
ServicePartition partition,
|
||||
List<DiscoverableService> grpcServices,
|
||||
List<MvcExtension> joobyServices
|
||||
) throws Exception {
|
||||
|
||||
this.joobyServices = joobyServices;
|
||||
this.initialization = params.initialization;
|
||||
config = params.configuration;
|
||||
node = config.node();
|
||||
|
||||
String inboxName = config.serviceName();
|
||||
logger.info("Inbox name: {}", inboxName);
|
||||
|
||||
var serviceRegistry = params.serviceRegistry;
|
||||
|
||||
restEndpoint = serviceRegistry.registerService(ServiceKey.forRest(config.serviceId(), config.node()),
|
||||
config.instanceUuid(), config.externalAddress());
|
||||
|
||||
var mqInboxFactory = params.messageQueueInboxFactory;
|
||||
messageQueueInbox = mqInboxFactory.createSynchronousInbox(inboxName, config.node(), config.instanceUuid());
|
||||
messageQueueInbox.subscribe(new ServiceMqSubscription(this));
|
||||
|
||||
serviceName = System.getProperty("service-name");
|
||||
|
||||
initialization.addCallback(params.heartbeat::start);
|
||||
initialization.addCallback(messageQueueInbox::start);
|
||||
initialization.addCallback(() -> params.eventLog.logEvent("SVC-INIT", serviceName + ":" + config.node()));
|
||||
initialization.addCallback(() -> serviceRegistry.announceInstance(config.instanceUuid()));
|
||||
|
||||
Thread.setDefaultUncaughtExceptionHandler((t, e) -> {
|
||||
if (e instanceof ServiceNotAvailableException) {
|
||||
// reduce log spam for this common case
|
||||
logger.error("Service not available: {}", e.getMessage());
|
||||
}
|
||||
else {
|
||||
logger.error("Uncaught exception", e);
|
||||
}
|
||||
request_counter_err.labels(serviceName, Integer.toString(node)).inc();
|
||||
});
|
||||
|
||||
if (!initialization.isReady() && ! initialized ) {
|
||||
initialized = true;
|
||||
grpcServer = new GrpcServer(config, serviceRegistry, partition, grpcServices);
|
||||
grpcServer.start();
|
||||
}
|
||||
}
|
||||
|
||||
public void startJooby(Jooby jooby) {
|
||||
|
||||
logger.info("{} Listening to {}:{} ({})", getClass().getSimpleName(),
|
||||
restEndpoint.host(),
|
||||
restEndpoint.port(),
|
||||
config.externalAddress());
|
||||
|
||||
// FIXME: This won't work outside of docker, may need to submit a PR to jooby to allow classpaths here
|
||||
jooby.install(new JteModule(Path.of("/app/resources/jte"), Path.of("/app/classes/jte-precompiled")));
|
||||
jooby.assets("/*", Paths.get("/app/resources/static"));
|
||||
|
||||
var options = new ServerOptions();
|
||||
options.setHost(config.bindAddress());
|
||||
options.setPort(restEndpoint.port());
|
||||
|
||||
// Enable gzip compression of response data, but set compression to the lowest level
|
||||
// since it doesn't really save much more space to dial it up. It's typically a
|
||||
// single digit percentage difference since HTML already compresses very well with level = 1.
|
||||
options.setCompressionLevel(1);
|
||||
|
||||
|
||||
jooby.setServerOptions(options);
|
||||
|
||||
jooby.get("/internal/ping", ctx -> "pong");
|
||||
jooby.get("/internal/started", this::isInitialized);
|
||||
jooby.get("/internal/ready", this::isReady);
|
||||
|
||||
for (var service : joobyServices) {
|
||||
jooby.mvc(service);
|
||||
}
|
||||
|
||||
jooby.before(this::auditRequestIn);
|
||||
jooby.after(this::auditRequestOut);
|
||||
}
|
||||
|
||||
private Object isInitialized(Context ctx) {
|
||||
if (initialization.isReady()) {
|
||||
return "ok";
|
||||
}
|
||||
else {
|
||||
ctx.setResponseCode(StatusCode.FAILED_DEPENDENCY_CODE);
|
||||
return "bad";
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isReady() {
|
||||
return true;
|
||||
}
|
||||
|
||||
private String isReady(Context ctx) {
|
||||
if (isReady()) {
|
||||
return "ok";
|
||||
}
|
||||
else {
|
||||
ctx.setResponseCode(StatusCode.FAILED_DEPENDENCY_CODE);
|
||||
return "bad";
|
||||
}
|
||||
}
|
||||
|
||||
private void auditRequestIn(Context ctx) {
|
||||
request_counter.labels(serviceName, Integer.toString(node)).inc();
|
||||
}
|
||||
|
||||
private void auditRequestOut(Context ctx, Object result, Throwable failure) {
|
||||
if (ctx.getResponseCode().value() < 400) {
|
||||
request_counter_good.labels(serviceName, Integer.toString(node)).inc();
|
||||
}
|
||||
else {
|
||||
request_counter_bad.labels(serviceName, Integer.toString(node)).inc();
|
||||
}
|
||||
|
||||
if (failure != null) {
|
||||
logger.error("Request failed " + ctx.getMethod() + " " + ctx.getRequestURL(), failure);
|
||||
request_counter_err.labels(serviceName, Integer.toString(node)).inc();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -16,7 +16,7 @@ import spark.Spark;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class Service {
|
||||
public class SparkService {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
// Marker for filtering out sensitive content from the persistent logs
|
||||
@ -43,7 +43,7 @@ public class Service {
|
||||
private final int node;
|
||||
private GrpcServer grpcServer;
|
||||
|
||||
public Service(BaseServiceParams params,
|
||||
public SparkService(BaseServiceParams params,
|
||||
Runnable configureStaticFiles,
|
||||
ServicePartition partition,
|
||||
List<DiscoverableService> grpcServices) throws Exception {
|
||||
@ -126,18 +126,18 @@ public class Service {
|
||||
}
|
||||
}
|
||||
|
||||
public Service(BaseServiceParams params,
|
||||
public SparkService(BaseServiceParams params,
|
||||
ServicePartition partition,
|
||||
List<DiscoverableService> grpcServices) throws Exception {
|
||||
this(params,
|
||||
Service::defaultSparkConfig,
|
||||
SparkService::defaultSparkConfig,
|
||||
partition,
|
||||
grpcServices);
|
||||
}
|
||||
|
||||
public Service(BaseServiceParams params) throws Exception {
|
||||
public SparkService(BaseServiceParams params) throws Exception {
|
||||
this(params,
|
||||
Service::defaultSparkConfig,
|
||||
SparkService::defaultSparkConfig,
|
||||
ServicePartition.any(),
|
||||
List.of());
|
||||
}
|
@ -0,0 +1,61 @@
|
||||
package nu.marginalia.service.server.jte;
|
||||
|
||||
import edu.umd.cs.findbugs.annotations.NonNull;
|
||||
import edu.umd.cs.findbugs.annotations.Nullable;
|
||||
import gg.jte.ContentType;
|
||||
import gg.jte.TemplateEngine;
|
||||
import gg.jte.resolve.DirectoryCodeResolver;
|
||||
import io.jooby.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.nio.file.Path;
|
||||
import java.util.List;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
// Temporary workaround for a bug
|
||||
// APL-2.0 https://github.com/jooby-project/jooby
|
||||
public class JteModule implements Extension {
|
||||
private Path sourceDirectory;
|
||||
private Path classDirectory;
|
||||
private TemplateEngine templateEngine;
|
||||
|
||||
public JteModule(@NonNull Path sourceDirectory, @NonNull Path classDirectory) {
|
||||
this.sourceDirectory = (Path)Objects.requireNonNull(sourceDirectory, "Source directory is required.");
|
||||
this.classDirectory = (Path)Objects.requireNonNull(classDirectory, "Class directory is required.");
|
||||
}
|
||||
|
||||
public JteModule(@NonNull Path sourceDirectory) {
|
||||
this.sourceDirectory = (Path)Objects.requireNonNull(sourceDirectory, "Source directory is required.");
|
||||
}
|
||||
|
||||
public JteModule(@NonNull TemplateEngine templateEngine) {
|
||||
this.templateEngine = (TemplateEngine)Objects.requireNonNull(templateEngine, "Template engine is required.");
|
||||
}
|
||||
|
||||
public void install(@NonNull Jooby application) {
|
||||
if (this.templateEngine == null) {
|
||||
this.templateEngine = create(application.getEnvironment(), this.sourceDirectory, this.classDirectory);
|
||||
}
|
||||
|
||||
ServiceRegistry services = application.getServices();
|
||||
services.put(TemplateEngine.class, this.templateEngine);
|
||||
application.encoder(MediaType.html, new JteTemplateEngine(this.templateEngine));
|
||||
}
|
||||
|
||||
public static TemplateEngine create(@NonNull Environment environment, @NonNull Path sourceDirectory, @Nullable Path classDirectory) {
|
||||
boolean dev = environment.isActive("dev", new String[]{"test"});
|
||||
if (dev) {
|
||||
Objects.requireNonNull(sourceDirectory, "Source directory is required.");
|
||||
Path requiredClassDirectory = (Path)Optional.ofNullable(classDirectory).orElseGet(() -> sourceDirectory.resolve("jte-classes"));
|
||||
TemplateEngine engine = TemplateEngine.create(new DirectoryCodeResolver(sourceDirectory), requiredClassDirectory, ContentType.Html, environment.getClassLoader());
|
||||
Optional<List<String>> var10000 = Optional.ofNullable(System.getProperty("jooby.run.classpath")).map((it) -> it.split(File.pathSeparator)).map(Stream::of).map(Stream::toList);
|
||||
Objects.requireNonNull(engine);
|
||||
var10000.ifPresent(engine::setClassPath);
|
||||
return engine;
|
||||
} else {
|
||||
return classDirectory == null ? TemplateEngine.createPrecompiled(ContentType.Html) : TemplateEngine.createPrecompiled(classDirectory, ContentType.Html);
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,48 @@
|
||||
package nu.marginalia.service.server.jte;
|
||||
|
||||
import edu.umd.cs.findbugs.annotations.NonNull;
|
||||
import gg.jte.TemplateEngine;
|
||||
import io.jooby.Context;
|
||||
import io.jooby.MapModelAndView;
|
||||
import io.jooby.ModelAndView;
|
||||
import io.jooby.buffer.DataBuffer;
|
||||
import io.jooby.internal.jte.DataBufferOutput;
|
||||
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
||||
// Temporary workaround for a bug
|
||||
// APL-2.0 https://github.com/jooby-project/jooby
|
||||
class JteTemplateEngine implements io.jooby.TemplateEngine {
|
||||
private final TemplateEngine jte;
|
||||
private final List<String> extensions;
|
||||
|
||||
public JteTemplateEngine(TemplateEngine jte) {
|
||||
this.jte = jte;
|
||||
this.extensions = List.of(".jte", ".kte");
|
||||
}
|
||||
|
||||
|
||||
@NonNull @Override
|
||||
public List<String> extensions() {
|
||||
return extensions;
|
||||
}
|
||||
|
||||
@Override
|
||||
public DataBuffer render(Context ctx, ModelAndView modelAndView) {
|
||||
var buffer = ctx.getBufferFactory().allocateBuffer();
|
||||
var output = new DataBufferOutput(buffer, StandardCharsets.UTF_8);
|
||||
var attributes = ctx.getAttributes();
|
||||
if (modelAndView instanceof MapModelAndView mapModelAndView) {
|
||||
var mapModel = new HashMap<String, Object>();
|
||||
mapModel.putAll(attributes);
|
||||
mapModel.putAll(mapModelAndView.getModel());
|
||||
jte.render(modelAndView.getView(), mapModel, output);
|
||||
} else {
|
||||
jte.render(modelAndView.getView(), modelAndView.getModel(), output);
|
||||
}
|
||||
|
||||
return buffer;
|
||||
}
|
||||
}
|
@ -3,7 +3,6 @@ package nu.marginalia.service.server.mq;
|
||||
import nu.marginalia.mq.MqMessage;
|
||||
import nu.marginalia.mq.inbox.MqInboxResponse;
|
||||
import nu.marginalia.mq.inbox.MqSubscription;
|
||||
import nu.marginalia.service.server.Service;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -15,10 +14,10 @@ import java.util.Map;
|
||||
public class ServiceMqSubscription implements MqSubscription {
|
||||
private static final Logger logger = LoggerFactory.getLogger(ServiceMqSubscription.class);
|
||||
private final Map<String, Method> requests = new HashMap<>();
|
||||
private final Service service;
|
||||
private final Object service;
|
||||
|
||||
|
||||
public ServiceMqSubscription(Service service) {
|
||||
public ServiceMqSubscription(Object service) {
|
||||
this.service = service;
|
||||
|
||||
/* Wire up all methods annotated with @MqRequest and @MqNotification
|
||||
|
@ -6,4 +6,8 @@ public record BrowseResultSet(Collection<BrowseResult> results, String focusDoma
|
||||
/** Creates a result set with an empty string as the focus domain. */
public BrowseResultSet(Collection<BrowseResult> results) {
    this(results, "");
}
|
||||
|
||||
public boolean hasFocusDomain() {
|
||||
return focusDomain != null && !focusDomain.isBlank();
|
||||
}
|
||||
}
|
||||
|
@ -38,6 +38,7 @@ public class DomainsProtobufCodec {
|
||||
sd.getIndexed(),
|
||||
sd.getActive(),
|
||||
sd.getScreenshot(),
|
||||
sd.getFeed(),
|
||||
SimilarDomain.LinkType.valueOf(sd.getLinkType().name())
|
||||
);
|
||||
}
|
||||
|
@ -71,6 +71,23 @@ public class DomainInformation {
|
||||
return new String(Character.toChars(firstChar)) + new String(Character.toChars(secondChar));
|
||||
}
|
||||
|
||||
public String getAsnFlag() {
|
||||
if (asnCountry == null || asnCountry.codePointCount(0, asnCountry.length()) != 2) {
|
||||
return "";
|
||||
}
|
||||
String country = asnCountry;
|
||||
|
||||
if ("UK".equals(country)) {
|
||||
country = "GB";
|
||||
}
|
||||
|
||||
int offset = 0x1F1E6;
|
||||
int asciiOffset = 0x41;
|
||||
int firstChar = Character.codePointAt(country, 0) - asciiOffset + offset;
|
||||
int secondChar = Character.codePointAt(country, 1) - asciiOffset + offset;
|
||||
return new String(Character.toChars(firstChar)) + new String(Character.toChars(secondChar));
|
||||
}
|
||||
|
||||
public EdgeDomain getDomain() {
|
||||
return this.domain;
|
||||
}
|
||||
|
@ -9,6 +9,7 @@ public record SimilarDomain(EdgeUrl url,
|
||||
boolean indexed,
|
||||
boolean active,
|
||||
boolean screenshot,
|
||||
boolean feed,
|
||||
LinkType linkType) {
|
||||
|
||||
public String getRankSymbols() {
|
||||
@ -52,12 +53,12 @@ public record SimilarDomain(EdgeUrl url,
|
||||
return NONE;
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
public String faIcon() {
|
||||
return switch (this) {
|
||||
case FOWARD -> "→";
|
||||
case BACKWARD -> "←";
|
||||
case BIDIRECTIONAL -> "⇆";
|
||||
case NONE -> "-";
|
||||
case FOWARD -> "fa-solid fa-arrow-right";
|
||||
case BACKWARD -> "fa-solid fa-arrow-left";
|
||||
case BIDIRECTIONAL -> "fa-solid fa-arrow-right-arrow-left";
|
||||
case NONE -> "";
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -7,4 +7,8 @@ public record DictionaryResponse(String word, List<DictionaryEntry> entries) {
|
||||
this.word = word;
|
||||
this.entries = entries.stream().toList(); // Make an immutable copy
|
||||
}
|
||||
|
||||
public boolean hasEntries() {
|
||||
return !entries.isEmpty();
|
||||
}
|
||||
}
|
||||
|
@ -11,7 +11,7 @@ import nu.marginalia.api.svc.RateLimiterService;
|
||||
import nu.marginalia.api.svc.ResponseCache;
|
||||
import nu.marginalia.model.gson.GsonFactory;
|
||||
import nu.marginalia.service.server.BaseServiceParams;
|
||||
import nu.marginalia.service.server.Service;
|
||||
import nu.marginalia.service.server.SparkService;
|
||||
import nu.marginalia.service.server.mq.MqRequest;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
@ -21,7 +21,7 @@ import spark.Request;
|
||||
import spark.Response;
|
||||
import spark.Spark;
|
||||
|
||||
public class ApiService extends Service {
|
||||
public class ApiService extends SparkService {
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final Gson gson = GsonFactory.get();
|
||||
@ -69,7 +69,7 @@ public class ApiService extends Service {
|
||||
this.searchOperator = searchOperator;
|
||||
|
||||
Spark.get("/api/", (rq, rsp) -> {
|
||||
rsp.redirect("https://memex.marginalia.nu/projects/edge/api.gmi");
|
||||
rsp.redirect("https://about.marginalia-search.com/article/api/");
|
||||
return "";
|
||||
});
|
||||
|
||||
|
@ -9,7 +9,7 @@ import nu.marginalia.renderer.MustacheRenderer;
|
||||
import nu.marginalia.renderer.RendererFactory;
|
||||
import nu.marginalia.screenshot.ScreenshotService;
|
||||
import nu.marginalia.service.server.BaseServiceParams;
|
||||
import nu.marginalia.service.server.Service;
|
||||
import nu.marginalia.service.server.SparkService;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
@ -18,7 +18,7 @@ import spark.Spark;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
public class DatingService extends Service {
|
||||
public class DatingService extends SparkService {
|
||||
private final DomainBlacklist blacklist;
|
||||
private final DbBrowseDomainsSimilarCosine browseSimilarCosine;
|
||||
private final DbBrowseDomainsRandom browseRandom;
|
||||
|
@ -5,7 +5,7 @@ import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.renderer.MustacheRenderer;
|
||||
import nu.marginalia.renderer.RendererFactory;
|
||||
import nu.marginalia.service.server.BaseServiceParams;
|
||||
import nu.marginalia.service.server.Service;
|
||||
import nu.marginalia.service.server.SparkService;
|
||||
import nu.marginalia.service.server.StaticResources;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
import spark.Request;
|
||||
@ -15,7 +15,7 @@ import spark.Spark;
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
|
||||
public class ExplorerService extends Service {
|
||||
public class ExplorerService extends SparkService {
|
||||
|
||||
private final MustacheRenderer<Object> renderer;
|
||||
private final HikariDataSource dataSource;
|
||||
|
94
code/services-application/search-service-legacy/build.gradle
Normal file
94
code/services-application/search-service-legacy/build.gradle
Normal file
@ -0,0 +1,94 @@
|
||||
// Build script for the 'search-service-legacy' application (see applicationName below).
plugins {
    id 'java'
    id 'io.freefair.sass-base' version '8.4'
    id 'io.freefair.sass-java' version '8.4'
    id 'application'
    id 'jvm-test-suite'

    // NOTE(review): another build script in this change set declares jibVersion = '3.4.3'
    // in an ext block; presumably the two are meant to stay in sync -- confirm.
    id 'com.google.cloud.tools.jib' version '3.4.3'
}

application {
    mainClass = 'nu.marginalia.search.SearchMain'
    applicationName = 'search-service-legacy'
}

// The zip distribution is not built for this service
tasks.distZip.enabled = false


java {
    toolchain {
        // JVM version is defined once on the root project
        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
    }
}
sass {
    // Source maps are generated and embedded into the compiled CSS; output is not minified
    sourceMapEnabled = true
    sourceMapEmbed = true
    outputStyle = EXPANDED
}

// Shared source set and docker configuration from the root project
apply from: "$rootProject.projectDir/srcsets.gradle"
apply from: "$rootProject.projectDir/docker.gradle"

dependencies {
    // Project modules
    implementation project(':code:common:db')
    implementation project(':code:common:model')
    implementation project(':code:common:service')
    implementation project(':code:common:config')
    implementation project(':code:index:query')

    implementation project(':code:libraries:easy-lsh')
    implementation project(':code:libraries:language-processing')
    implementation project(':code:libraries:braille-block-punch-cards')
    implementation project(':code:libraries:term-frequency-dict')

    implementation project(':code:functions:live-capture:api')
    implementation project(':code:functions:math:api')
    implementation project(':code:functions:domain-info:api')
    implementation project(':code:functions:search-query:api')


    implementation project(':code:index:api')
    implementation project(':code:common:renderer')

    implementation project(':code:features-search:screenshots')
    implementation project(':code:features-search:random-websites')

    // Third-party libraries from the version catalog
    implementation libs.bundles.slf4j

    implementation libs.roaringbitmap
    implementation libs.prometheus
    implementation libs.notnull
    implementation libs.guava
    // Guice is added without its transitive Guava; Guava is declared directly above
    implementation dependencies.create(libs.guice.get()) {
        exclude group: 'com.google.guava'
    }
    implementation libs.handlebars
    // Spark is added without its transitive Jetty; the Jetty bundle below is used instead
    implementation dependencies.create(libs.spark.get()) {
        exclude group: 'org.eclipse.jetty'
    }
    implementation libs.bundles.jetty
    implementation libs.opencsv
    implementation libs.trove
    implementation libs.fastutil
    implementation libs.bundles.gson
    implementation libs.bundles.mariadb
    implementation libs.bundles.nlp

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito

    // NOTE(review): the testcontainers BOM is imported and the same version is also spelled
    // out on the individual artifacts below; keep the three version strings in step.
    testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
    testImplementation libs.commons.codec
    testImplementation 'org.testcontainers:mariadb:1.17.4'
    testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
    testImplementation project(':code:libraries:test-helpers')
}

// Test task that runs only the tests tagged "paperdoll", with -DrunPaperDoll=true set
tasks.register('paperDoll', Test) {
    useJUnitPlatform {
        includeTags "paperdoll"
    }
    jvmArgs = [ '-DrunPaperDoll=true', '--enable-preview' ]
}
|
@ -0,0 +1,47 @@
|
||||
package nu.marginalia.search;
|
||||
|
||||
import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Injector;
|
||||
import nu.marginalia.service.MainClass;
|
||||
import nu.marginalia.service.discovery.ServiceRegistryIf;
|
||||
import nu.marginalia.service.module.ServiceConfiguration;
|
||||
import nu.marginalia.service.module.ServiceDiscoveryModule;
|
||||
import nu.marginalia.service.ServiceId;
|
||||
import nu.marginalia.service.module.ServiceConfigurationModule;
|
||||
import nu.marginalia.service.module.DatabaseModule;
|
||||
import nu.marginalia.service.server.Initialization;
|
||||
import spark.Spark;
|
||||
|
||||
/**
 * Entry point of the search service: builds the Guice injector, orchestrates the boot
 * order and flags the service as ready.
 */
public class SearchMain extends MainClass {
    // Injected so that the SearchService is constructed together with this class;
    // the field is not read anywhere in this class.
    private final SearchService service;

    @Inject
    public SearchMain(SearchService service) {
        this.service = service;
    }

    /**
     * Boots the service. The statements below are order-dependent: the static file location
     * is configured before the injector (and with it the service) is created, and the
     * ready flag is set only after the {@code SearchMain} instance has been obtained.
     *
     * @param args command line arguments, passed on to {@code init}
     */
    public static void main(String... args) {

        init(ServiceId.Search, args);

        // Static resources are served from /static/search/
        Spark.staticFileLocation("/static/search/");

        Injector injector = Guice.createInjector(
                new SearchModule(),
                new ServiceConfigurationModule(ServiceId.Search),
                new ServiceDiscoveryModule(),
                new DatabaseModule(false)
        );


        // Orchestrate the boot order for the services
        var registry = injector.getInstance(ServiceRegistryIf.class);
        var configuration = injector.getInstance(ServiceConfiguration.class);
        orchestrateBoot(registry, configuration);

        // Instantiating SearchMain pulls in SearchService through constructor injection
        injector.getInstance(SearchMain.class);
        injector.getInstance(Initialization.class).setReady();

    }
}
|
@ -0,0 +1,20 @@
|
||||
package nu.marginalia.search;
|
||||
|
||||
import com.google.inject.AbstractModule;
|
||||
import nu.marginalia.LanguageModels;
|
||||
import nu.marginalia.WebsiteUrl;
|
||||
import nu.marginalia.WmsaHome;
|
||||
import nu.marginalia.renderer.config.HandlebarsConfigurator;
|
||||
|
||||
public class SearchModule extends AbstractModule {
|
||||
|
||||
public void configure() {
|
||||
bind(HandlebarsConfigurator.class).to(SearchHandlebarsConfigurator.class);
|
||||
|
||||
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
|
||||
|
||||
bind(WebsiteUrl.class).toInstance(new WebsiteUrl(
|
||||
System.getProperty("search.websiteUrl", "https://search.marginalia.nu/")));
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,266 @@
|
||||
package nu.marginalia.search;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.WebsiteUrl;
|
||||
import nu.marginalia.api.math.MathClient;
|
||||
import nu.marginalia.api.searchquery.QueryClient;
|
||||
import nu.marginalia.api.searchquery.model.query.QueryResponse;
|
||||
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||
import nu.marginalia.bbpc.BrailleBlockPunchCards;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
import nu.marginalia.search.command.SearchParameters;
|
||||
import nu.marginalia.search.model.ClusteredUrlDetails;
|
||||
import nu.marginalia.search.model.DecoratedSearchResults;
|
||||
import nu.marginalia.search.model.SearchFilters;
|
||||
import nu.marginalia.search.model.UrlDetails;
|
||||
import nu.marginalia.search.results.UrlDeduplicator;
|
||||
import nu.marginalia.search.svc.SearchQueryCountService;
|
||||
import nu.marginalia.search.svc.SearchUnitConversionService;
|
||||
import org.apache.logging.log4j.util.Strings;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.slf4j.Marker;
|
||||
import org.slf4j.MarkerFactory;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.time.Duration;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.Future;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
@Singleton
|
||||
public class SearchOperator {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(SearchOperator.class);
|
||||
|
||||
// Marker for filtering out sensitive content from the persistent logs
|
||||
private final Marker queryMarker = MarkerFactory.getMarker("QUERY");
|
||||
|
||||
private final MathClient mathClient;
|
||||
private final DbDomainQueries domainQueries;
|
||||
private final QueryClient queryClient;
|
||||
private final SearchQueryParamFactory paramFactory;
|
||||
private final WebsiteUrl websiteUrl;
|
||||
private final SearchUnitConversionService searchUnitConversionService;
|
||||
private final SearchQueryCountService searchVisitorCount;
|
||||
|
||||
|
||||
/**
 * Creates the operator from its injected collaborators; each argument is stored in the
 * field of the same name.
 */
@Inject
public SearchOperator(MathClient mathClient,
                      DbDomainQueries domainQueries,
                      QueryClient queryClient,
                      SearchQueryParamFactory paramFactory,
                      WebsiteUrl websiteUrl,
                      SearchUnitConversionService searchUnitConversionService,
                      SearchQueryCountService searchVisitorCount)
{
    // Query execution
    this.queryClient = queryClient;
    this.paramFactory = paramFactory;
    this.domainQueries = domainQueries;

    // Auxiliary services
    this.mathClient = mathClient;
    this.searchUnitConversionService = searchUnitConversionService;
    this.searchVisitorCount = searchVisitorCount;

    // Rendering
    this.websiteUrl = websiteUrl;
}
|
||||
|
||||
public List<UrlDetails> doSiteSearch(String domain,
|
||||
int domainId,
|
||||
int count) {
|
||||
|
||||
var queryParams = paramFactory.forSiteSearch(domain, domainId, count);
|
||||
var queryResponse = queryClient.search(queryParams);
|
||||
|
||||
return getResultsFromQuery(queryResponse);
|
||||
}
|
||||
|
||||
public List<UrlDetails> doBacklinkSearch(String domain) {
|
||||
|
||||
var queryParams = paramFactory.forBacklinkSearch(domain);
|
||||
var queryResponse = queryClient.search(queryParams);
|
||||
|
||||
return getResultsFromQuery(queryResponse);
|
||||
}
|
||||
|
||||
public List<UrlDetails> doLinkSearch(String source, String dest) {
|
||||
var queryParams = paramFactory.forLinkSearch(source, dest);
|
||||
var queryResponse = queryClient.search(queryParams);
|
||||
|
||||
return getResultsFromQuery(queryResponse);
|
||||
}
|
||||
|
||||
public DecoratedSearchResults doSearch(SearchParameters userParams) throws InterruptedException {
|
||||
// The full user-facing search query does additional work to try to evaluate the query
|
||||
// e.g. as a unit conversion query. This is done in parallel with the regular search.
|
||||
|
||||
Future<String> eval = searchUnitConversionService.tryEval(userParams.query());
|
||||
|
||||
// Perform the regular search
|
||||
|
||||
var queryParams = paramFactory.forRegularSearch(userParams);
|
||||
QueryResponse queryResponse = queryClient.search(queryParams);
|
||||
var queryResults = getResultsFromQuery(queryResponse);
|
||||
|
||||
// Cluster the results based on the query response
|
||||
List<ClusteredUrlDetails> clusteredResults = SearchResultClusterer
|
||||
.selectStrategy(queryResponse)
|
||||
.clusterResults(queryResults, 25);
|
||||
|
||||
// Log the query and results
|
||||
|
||||
logger.info(queryMarker, "Human terms: {}", Strings.join(queryResponse.searchTermsHuman(), ','));
|
||||
logger.info(queryMarker, "Search Result Count: {}", queryResults.size());
|
||||
|
||||
// Get the evaluation result and other data to return to the user
|
||||
String evalResult = getFutureOrDefault(eval, "");
|
||||
|
||||
String focusDomain = queryResponse.domain();
|
||||
int focusDomainId = focusDomain == null
|
||||
? -1
|
||||
: domainQueries.tryGetDomainId(new EdgeDomain(focusDomain)).orElse(-1);
|
||||
|
||||
List<String> problems = getProblems(evalResult, queryResults, queryResponse);
|
||||
|
||||
List<DecoratedSearchResults.Page> resultPages = IntStream.rangeClosed(1, queryResponse.totalPages())
|
||||
.mapToObj(number -> new DecoratedSearchResults.Page(
|
||||
number,
|
||||
number == userParams.page(),
|
||||
userParams.withPage(number).renderUrl(websiteUrl)
|
||||
))
|
||||
.toList();
|
||||
|
||||
// Return the results to the user
|
||||
return DecoratedSearchResults.builder()
|
||||
.params(userParams)
|
||||
.problems(problems)
|
||||
.evalResult(evalResult)
|
||||
.results(clusteredResults)
|
||||
.filters(new SearchFilters(websiteUrl, userParams))
|
||||
.focusDomain(focusDomain)
|
||||
.focusDomainId(focusDomainId)
|
||||
.resultPages(resultPages)
|
||||
.build();
|
||||
}
|
||||
|
||||
|
||||
public List<UrlDetails> getResultsFromQuery(QueryResponse queryResponse) {
|
||||
final QueryLimits limits = queryResponse.specs().queryLimits;
|
||||
final UrlDeduplicator deduplicator = new UrlDeduplicator(limits.resultsByDomain());
|
||||
|
||||
// Update the query count (this is what you see on the front page)
|
||||
searchVisitorCount.registerQuery();
|
||||
|
||||
return queryResponse.results().stream()
|
||||
.filter(deduplicator::shouldRetain)
|
||||
.limit(limits.resultsTotal())
|
||||
.map(SearchOperator::createDetails)
|
||||
.toList();
|
||||
}
|
||||
|
||||
private static UrlDetails createDetails(DecoratedSearchResultItem item) {
|
||||
return new UrlDetails(
|
||||
item.documentId(),
|
||||
item.domainId(),
|
||||
cleanUrl(item.url),
|
||||
item.title,
|
||||
item.description,
|
||||
item.format,
|
||||
item.features,
|
||||
DomainIndexingState.ACTIVE,
|
||||
item.rankingScore, // termScore
|
||||
item.resultsFromDomain,
|
||||
BrailleBlockPunchCards.printBits(item.bestPositions, 64),
|
||||
Long.bitCount(item.bestPositions),
|
||||
item.rawIndexResult,
|
||||
item.rawIndexResult.keywordScores
|
||||
);
|
||||
}
|
||||
|
||||
/** Replace nuisance domains with replacements where available */
|
||||
private static EdgeUrl cleanUrl(EdgeUrl url) {
|
||||
String topdomain = url.domain.topDomain;
|
||||
String subdomain = url.domain.subDomain;
|
||||
String path = url.path;
|
||||
|
||||
if (topdomain.equals("fandom.com")) {
|
||||
int wikiIndex = path.indexOf("/wiki/");
|
||||
if (wikiIndex >= 0) {
|
||||
return new EdgeUrl("https", new EdgeDomain("breezewiki.com"), null, "/" + subdomain + path.substring(wikiIndex), null);
|
||||
}
|
||||
}
|
||||
else if (topdomain.equals("medium.com")) {
|
||||
if (!subdomain.isBlank()) {
|
||||
return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, path, null);
|
||||
}
|
||||
else {
|
||||
String article = path.substring(path.indexOf("/", 1));
|
||||
return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, article, null);
|
||||
}
|
||||
|
||||
}
|
||||
return url;
|
||||
}
|
||||
|
||||
private List<String> getProblems(String evalResult, List<UrlDetails> queryResults, QueryResponse response) throws InterruptedException {
|
||||
|
||||
// We don't debug the query if it's a site search
|
||||
if (response.domain() == null)
|
||||
return List.of();
|
||||
|
||||
final List<String> problems = new ArrayList<>(response.problems());
|
||||
|
||||
if (queryResults.size() <= 5 && null == evalResult) {
|
||||
problems.add("Try rephrasing the query, changing the word order or using synonyms to get different results.");
|
||||
|
||||
// Try to spell check the search terms
|
||||
var suggestions = getFutureOrDefault(
|
||||
mathClient.spellCheck(response.searchTermsHuman()),
|
||||
Map.of()
|
||||
);
|
||||
|
||||
suggestions.forEach((term, suggestion) -> {
|
||||
if (suggestion.size() > 1) {
|
||||
String suggestionsStr = "\"%s\" could be spelled %s".formatted(term, suggestion.stream().map(s -> "\"" + s + "\"").collect(Collectors.joining(", ")));
|
||||
problems.add(suggestionsStr);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
Set<String> representativeKeywords = response.getAllKeywords();
|
||||
if (representativeKeywords.size() > 1 && (representativeKeywords.contains("definition") || representativeKeywords.contains("define") || representativeKeywords.contains("meaning")))
|
||||
{
|
||||
problems.add("Tip: Try using a query that looks like <tt>define:word</tt> if you want a dictionary definition");
|
||||
}
|
||||
|
||||
return problems;
|
||||
}
|
||||
|
||||
private <T> T getFutureOrDefault(@Nullable Future<T> fut, T defaultValue) {
|
||||
return getFutureOrDefault(fut, Duration.ofMillis(50), defaultValue);
|
||||
}
|
||||
|
||||
private <T> T getFutureOrDefault(@Nullable Future<T> fut, Duration timeout, T defaultValue) {
|
||||
if (fut == null || fut.isCancelled()) {
|
||||
return defaultValue;
|
||||
}
|
||||
try {
|
||||
return fut.get(timeout.toMillis(), TimeUnit.MILLISECONDS);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.warn("Error fetching eval result", ex);
|
||||
return defaultValue;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,104 @@
|
||||
package nu.marginalia.search;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.query.QueryParams;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.index.query.limit.QueryLimits;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.search.command.SearchParameters;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
public class SearchQueryParamFactory {
|
||||
|
||||
public QueryParams forRegularSearch(SearchParameters userParams) {
|
||||
SearchQuery prototype = new SearchQuery();
|
||||
var profile = userParams.profile();
|
||||
|
||||
profile.addTacitTerms(prototype);
|
||||
userParams.js().addTacitTerms(prototype);
|
||||
userParams.adtech().addTacitTerms(prototype);
|
||||
|
||||
return new QueryParams(
|
||||
userParams.query(),
|
||||
null,
|
||||
prototype.searchTermsInclude,
|
||||
prototype.searchTermsExclude,
|
||||
prototype.searchTermsPriority,
|
||||
prototype.searchTermsAdvice,
|
||||
profile.getQualityLimit(),
|
||||
profile.getYearLimit(),
|
||||
profile.getSizeLimit(),
|
||||
SpecificationLimit.none(),
|
||||
List.of(),
|
||||
new QueryLimits(5, 100, 200, 8192),
|
||||
profile.searchSetIdentifier.name(),
|
||||
userParams.strategy(),
|
||||
userParams.temporalBias(),
|
||||
userParams.page()
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
public QueryParams forSiteSearch(String domain, int domainId, int count) {
|
||||
return new QueryParams("site:"+domain,
|
||||
null,
|
||||
List.of(),
|
||||
List.of(),
|
||||
List.of(),
|
||||
List.of(),
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
List.of(domainId),
|
||||
new QueryLimits(count, count, 100, 512),
|
||||
SearchSetIdentifier.NONE.name(),
|
||||
QueryStrategy.AUTO,
|
||||
ResultRankingParameters.TemporalBias.NONE,
|
||||
1
|
||||
);
|
||||
}
|
||||
|
||||
public QueryParams forBacklinkSearch(String domain) {
|
||||
return new QueryParams("links:"+domain,
|
||||
null,
|
||||
List.of(),
|
||||
List.of(),
|
||||
List.of(),
|
||||
List.of(),
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
List.of(),
|
||||
new QueryLimits(100, 100, 100, 512),
|
||||
SearchSetIdentifier.NONE.name(),
|
||||
QueryStrategy.AUTO,
|
||||
ResultRankingParameters.TemporalBias.NONE,
|
||||
1
|
||||
);
|
||||
}
|
||||
|
||||
public QueryParams forLinkSearch(String sourceDomain, String destDomain) {
|
||||
return new QueryParams("site:" + sourceDomain + " links:" + destDomain,
|
||||
null,
|
||||
List.of(),
|
||||
List.of(),
|
||||
List.of(),
|
||||
List.of(),
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
SpecificationLimit.none(),
|
||||
List.of(),
|
||||
new QueryLimits(100, 100, 100, 512),
|
||||
SearchSetIdentifier.NONE.name(),
|
||||
QueryStrategy.AUTO,
|
||||
ResultRankingParameters.TemporalBias.NONE,
|
||||
1
|
||||
);
|
||||
}
|
||||
}
|
@ -0,0 +1,53 @@
|
||||
package nu.marginalia.search;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.query.QueryResponse;
|
||||
import nu.marginalia.search.model.ClusteredUrlDetails;
|
||||
import nu.marginalia.search.model.UrlDetails;
|
||||
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/** Functions for clustering search results */
|
||||
public class SearchResultClusterer {
|
||||
private SearchResultClusterer() {}
|
||||
|
||||
public interface SearchResultClusterStrategy {
|
||||
List<ClusteredUrlDetails> clusterResults(List<UrlDetails> results, int total);
|
||||
}
|
||||
|
||||
public static SearchResultClusterStrategy selectStrategy(QueryResponse response) {
|
||||
if (response.domain() != null && !response.domain().isBlank())
|
||||
return SearchResultClusterer::noOp;
|
||||
|
||||
return SearchResultClusterer::byDomain;
|
||||
}
|
||||
|
||||
/** No clustering, just return the results as is */
|
||||
private static List<ClusteredUrlDetails> noOp(List<UrlDetails> results, int total) {
|
||||
if (results.isEmpty())
|
||||
return List.of();
|
||||
|
||||
return results.stream()
|
||||
.map(ClusteredUrlDetails::new)
|
||||
.toList();
|
||||
}
|
||||
|
||||
/** Cluster the results by domain, and return the top "total" clusters
|
||||
* sorted by the relevance of the best result
|
||||
*/
|
||||
private static List<ClusteredUrlDetails> byDomain(List<UrlDetails> results, int total) {
|
||||
if (results.isEmpty())
|
||||
return List.of();
|
||||
|
||||
return results.stream()
|
||||
.collect(
|
||||
Collectors.groupingBy(details -> details.domainId)
|
||||
)
|
||||
.values().stream()
|
||||
.map(ClusteredUrlDetails::new)
|
||||
.sorted()
|
||||
.limit(total)
|
||||
.toList();
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,128 @@
|
||||
package nu.marginalia.search;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import io.prometheus.client.Counter;
|
||||
import io.prometheus.client.Histogram;
|
||||
import nu.marginalia.WebsiteUrl;
|
||||
import nu.marginalia.search.svc.*;
|
||||
import nu.marginalia.service.server.BaseServiceParams;
|
||||
import nu.marginalia.service.server.SparkService;
|
||||
import nu.marginalia.service.server.StaticResources;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
import spark.Route;
|
||||
import spark.Spark;
|
||||
|
||||
import java.net.URLEncoder;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
public class SearchService extends SparkService {
|
||||
|
||||
private final WebsiteUrl websiteUrl;
|
||||
private final StaticResources staticResources;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(SearchService.class);
|
||||
private static final Histogram wmsa_search_service_request_time = Histogram.build()
|
||||
.name("wmsa_search_service_request_time")
|
||||
.linearBuckets(0.05, 0.05, 15)
|
||||
.labelNames("matchedPath", "method")
|
||||
.help("Search service request time (seconds)")
|
||||
.register();
|
||||
private static final Counter wmsa_search_service_error_count = Counter.build()
|
||||
.name("wmsa_search_service_error_count")
|
||||
.labelNames("matchedPath", "method")
|
||||
.help("Search service error count")
|
||||
.register();
|
||||
|
||||
@Inject
|
||||
public SearchService(BaseServiceParams params,
|
||||
WebsiteUrl websiteUrl,
|
||||
StaticResources staticResources,
|
||||
SearchFrontPageService frontPageService,
|
||||
SearchErrorPageService errorPageService,
|
||||
SearchAddToCrawlQueueService addToCrawlQueueService,
|
||||
SearchSiteInfoService siteInfoService,
|
||||
SearchCrosstalkService crosstalkService,
|
||||
SearchQueryService searchQueryService)
|
||||
throws Exception
|
||||
{
|
||||
super(params);
|
||||
|
||||
this.websiteUrl = websiteUrl;
|
||||
this.staticResources = staticResources;
|
||||
|
||||
Spark.staticFiles.expireTime(600);
|
||||
|
||||
SearchServiceMetrics.get("/search", searchQueryService::pathSearch);
|
||||
|
||||
SearchServiceMetrics.get("/", frontPageService::render);
|
||||
SearchServiceMetrics.get("/news.xml", frontPageService::renderNewsFeed);
|
||||
SearchServiceMetrics.get("/:resource", this::serveStatic);
|
||||
|
||||
SearchServiceMetrics.post("/site/suggest/", addToCrawlQueueService::suggestCrawling);
|
||||
|
||||
SearchServiceMetrics.get("/site-search/:site/*", this::siteSearchRedir);
|
||||
|
||||
SearchServiceMetrics.get("/site/:site", siteInfoService::handle);
|
||||
SearchServiceMetrics.post("/site/:site", siteInfoService::handlePost);
|
||||
|
||||
SearchServiceMetrics.get("/crosstalk/", crosstalkService::handle);
|
||||
|
||||
Spark.exception(Exception.class, (e,p,q) -> {
|
||||
logger.error("Error during processing", e);
|
||||
wmsa_search_service_error_count.labels(p.pathInfo(), p.requestMethod()).inc();
|
||||
errorPageService.serveError(p, q);
|
||||
});
|
||||
|
||||
Spark.awaitInitialization();
|
||||
}
|
||||
|
||||
|
||||
|
||||
/** Wraps a route with a timer and a counter */
|
||||
private static class SearchServiceMetrics implements Route {
|
||||
private final Route delegatedRoute;
|
||||
|
||||
static void get(String path, Route route) {
|
||||
Spark.get(path, new SearchServiceMetrics(route));
|
||||
}
|
||||
static void post(String path, Route route) {
|
||||
Spark.post(path, new SearchServiceMetrics(route));
|
||||
}
|
||||
|
||||
private SearchServiceMetrics(Route delegatedRoute) {
|
||||
this.delegatedRoute = delegatedRoute;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Object handle(Request request, Response response) throws Exception {
|
||||
return wmsa_search_service_request_time
|
||||
.labels(request.matchedPath(), request.requestMethod())
|
||||
.time(() -> delegatedRoute.handle(request, response));
|
||||
}
|
||||
}
|
||||
|
||||
private Object serveStatic(Request request, Response response) {
|
||||
String resource = request.params("resource");
|
||||
staticResources.serveStatic("search", resource, request, response);
|
||||
return "";
|
||||
}
|
||||
|
||||
private Object siteSearchRedir(Request request, Response response) {
|
||||
final String site = request.params("site");
|
||||
final String searchTerms;
|
||||
|
||||
if (request.splat().length == 0) searchTerms = "";
|
||||
else searchTerms = request.splat()[0];
|
||||
|
||||
final String query = URLEncoder.encode(String.format("%s site:%s", searchTerms, site), StandardCharsets.UTF_8).trim();
|
||||
final String profile = request.queryParamOrDefault("profile", "yolo");
|
||||
|
||||
response.redirect(websiteUrl.withPath("search?query="+query+"&profile="+profile));
|
||||
|
||||
return "";
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,43 @@
|
||||
package nu.marginalia.search.command;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.search.command.commands.*;
|
||||
import spark.Response;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
public class CommandEvaluator {
|
||||
|
||||
private final List<SearchCommandInterface> specialCommands = new ArrayList<>();
|
||||
private final SearchCommand defaultCommand;
|
||||
|
||||
@Inject
|
||||
public CommandEvaluator(
|
||||
BrowseCommand browse,
|
||||
ConvertCommand convert,
|
||||
DefinitionCommand define,
|
||||
BangCommand bang,
|
||||
SiteRedirectCommand siteRedirect,
|
||||
SearchCommand search
|
||||
) {
|
||||
specialCommands.add(browse);
|
||||
specialCommands.add(convert);
|
||||
specialCommands.add(define);
|
||||
specialCommands.add(bang);
|
||||
specialCommands.add(siteRedirect);
|
||||
|
||||
defaultCommand = search;
|
||||
}
|
||||
|
||||
public Object eval(Response response, SearchParameters parameters) {
|
||||
for (var cmd : specialCommands) {
|
||||
var maybe = cmd.process(response, parameters);
|
||||
if (maybe.isPresent())
|
||||
return maybe.get();
|
||||
}
|
||||
|
||||
return defaultCommand.process(response, parameters).orElse("");
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,29 @@
|
||||
package nu.marginalia.search.command;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.util.Arrays;
|
||||
|
||||
public enum SearchAdtechParameter {
|
||||
DEFAULT("default"),
|
||||
REDUCE("reduce", "special:ads", "special:affiliate");
|
||||
|
||||
public final String value;
|
||||
public final String[] implictExcludeSearchTerms;
|
||||
|
||||
SearchAdtechParameter(String value, String... implictExcludeSearchTerms) {
|
||||
this.value = value;
|
||||
this.implictExcludeSearchTerms = implictExcludeSearchTerms;
|
||||
}
|
||||
|
||||
public static SearchAdtechParameter parse(@Nullable String value) {
|
||||
if (REDUCE.value.equals(value)) return REDUCE;
|
||||
|
||||
return DEFAULT;
|
||||
}
|
||||
|
||||
public void addTacitTerms(SearchQuery subquery) {
|
||||
subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms));
|
||||
}
|
||||
}
|
@ -0,0 +1,10 @@
|
||||
package nu.marginalia.search.command;
|
||||
|
||||
|
||||
import spark.Response;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
public interface SearchCommandInterface {
|
||||
Optional<Object> process(Response response, SearchParameters parameters);
|
||||
}
|
@ -0,0 +1,31 @@
|
||||
package nu.marginalia.search.command;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.util.Arrays;
|
||||
|
||||
public enum SearchJsParameter {
|
||||
DEFAULT("default"),
|
||||
DENY_JS("no-js", "js:true"),
|
||||
REQUIRE_JS("yes-js", "js:false");
|
||||
|
||||
public final String value;
|
||||
public final String[] implictExcludeSearchTerms;
|
||||
|
||||
SearchJsParameter(String value, String... implictExcludeSearchTerms) {
|
||||
this.value = value;
|
||||
this.implictExcludeSearchTerms = implictExcludeSearchTerms;
|
||||
}
|
||||
|
||||
public static SearchJsParameter parse(@Nullable String value) {
|
||||
if (DENY_JS.value.equals(value)) return DENY_JS;
|
||||
if (REQUIRE_JS.value.equals(value)) return REQUIRE_JS;
|
||||
|
||||
return DEFAULT;
|
||||
}
|
||||
|
||||
public void addTacitTerms(SearchQuery subquery) {
|
||||
subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms));
|
||||
}
|
||||
}
|
@ -0,0 +1,106 @@
|
||||
package nu.marginalia.search.command;
|
||||
|
||||
import nu.marginalia.WebsiteUrl;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.search.model.SearchProfile;
|
||||
import spark.Request;
|
||||
|
||||
import java.net.URLEncoder;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.Objects;
|
||||
|
||||
import static nu.marginalia.search.command.SearchRecentParameter.RECENT;
|
||||
|
||||
public record SearchParameters(String query,
|
||||
SearchProfile profile,
|
||||
SearchJsParameter js,
|
||||
SearchRecentParameter recent,
|
||||
SearchTitleParameter searchTitle,
|
||||
SearchAdtechParameter adtech,
|
||||
boolean newFilter,
|
||||
int page
|
||||
) {
|
||||
|
||||
public SearchParameters(String queryString, Request request) {
|
||||
this(
|
||||
queryString,
|
||||
SearchProfile.getSearchProfile(request.queryParams("profile")),
|
||||
SearchJsParameter.parse(request.queryParams("js")),
|
||||
SearchRecentParameter.parse(request.queryParams("recent")),
|
||||
SearchTitleParameter.parse(request.queryParams("searchTitle")),
|
||||
SearchAdtechParameter.parse(request.queryParams("adtech")),
|
||||
"true".equals(request.queryParams("newfilter")),
|
||||
Integer.parseInt(Objects.requireNonNullElse(request.queryParams("page"), "1"))
|
||||
);
|
||||
}
|
||||
|
||||
public String profileStr() {
|
||||
return profile.filterId;
|
||||
}
|
||||
|
||||
public SearchParameters withProfile(SearchProfile profile) {
|
||||
return new SearchParameters(query, profile, js, recent, searchTitle, adtech, true, page);
|
||||
}
|
||||
|
||||
public SearchParameters withJs(SearchJsParameter js) {
|
||||
return new SearchParameters(query, profile, js, recent, searchTitle, adtech, true, page);
|
||||
}
|
||||
public SearchParameters withAdtech(SearchAdtechParameter adtech) {
|
||||
return new SearchParameters(query, profile, js, recent, searchTitle, adtech, true, page);
|
||||
}
|
||||
|
||||
public SearchParameters withRecent(SearchRecentParameter recent) {
|
||||
return new SearchParameters(query, profile, js, recent, searchTitle, adtech, true, page);
|
||||
}
|
||||
|
||||
public SearchParameters withTitle(SearchTitleParameter title) {
|
||||
return new SearchParameters(query, profile, js, recent, title, adtech, true, page);
|
||||
}
|
||||
|
||||
public SearchParameters withPage(int page) {
|
||||
return new SearchParameters(query, profile, js, recent, searchTitle, adtech, false, page);
|
||||
}
|
||||
|
||||
public String renderUrl(WebsiteUrl baseUrl) {
|
||||
String path = String.format("/search?query=%s&profile=%s&js=%s&adtech=%s&recent=%s&searchTitle=%s&newfilter=%s&page=%d",
|
||||
URLEncoder.encode(query, StandardCharsets.UTF_8),
|
||||
URLEncoder.encode(profile.filterId, StandardCharsets.UTF_8),
|
||||
URLEncoder.encode(js.value, StandardCharsets.UTF_8),
|
||||
URLEncoder.encode(adtech.value, StandardCharsets.UTF_8),
|
||||
URLEncoder.encode(recent.value, StandardCharsets.UTF_8),
|
||||
URLEncoder.encode(searchTitle.value, StandardCharsets.UTF_8),
|
||||
Boolean.valueOf(newFilter).toString(),
|
||||
page
|
||||
);
|
||||
|
||||
return baseUrl.withPath(path);
|
||||
}
|
||||
|
||||
public ResultRankingParameters.TemporalBias temporalBias() {
|
||||
if (recent == RECENT) {
|
||||
return ResultRankingParameters.TemporalBias.RECENT;
|
||||
}
|
||||
else if (profile == SearchProfile.VINTAGE) {
|
||||
return ResultRankingParameters.TemporalBias.OLD;
|
||||
}
|
||||
|
||||
return ResultRankingParameters.TemporalBias.NONE;
|
||||
}
|
||||
|
||||
public QueryStrategy strategy() {
|
||||
if (searchTitle == SearchTitleParameter.TITLE) {
|
||||
return QueryStrategy.REQUIRE_FIELD_TITLE;
|
||||
}
|
||||
|
||||
return QueryStrategy.AUTO;
|
||||
}
|
||||
|
||||
public SpecificationLimit yearLimit() {
|
||||
if (recent == RECENT)
|
||||
return SpecificationLimit.greaterThan(2018);
|
||||
|
||||
return profile.getYearLimit();
|
||||
}
|
||||
}
|
@ -0,0 +1,21 @@
|
||||
package nu.marginalia.search.command;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
|
||||
public enum SearchRecentParameter {
|
||||
DEFAULT("default"),
|
||||
RECENT("recent");
|
||||
|
||||
public final String value;
|
||||
|
||||
SearchRecentParameter(String value) {
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
public static SearchRecentParameter parse(@Nullable String value) {
|
||||
if (RECENT.value.equals(value)) return RECENT;
|
||||
|
||||
return DEFAULT;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,21 @@
|
||||
package nu.marginalia.search.command;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
|
||||
public enum SearchTitleParameter {
|
||||
DEFAULT("default"),
|
||||
TITLE("title");
|
||||
|
||||
public final String value;
|
||||
|
||||
SearchTitleParameter(String value) {
|
||||
this.value = value;
|
||||
}
|
||||
|
||||
public static SearchTitleParameter parse(@Nullable String value) {
|
||||
if (TITLE.value.equals(value)) return TITLE;
|
||||
|
||||
return DEFAULT;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,104 @@
|
||||
package nu.marginalia.search.command.commands;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.search.command.SearchCommandInterface;
|
||||
import nu.marginalia.search.command.SearchParameters;
|
||||
import nu.marginalia.search.exceptions.RedirectException;
|
||||
import spark.Response;
|
||||
|
||||
import java.net.URLEncoder;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
public class BangCommand implements SearchCommandInterface {
|
||||
private final Map<String, String> bangsToPattern = new HashMap<>();
|
||||
|
||||
@Inject
|
||||
public BangCommand()
|
||||
{
|
||||
bangsToPattern.put("!g", "https://www.google.com/search?q=%s");
|
||||
bangsToPattern.put("!ddg", "https://duckduckgo.com/?q=%s");
|
||||
bangsToPattern.put("!w", "https://search.marginalia.nu/search?query=%s+site:en.wikipedia.org&profile=wiki");
|
||||
}
|
||||
|
||||
@Override
|
||||
public Optional<Object> process(Response response, SearchParameters parameters) {
|
||||
|
||||
for (var entry : bangsToPattern.entrySet()) {
|
||||
String bangPattern = entry.getKey();
|
||||
String redirectPattern = entry.getValue();
|
||||
|
||||
var match = matchBangPattern(parameters.query(), bangPattern);
|
||||
|
||||
if (match.isPresent()) {
|
||||
var url = String.format(redirectPattern, URLEncoder.encode(match.get(), StandardCharsets.UTF_8));
|
||||
throw new RedirectException(url);
|
||||
}
|
||||
}
|
||||
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
/** If the query contains the bang pattern bangKey, return the query with the bang pattern removed. */
|
||||
Optional<String> matchBangPattern(String query, String bangKey) {
|
||||
var bm = new BangMatcher(query);
|
||||
|
||||
while (bm.findNext(bangKey)) {
|
||||
|
||||
if (!bm.isRelativeSpaceOrInvalid(-1))
|
||||
continue;
|
||||
if (!bm.isRelativeSpaceOrInvalid(bangKey.length()))
|
||||
continue;
|
||||
|
||||
String prefix = bm.prefix().trim();
|
||||
String suffix = bm.suffix(bangKey.length()).trim();
|
||||
|
||||
String ret = (prefix + " " + suffix).trim();
|
||||
|
||||
return Optional.of(ret)
|
||||
.filter(s -> !s.isBlank());
|
||||
}
|
||||
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
private static class BangMatcher {
|
||||
private final String str;
|
||||
private int pos;
|
||||
|
||||
public String prefix() {
|
||||
return str.substring(0, pos);
|
||||
}
|
||||
|
||||
public String suffix(int offset) {
|
||||
if (pos+offset < str.length())
|
||||
return str.substring(pos + offset);
|
||||
return "";
|
||||
}
|
||||
|
||||
public BangMatcher(String str) {
|
||||
this.str = str;
|
||||
this.pos = -1;
|
||||
}
|
||||
|
||||
public boolean findNext(String pattern) {
|
||||
if (pos + 1 >= str.length())
|
||||
return false;
|
||||
|
||||
return (pos = str.indexOf(pattern, pos + 1)) >= 0;
|
||||
}
|
||||
|
||||
public boolean isRelativeSpaceOrInvalid(int offset) {
|
||||
if (offset + pos < 0)
|
||||
return true;
|
||||
if (offset + pos >= str.length())
|
||||
return true;
|
||||
|
||||
return Character.isSpaceChar(str.charAt(offset + pos));
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,36 @@
|
||||
package nu.marginalia.search.command.commands;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.renderer.MustacheRenderer;
|
||||
import nu.marginalia.renderer.RendererFactory;
|
||||
import nu.marginalia.search.command.SearchCommandInterface;
|
||||
import nu.marginalia.search.command.SearchParameters;
|
||||
import nu.marginalia.search.svc.SearchUnitConversionService;
|
||||
import spark.Response;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
|
||||
public class ConvertCommand implements SearchCommandInterface {
|
||||
private final SearchUnitConversionService searchUnitConversionService;
|
||||
private final MustacheRenderer<Map<String, String>> conversionRenderer;
|
||||
|
||||
@Inject
|
||||
public ConvertCommand(SearchUnitConversionService searchUnitConversionService, RendererFactory rendererFactory) throws IOException {
|
||||
this.searchUnitConversionService = searchUnitConversionService;
|
||||
|
||||
conversionRenderer = rendererFactory.renderer("search/conversion-results");
|
||||
}
|
||||
|
||||
@Override
|
||||
public Optional<Object> process(Response response, SearchParameters parameters) {
|
||||
var conversion = searchUnitConversionService.tryConversion(parameters.query());
|
||||
return conversion.map(s -> conversionRenderer.render(Map.of(
|
||||
"query", parameters.query(),
|
||||
"result", s,
|
||||
"profile", parameters.profileStr())
|
||||
));
|
||||
|
||||
}
|
||||
}
|
@ -0,0 +1,70 @@
|
||||
|
||||
package nu.marginalia.search.command.commands;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.api.math.MathClient;
|
||||
import nu.marginalia.api.math.model.DictionaryResponse;
|
||||
import nu.marginalia.renderer.MustacheRenderer;
|
||||
import nu.marginalia.search.command.SearchCommandInterface;
|
||||
import nu.marginalia.search.command.SearchParameters;
|
||||
import nu.marginalia.renderer.RendererFactory;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Response;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class DefinitionCommand implements SearchCommandInterface {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private final MustacheRenderer<DictionaryResponse> dictionaryRenderer;
|
||||
private final MathClient mathClient;
|
||||
|
||||
|
||||
private final Predicate<String> queryPatternPredicate = Pattern.compile("^define:[A-Za-z\\s-0-9]+$").asPredicate();
|
||||
|
||||
@Inject
|
||||
public DefinitionCommand(RendererFactory rendererFactory, MathClient mathClient)
|
||||
throws IOException
|
||||
{
|
||||
|
||||
dictionaryRenderer = rendererFactory.renderer("search/dictionary-results");
|
||||
this.mathClient = mathClient;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Optional<Object> process(Response response, SearchParameters parameters) {
|
||||
if (!queryPatternPredicate.test(parameters.query())) {
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
var results = lookupDefinition(parameters.query());
|
||||
|
||||
return Optional.of(dictionaryRenderer.render(results,
|
||||
Map.of("query", parameters.query(),
|
||||
"profile", parameters.profileStr())
|
||||
));
|
||||
}
|
||||
|
||||
|
||||
private DictionaryResponse lookupDefinition(String humanQuery) {
|
||||
String definePrefix = "define:";
|
||||
String word = humanQuery.substring(definePrefix.length()).toLowerCase();
|
||||
|
||||
try {
|
||||
return mathClient
|
||||
.dictionaryLookup(word)
|
||||
.get(250, TimeUnit.MILLISECONDS);
|
||||
}
|
||||
catch (Exception e) {
|
||||
logger.error("Failed to lookup definition for word: " + word, e);
|
||||
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,39 @@
|
||||
package nu.marginalia.search.command.commands;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.renderer.MustacheRenderer;
|
||||
import nu.marginalia.renderer.RendererFactory;
|
||||
import nu.marginalia.search.SearchOperator;
|
||||
import nu.marginalia.search.command.SearchCommandInterface;
|
||||
import nu.marginalia.search.command.SearchParameters;
|
||||
import nu.marginalia.search.model.DecoratedSearchResults;
|
||||
import spark.Response;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Optional;
|
||||
|
||||
public class SearchCommand implements SearchCommandInterface {
|
||||
private final SearchOperator searchOperator;
|
||||
private final MustacheRenderer<DecoratedSearchResults> searchResultsRenderer;
|
||||
|
||||
|
||||
@Inject
|
||||
public SearchCommand(SearchOperator searchOperator,
|
||||
RendererFactory rendererFactory) throws IOException {
|
||||
this.searchOperator = searchOperator;
|
||||
|
||||
searchResultsRenderer = rendererFactory.renderer("search/search-results");
|
||||
}
|
||||
|
||||
@Override
|
||||
public Optional<Object> process(Response response, SearchParameters parameters) {
|
||||
try {
|
||||
DecoratedSearchResults results = searchOperator.doSearch(parameters);
|
||||
return Optional.of(searchResultsRenderer.render(results));
|
||||
}
|
||||
catch (InterruptedException ex) {
|
||||
Thread.currentThread().interrupt();
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,50 @@
|
||||
package nu.marginalia.search.command.commands;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.search.command.SearchCommandInterface;
|
||||
import nu.marginalia.search.command.SearchParameters;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Response;
|
||||
|
||||
import java.util.Optional;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class SiteRedirectCommand implements SearchCommandInterface {
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
private final Predicate<String> queryPatternPredicate = Pattern.compile("^(site|links):[.A-Za-z\\-0-9]+$").asPredicate();
|
||||
|
||||
@Inject
|
||||
public SiteRedirectCommand() {
|
||||
}
|
||||
|
||||
@Override
|
||||
public Optional<Object> process(Response response, SearchParameters parameters) {
|
||||
if (!queryPatternPredicate.test(parameters.query())) {
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
int idx = parameters.query().indexOf(':');
|
||||
String prefix = parameters.query().substring(0, idx);
|
||||
String domain = parameters.query().substring(idx + 1).toLowerCase();
|
||||
|
||||
// Use an HTML redirect here, so we can use relative URLs
|
||||
String view = switch (prefix) {
|
||||
case "links" -> "links";
|
||||
default -> "info";
|
||||
};
|
||||
|
||||
return Optional.of("""
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<meta charset="UTF-8">
|
||||
<title>Redirecting...</title>
|
||||
<meta http-equiv="refresh" content="0; url=/site/%s?view=%s">
|
||||
""".formatted(domain, view)
|
||||
);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,66 @@
|
||||
package nu.marginalia.search.db;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
|
||||
import java.sql.ResultSet;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
public class DbNearDomainsQuery {
|
||||
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
@Inject
|
||||
public DbNearDomainsQuery(HikariDataSource dataSource) {
|
||||
this.dataSource = dataSource;
|
||||
}
|
||||
|
||||
public List<Integer> getRelatedDomains(String term, Consumer<String> onProblem) {
|
||||
List<Integer> ret = new ArrayList<>();
|
||||
try (var conn = dataSource.getConnection();
|
||||
|
||||
var selfStmt = conn.prepareStatement("""
|
||||
SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?
|
||||
""");
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT NEIGHBOR_ID, ND.INDEXED, ND.STATE FROM EC_DOMAIN_NEIGHBORS_2
|
||||
INNER JOIN EC_DOMAIN ND ON ND.ID=NEIGHBOR_ID
|
||||
WHERE DOMAIN_ID=?
|
||||
""")) {
|
||||
ResultSet rsp;
|
||||
selfStmt.setString(1, term);
|
||||
rsp = selfStmt.executeQuery();
|
||||
int domainId = -1;
|
||||
if (rsp.next()) {
|
||||
domainId = rsp.getInt(1);
|
||||
ret.add(domainId);
|
||||
}
|
||||
|
||||
stmt.setInt(1, domainId);
|
||||
rsp = stmt.executeQuery();
|
||||
|
||||
while (rsp.next()) {
|
||||
int id = rsp.getInt(1);
|
||||
int indexed = rsp.getInt(2);
|
||||
String state = rsp.getString(3);
|
||||
|
||||
if (indexed > 0 && ("ACTIVE".equalsIgnoreCase(state) || "SOCIAL_MEDIA".equalsIgnoreCase(state) || "SPECIAL".equalsIgnoreCase(state))) {
|
||||
ret.add(id);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
catch (Exception ex) {
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
if (ret.isEmpty()) {
|
||||
onProblem.accept("Could not find domains adjacent " + term);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,102 @@
|
||||
package nu.marginalia.search.model;
|
||||
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
import org.jetbrains.annotations.NotNull;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
/** A class to hold a list of UrlDetails, grouped by domain, where the first one is the main result
|
||||
* and the rest are additional results, for summary display. */
|
||||
public class ClusteredUrlDetails implements Comparable<ClusteredUrlDetails> {
|
||||
|
||||
@NotNull
|
||||
public final UrlDetails first;
|
||||
|
||||
@NotNull
|
||||
public final List<UrlDetails> rest;
|
||||
|
||||
/** Create a new ClusteredUrlDetails from a collection of UrlDetails,
|
||||
* with the best result as "first", and the others, in descending order
|
||||
* of quality as the "rest"...
|
||||
*
|
||||
* @param details A collection of UrlDetails, which must not be empty.
|
||||
*/
|
||||
public ClusteredUrlDetails(Collection<UrlDetails> details) {
|
||||
var items = new ArrayList<>(details);
|
||||
|
||||
items.sort(Comparator.naturalOrder());
|
||||
|
||||
if (items.isEmpty())
|
||||
throw new IllegalArgumentException("Empty list of details");
|
||||
|
||||
this.first = items.removeFirst();
|
||||
this.rest = items;
|
||||
|
||||
double bestScore = first.termScore;
|
||||
double scoreLimit = Math.min(4.0, bestScore * 1.25);
|
||||
|
||||
this.rest.removeIf(urlDetail -> {
|
||||
if (urlDetail.termScore > scoreLimit)
|
||||
return false;
|
||||
|
||||
for (var keywordScore : urlDetail.resultItem.keywordScores) {
|
||||
if (keywordScore.isKeywordSpecial())
|
||||
continue;
|
||||
if (keywordScore.hasTermFlag(WordFlags.Title))
|
||||
return false;
|
||||
if (keywordScore.hasTermFlag(WordFlags.ExternalLink))
|
||||
return false;
|
||||
if (keywordScore.hasTermFlag(WordFlags.UrlDomain))
|
||||
return false;
|
||||
if (keywordScore.hasTermFlag(WordFlags.UrlPath))
|
||||
return false;
|
||||
if (keywordScore.hasTermFlag(WordFlags.Subjects))
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
|
||||
public ClusteredUrlDetails(@NotNull UrlDetails onlyFirst) {
|
||||
this.first = onlyFirst;
|
||||
this.rest = Collections.emptyList();
|
||||
}
|
||||
|
||||
// For renderer use, do not remove
|
||||
public @NotNull UrlDetails getFirst() {
|
||||
return first;
|
||||
}
|
||||
|
||||
// For renderer use, do not remove
|
||||
public @NotNull List<UrlDetails> getRest() {
|
||||
return rest;
|
||||
}
|
||||
|
||||
|
||||
public EdgeDomain getDomain() {
|
||||
return first.url.getDomain();
|
||||
}
|
||||
|
||||
public boolean hasMultiple() {
|
||||
return !rest.isEmpty();
|
||||
}
|
||||
|
||||
/** Returns the total number of results from the same domain,
|
||||
* including such results that are not included here. */
|
||||
public int totalCount() {
|
||||
return first.resultsFromSameDomain;
|
||||
}
|
||||
|
||||
public int remainingCount() {
|
||||
return totalCount() - 1 - rest.size();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(@NotNull ClusteredUrlDetails o) {
|
||||
return Objects.compare(first, o.first, UrlDetails::compareTo);
|
||||
}
|
||||
}
|
@ -0,0 +1,186 @@
|
||||
package nu.marginalia.search.model;
|
||||
|
||||
import nu.marginalia.search.command.SearchParameters;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* A class to hold details about the search results,
|
||||
* as used by the handlebars templating engine to render
|
||||
* the search results page.
|
||||
*/
|
||||
public class DecoratedSearchResults {
|
||||
private final SearchParameters params;
|
||||
private final List<String> problems;
|
||||
private final String evalResult;
|
||||
|
||||
public DecoratedSearchResults(SearchParameters params,
|
||||
List<String> problems,
|
||||
String evalResult,
|
||||
List<ClusteredUrlDetails> results,
|
||||
String focusDomain,
|
||||
int focusDomainId,
|
||||
SearchFilters filters,
|
||||
List<Page> resultPages) {
|
||||
this.params = params;
|
||||
this.problems = problems;
|
||||
this.evalResult = evalResult;
|
||||
this.results = results;
|
||||
this.focusDomain = focusDomain;
|
||||
this.focusDomainId = focusDomainId;
|
||||
this.filters = filters;
|
||||
this.resultPages = resultPages;
|
||||
}
|
||||
|
||||
public final List<ClusteredUrlDetails> results;
|
||||
|
||||
public static DecoratedSearchResultsBuilder builder() {
|
||||
return new DecoratedSearchResultsBuilder();
|
||||
}
|
||||
|
||||
public SearchParameters getParams() {
|
||||
return params;
|
||||
}
|
||||
|
||||
public List<String> getProblems() {
|
||||
return problems;
|
||||
}
|
||||
|
||||
public String getEvalResult() {
|
||||
return evalResult;
|
||||
}
|
||||
|
||||
public List<ClusteredUrlDetails> getResults() {
|
||||
return results;
|
||||
}
|
||||
|
||||
public String getFocusDomain() {
|
||||
return focusDomain;
|
||||
}
|
||||
|
||||
public int getFocusDomainId() {
|
||||
return focusDomainId;
|
||||
}
|
||||
|
||||
public SearchFilters getFilters() {
|
||||
return filters;
|
||||
}
|
||||
|
||||
public List<Page> getResultPages() {
|
||||
return resultPages;
|
||||
}
|
||||
|
||||
private final String focusDomain;
|
||||
private final int focusDomainId;
|
||||
private final SearchFilters filters;
|
||||
|
||||
private final List<Page> resultPages;
|
||||
|
||||
public boolean isMultipage() {
|
||||
return resultPages.size() > 1;
|
||||
}
|
||||
|
||||
public record Page(int number, boolean current, String href) {
|
||||
}
|
||||
|
||||
// These are used by the search form, they look unused in the IDE but are used by the mustache template,
|
||||
// DO NOT REMOVE THEM
|
||||
public int getResultCount() {
|
||||
return results.size();
|
||||
}
|
||||
|
||||
public String getQuery() {
|
||||
return params.query();
|
||||
}
|
||||
|
||||
public String getProfile() {
|
||||
return params.profile().filterId;
|
||||
}
|
||||
|
||||
public String getJs() {
|
||||
return params.js().value;
|
||||
}
|
||||
|
||||
public String getAdtech() {
|
||||
return params.adtech().value;
|
||||
}
|
||||
|
||||
public String getRecent() {
|
||||
return params.recent().value;
|
||||
}
|
||||
|
||||
public String getSearchTitle() {
|
||||
return params.searchTitle().value;
|
||||
}
|
||||
|
||||
public int page() {
|
||||
return params.page();
|
||||
}
|
||||
|
||||
public Boolean isNewFilter() {
|
||||
return params.newFilter();
|
||||
}
|
||||
|
||||
|
||||
public static class DecoratedSearchResultsBuilder {
|
||||
private SearchParameters params;
|
||||
private List<String> problems;
|
||||
private String evalResult;
|
||||
private List<ClusteredUrlDetails> results;
|
||||
private String focusDomain;
|
||||
private int focusDomainId;
|
||||
private SearchFilters filters;
|
||||
private List<Page> resultPages;
|
||||
|
||||
DecoratedSearchResultsBuilder() {
|
||||
}
|
||||
|
||||
public DecoratedSearchResultsBuilder params(SearchParameters params) {
|
||||
this.params = params;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DecoratedSearchResultsBuilder problems(List<String> problems) {
|
||||
this.problems = problems;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DecoratedSearchResultsBuilder evalResult(String evalResult) {
|
||||
this.evalResult = evalResult;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DecoratedSearchResultsBuilder results(List<ClusteredUrlDetails> results) {
|
||||
this.results = results;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DecoratedSearchResultsBuilder focusDomain(String focusDomain) {
|
||||
this.focusDomain = focusDomain;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DecoratedSearchResultsBuilder focusDomainId(int focusDomainId) {
|
||||
this.focusDomainId = focusDomainId;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DecoratedSearchResultsBuilder filters(SearchFilters filters) {
|
||||
this.filters = filters;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DecoratedSearchResultsBuilder resultPages(List<Page> resultPages) {
|
||||
this.resultPages = resultPages;
|
||||
return this;
|
||||
}
|
||||
|
||||
public DecoratedSearchResults build() {
|
||||
return new DecoratedSearchResults(this.params, this.problems, this.evalResult, this.results, this.focusDomain, this.focusDomainId, this.filters, this.resultPages);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "DecoratedSearchResults.DecoratedSearchResultsBuilder(params=" + this.params + ", problems=" + this.problems + ", evalResult=" + this.evalResult + ", results=" + this.results + ", focusDomain=" + this.focusDomain + ", focusDomainId=" + this.focusDomainId + ", filters=" + this.filters + ", resultPages=" + this.resultPages + ")";
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,223 @@
|
||||
package nu.marginalia.search.model;
|
||||
|
||||
import nu.marginalia.WebsiteUrl;
|
||||
import nu.marginalia.search.command.*;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
/** Models the search filters displayed next to the search results */
|
||||
public class SearchFilters {
|
||||
private final WebsiteUrl url;
|
||||
|
||||
public final String currentFilter;
|
||||
|
||||
// These are necessary for the renderer to access the data
|
||||
public final RemoveJsOption removeJsOption;
|
||||
public final ReduceAdtechOption reduceAdtechOption;
|
||||
public final ShowRecentOption showRecentOption;
|
||||
public final SearchTitleOption searchTitleOption;
|
||||
|
||||
public final List<List<Filter>> filterGroups;
|
||||
|
||||
// Getters are for the renderer to access the data
|
||||
|
||||
|
||||
public String getCurrentFilter() {
|
||||
return currentFilter;
|
||||
}
|
||||
|
||||
public RemoveJsOption getRemoveJsOption() {
|
||||
return removeJsOption;
|
||||
}
|
||||
|
||||
public ReduceAdtechOption getReduceAdtechOption() {
|
||||
return reduceAdtechOption;
|
||||
}
|
||||
|
||||
public ShowRecentOption getShowRecentOption() {
|
||||
return showRecentOption;
|
||||
}
|
||||
|
||||
public SearchTitleOption getSearchTitleOption() {
|
||||
return searchTitleOption;
|
||||
}
|
||||
|
||||
public List<List<Filter>> getFilterGroups() {
|
||||
return filterGroups;
|
||||
}
|
||||
|
||||
public SearchFilters(WebsiteUrl url, SearchParameters parameters) {
|
||||
this.url = url;
|
||||
|
||||
removeJsOption = new RemoveJsOption(parameters);
|
||||
reduceAdtechOption = new ReduceAdtechOption(parameters);
|
||||
showRecentOption = new ShowRecentOption(parameters);
|
||||
searchTitleOption = new SearchTitleOption(parameters);
|
||||
|
||||
|
||||
currentFilter = parameters.profile().filterId;
|
||||
|
||||
filterGroups = List.of(
|
||||
List.of(
|
||||
new Filter("No Filter", SearchProfile.NO_FILTER, parameters),
|
||||
// new Filter("Popular", SearchProfile.POPULAR, parameters),
|
||||
new Filter("Small Web", SearchProfile.SMALLWEB, parameters),
|
||||
new Filter("Blogosphere", SearchProfile.BLOGOSPHERE, parameters),
|
||||
new Filter("Academia", SearchProfile.ACADEMIA, parameters)
|
||||
),
|
||||
List.of(
|
||||
new Filter("Vintage", SearchProfile.VINTAGE, parameters),
|
||||
new Filter("Plain Text", SearchProfile.PLAIN_TEXT, parameters),
|
||||
new Filter("~tilde", SearchProfile.TILDE, parameters)
|
||||
),
|
||||
List.of(
|
||||
new Filter("Wiki", SearchProfile.WIKI, parameters),
|
||||
new Filter("Forum", SearchProfile.FORUM, parameters),
|
||||
new Filter("Docs", SearchProfile.DOCS, parameters),
|
||||
new Filter("Recipes", SearchProfile.FOOD, parameters)
|
||||
)
|
||||
);
|
||||
|
||||
|
||||
}
|
||||
|
||||
public class RemoveJsOption {
|
||||
private final SearchJsParameter value;
|
||||
|
||||
public final String url;
|
||||
public String getUrl() {
|
||||
return url;
|
||||
}
|
||||
|
||||
public boolean isSet() {
|
||||
return value.equals(SearchJsParameter.DENY_JS);
|
||||
}
|
||||
|
||||
public String name() {
|
||||
return "Remove Javascript";
|
||||
}
|
||||
|
||||
public RemoveJsOption(SearchParameters parameters) {
|
||||
this.value = parameters.js();
|
||||
|
||||
var toggledValue = switch (parameters.js()) {
|
||||
case DENY_JS -> SearchJsParameter.DEFAULT;
|
||||
default -> SearchJsParameter.DENY_JS;
|
||||
};
|
||||
|
||||
this.url = parameters.withJs(toggledValue).renderUrl(SearchFilters.this.url);
|
||||
}
|
||||
}
|
||||
|
||||
public class ReduceAdtechOption {
|
||||
private final SearchAdtechParameter value;
|
||||
|
||||
public final String url;
|
||||
public String getUrl() {
|
||||
return url;
|
||||
}
|
||||
|
||||
public boolean isSet() {
|
||||
return value.equals(SearchAdtechParameter.REDUCE);
|
||||
}
|
||||
|
||||
public String name() {
|
||||
return "Reduce Adtech";
|
||||
}
|
||||
|
||||
public ReduceAdtechOption(SearchParameters parameters) {
|
||||
this.value = parameters.adtech();
|
||||
|
||||
var toggledValue = switch (parameters.adtech()) {
|
||||
case REDUCE -> SearchAdtechParameter.DEFAULT;
|
||||
default -> SearchAdtechParameter.REDUCE;
|
||||
};
|
||||
|
||||
this.url = parameters.withAdtech(toggledValue).renderUrl(SearchFilters.this.url);
|
||||
}
|
||||
}
|
||||
|
||||
public class ShowRecentOption {
|
||||
private final SearchRecentParameter value;
|
||||
|
||||
public final String url;
|
||||
public String getUrl() {
|
||||
return url;
|
||||
}
|
||||
|
||||
public boolean isSet() {
|
||||
return value.equals(SearchRecentParameter.RECENT);
|
||||
}
|
||||
|
||||
public String name() {
|
||||
return "Recent Results";
|
||||
}
|
||||
|
||||
public ShowRecentOption(SearchParameters parameters) {
|
||||
this.value = parameters.recent();
|
||||
|
||||
var toggledValue = switch (parameters.recent()) {
|
||||
case RECENT -> SearchRecentParameter.DEFAULT;
|
||||
default -> SearchRecentParameter.RECENT;
|
||||
};
|
||||
|
||||
this.url = parameters.withRecent(toggledValue).renderUrl(SearchFilters.this.url);
|
||||
}
|
||||
}
|
||||
|
||||
public class SearchTitleOption {
|
||||
private final SearchTitleParameter value;
|
||||
|
||||
public final String url;
|
||||
public String getUrl() {
|
||||
return url;
|
||||
}
|
||||
|
||||
public boolean isSet() {
|
||||
return value.equals(SearchTitleParameter.TITLE);
|
||||
}
|
||||
|
||||
public String name() {
|
||||
return "Search In Title";
|
||||
}
|
||||
|
||||
public SearchTitleOption(SearchParameters parameters) {
|
||||
this.value = parameters.searchTitle();
|
||||
|
||||
var toggledValue = switch (parameters.searchTitle()) {
|
||||
case TITLE -> SearchTitleParameter.DEFAULT;
|
||||
default -> SearchTitleParameter.TITLE;
|
||||
};
|
||||
|
||||
this.url = parameters.withTitle(toggledValue).renderUrl(SearchFilters.this.url);
|
||||
}
|
||||
}
|
||||
|
||||
public class Filter {
|
||||
public final SearchProfile profile;
|
||||
|
||||
public final String displayName;
|
||||
public final boolean current;
|
||||
public final String url;
|
||||
|
||||
public Filter(String displayName, SearchProfile profile, SearchParameters parameters) {
|
||||
this.displayName = displayName;
|
||||
this.profile = profile;
|
||||
this.current = profile.equals(parameters.profile());
|
||||
|
||||
this.url = parameters.withProfile(profile).renderUrl(SearchFilters.this.url);
|
||||
}
|
||||
|
||||
public String getDisplayName() {
|
||||
return displayName;
|
||||
}
|
||||
|
||||
public boolean isCurrent() {
|
||||
return current;
|
||||
}
|
||||
|
||||
public String getUrl() {
|
||||
return url;
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,105 @@
|
||||
package nu.marginalia.search.model;
|
||||
|
||||
import nu.marginalia.index.query.limit.SpecificationLimit;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchQuery;
|
||||
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
public enum SearchProfile {
|
||||
POPULAR("default", SearchSetIdentifier.POPULAR),
|
||||
SMALLWEB("modern", SearchSetIdentifier.SMALLWEB),
|
||||
BLOGOSPHERE("blogosphere", SearchSetIdentifier.BLOGS),
|
||||
NO_FILTER("corpo", SearchSetIdentifier.NONE),
|
||||
VINTAGE("vintage", SearchSetIdentifier.NONE),
|
||||
TILDE("tilde", SearchSetIdentifier.NONE),
|
||||
CORPO_CLEAN("corpo-clean", SearchSetIdentifier.NONE),
|
||||
ACADEMIA("academia", SearchSetIdentifier.NONE),
|
||||
PLAIN_TEXT("plain-text", SearchSetIdentifier.NONE),
|
||||
FOOD("food", SearchSetIdentifier.POPULAR),
|
||||
FORUM("forum", SearchSetIdentifier.NONE),
|
||||
WIKI("wiki", SearchSetIdentifier.NONE),
|
||||
DOCS("docs", SearchSetIdentifier.NONE),
|
||||
;
|
||||
|
||||
|
||||
public final String filterId;
|
||||
public final SearchSetIdentifier searchSetIdentifier;
|
||||
|
||||
SearchProfile(String filterId, SearchSetIdentifier searchSetIdentifier) {
|
||||
this.filterId = filterId;
|
||||
this.searchSetIdentifier = searchSetIdentifier;
|
||||
}
|
||||
|
||||
private final static SearchProfile[] values = values();
|
||||
public static SearchProfile getSearchProfile(String param) {
|
||||
if (null == param) {
|
||||
return NO_FILTER;
|
||||
}
|
||||
|
||||
for (var profile : values) {
|
||||
if (Objects.equals(profile.filterId, param)) {
|
||||
return profile;
|
||||
}
|
||||
}
|
||||
|
||||
return NO_FILTER;
|
||||
}
|
||||
|
||||
public void addTacitTerms(SearchQuery subquery) {
|
||||
if (this == ACADEMIA) {
|
||||
subquery.searchTermsAdvice.add("special:academia");
|
||||
}
|
||||
if (this == VINTAGE) {
|
||||
subquery.searchTermsPriority.add("format:html123");
|
||||
subquery.searchTermsPriority.add("js:false");
|
||||
}
|
||||
if (this == TILDE) {
|
||||
subquery.searchTermsAdvice.add("special:tilde");
|
||||
}
|
||||
if (this == PLAIN_TEXT) {
|
||||
subquery.searchTermsAdvice.add("format:plain");
|
||||
}
|
||||
if (this == WIKI) {
|
||||
subquery.searchTermsAdvice.add("generator:wiki");
|
||||
}
|
||||
if (this == FORUM) {
|
||||
subquery.searchTermsAdvice.add("generator:forum");
|
||||
}
|
||||
if (this == DOCS) {
|
||||
subquery.searchTermsAdvice.add("generator:docs");
|
||||
}
|
||||
if (this == FOOD) {
|
||||
subquery.searchTermsAdvice.add(HtmlFeature.CATEGORY_FOOD.getKeyword());
|
||||
subquery.searchTermsExclude.add("special:ads");
|
||||
}
|
||||
}
|
||||
|
||||
public SpecificationLimit getYearLimit() {
|
||||
if (this == SMALLWEB) {
|
||||
return SpecificationLimit.greaterThan(2015);
|
||||
}
|
||||
if (this == VINTAGE) {
|
||||
return SpecificationLimit.lessThan(2003);
|
||||
}
|
||||
else return SpecificationLimit.none();
|
||||
}
|
||||
|
||||
public SpecificationLimit getSizeLimit() {
|
||||
if (this == SMALLWEB) {
|
||||
return SpecificationLimit.lessThan(500);
|
||||
}
|
||||
else return SpecificationLimit.none();
|
||||
}
|
||||
|
||||
|
||||
public SpecificationLimit getQualityLimit() {
|
||||
if (this == SMALLWEB) {
|
||||
return SpecificationLimit.lessThan(5);
|
||||
}
|
||||
else return SpecificationLimit.none();
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -0,0 +1,293 @@
|
||||
package nu.marginalia.search.model;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.crawl.DomainIndexingState;
|
||||
import nu.marginalia.model.crawl.HtmlFeature;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* A class to hold details about a single search result.
|
||||
*/
|
||||
public class UrlDetails implements Comparable<UrlDetails> {
|
||||
public long id;
|
||||
public int domainId;
|
||||
|
||||
public EdgeUrl url;
|
||||
public String title;
|
||||
public String description;
|
||||
|
||||
public String format;
|
||||
public int features;
|
||||
|
||||
public DomainIndexingState domainState;
|
||||
|
||||
public double termScore;
|
||||
|
||||
public int resultsFromSameDomain;
|
||||
|
||||
public String positions;
|
||||
public int positionsCount;
|
||||
public SearchResultItem resultItem;
|
||||
public List<SearchResultKeywordScore> keywordScores;
|
||||
|
||||
public UrlDetails(long id, int domainId, EdgeUrl url, String title, String description, String format, int features, DomainIndexingState domainState, double termScore, int resultsFromSameDomain, String positions, int positionsCount, SearchResultItem resultItem, List<SearchResultKeywordScore> keywordScores) {
|
||||
this.id = id;
|
||||
this.domainId = domainId;
|
||||
this.url = url;
|
||||
this.title = title;
|
||||
this.description = description;
|
||||
this.format = format;
|
||||
this.features = features;
|
||||
this.domainState = domainState;
|
||||
this.termScore = termScore;
|
||||
this.resultsFromSameDomain = resultsFromSameDomain;
|
||||
this.positions = positions;
|
||||
this.positionsCount = positionsCount;
|
||||
this.resultItem = resultItem;
|
||||
this.keywordScores = keywordScores;
|
||||
}
|
||||
|
||||
public UrlDetails() {
|
||||
}
|
||||
|
||||
public boolean hasMoreResults() {
|
||||
return resultsFromSameDomain > 1;
|
||||
}
|
||||
|
||||
public String getFormat() {
|
||||
if (null == format) {
|
||||
return "?";
|
||||
}
|
||||
switch (format) {
|
||||
case "HTML123":
|
||||
return "HTML 1-3";
|
||||
case "HTML4":
|
||||
return "HTML 4";
|
||||
case "XHTML":
|
||||
return "XHTML";
|
||||
case "HTML5":
|
||||
return "HTML 5";
|
||||
case "PLAIN":
|
||||
return "Plain Text";
|
||||
default:
|
||||
return "?";
|
||||
}
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return Long.hashCode(id);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(UrlDetails other) {
|
||||
int result = Double.compare(getTermScore(), other.getTermScore());
|
||||
if (result == 0) result = Long.compare(getId(), other.getId());
|
||||
return result;
|
||||
}
|
||||
|
||||
public boolean equals(Object other) {
|
||||
if (other == null) {
|
||||
return false;
|
||||
}
|
||||
if (other == this) {
|
||||
return true;
|
||||
}
|
||||
if (other instanceof UrlDetails) {
|
||||
return ((UrlDetails) other).id == id;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
public String getTitle() {
|
||||
if (title == null || title.isBlank()) {
|
||||
return url.toString();
|
||||
}
|
||||
return title;
|
||||
}
|
||||
|
||||
public boolean isPlainText() {
|
||||
return "PLAIN".equals(format);
|
||||
}
|
||||
|
||||
public int getProblemCount() {
|
||||
int mask = HtmlFeature.JS.getFeatureBit()
|
||||
| HtmlFeature.COOKIES.getFeatureBit()
|
||||
| HtmlFeature.TRACKING.getFeatureBit()
|
||||
| HtmlFeature.AFFILIATE_LINK.getFeatureBit()
|
||||
| HtmlFeature.TRACKING_ADTECH.getFeatureBit()
|
||||
| HtmlFeature.ADVERTISEMENT.getFeatureBit();
|
||||
|
||||
return Integer.bitCount(features & mask);
|
||||
}
|
||||
|
||||
public List<UrlProblem> getProblems() {
|
||||
List<UrlProblem> problems = new ArrayList<>();
|
||||
|
||||
if (isScripts()) {
|
||||
problems.add(new UrlProblem("Js", "The page uses Javascript"));
|
||||
}
|
||||
if (isCookies()) {
|
||||
problems.add(new UrlProblem("Co", "The page uses Cookies"));
|
||||
}
|
||||
if (isTracking()) {
|
||||
problems.add(new UrlProblem("Tr", "The page uses Tracking/Analytics"));
|
||||
}
|
||||
if (isAffiliate()) {
|
||||
problems.add(new UrlProblem("Af", "The page may use Affiliate Linking"));
|
||||
}
|
||||
if (isAds()) {
|
||||
problems.add(new UrlProblem("Ad", "The page uses Ads/Adtech Tracking"));
|
||||
}
|
||||
return problems;
|
||||
|
||||
}
|
||||
|
||||
public boolean isScripts() {
|
||||
return HtmlFeature.hasFeature(features, HtmlFeature.JS);
|
||||
}
|
||||
|
||||
public boolean isTracking() {
|
||||
return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING);
|
||||
}
|
||||
|
||||
public boolean isAffiliate() {
|
||||
return HtmlFeature.hasFeature(features, HtmlFeature.AFFILIATE_LINK);
|
||||
}
|
||||
|
||||
public boolean isMedia() {
|
||||
return HtmlFeature.hasFeature(features, HtmlFeature.MEDIA);
|
||||
}
|
||||
|
||||
public boolean isCookies() {
|
||||
return HtmlFeature.hasFeature(features, HtmlFeature.COOKIES);
|
||||
}
|
||||
|
||||
public boolean isAds() {
|
||||
return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING_ADTECH);
|
||||
}
|
||||
|
||||
public int getMatchRank() {
|
||||
if (termScore <= 1) return 1;
|
||||
if (termScore <= 2) return 2;
|
||||
if (termScore <= 3) return 3;
|
||||
if (termScore <= 5) return 5;
|
||||
|
||||
return 10;
|
||||
}
|
||||
|
||||
public long getId() {
|
||||
return this.id;
|
||||
}
|
||||
|
||||
public int getDomainId() {
|
||||
return this.domainId;
|
||||
}
|
||||
|
||||
public EdgeUrl getUrl() {
|
||||
return this.url;
|
||||
}
|
||||
|
||||
public String getDescription() {
|
||||
return this.description;
|
||||
}
|
||||
|
||||
public int getFeatures() {
|
||||
return this.features;
|
||||
}
|
||||
|
||||
public DomainIndexingState getDomainState() {
|
||||
return this.domainState;
|
||||
}
|
||||
|
||||
public double getTermScore() {
|
||||
return this.termScore;
|
||||
}
|
||||
|
||||
public int getResultsFromSameDomain() {
|
||||
return this.resultsFromSameDomain;
|
||||
}
|
||||
|
||||
public String getPositions() {
|
||||
return this.positions;
|
||||
}
|
||||
|
||||
public int getPositionsCount() {
|
||||
return this.positionsCount;
|
||||
}
|
||||
|
||||
public SearchResultItem getResultItem() {
|
||||
return this.resultItem;
|
||||
}
|
||||
|
||||
public List<SearchResultKeywordScore> getKeywordScores() {
|
||||
return this.keywordScores;
|
||||
}
|
||||
|
||||
public UrlDetails withId(long id) {
|
||||
return this.id == id ? this : new UrlDetails(id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
|
||||
}
|
||||
|
||||
public UrlDetails withDomainId(int domainId) {
|
||||
return this.domainId == domainId ? this : new UrlDetails(this.id, domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
|
||||
}
|
||||
|
||||
public UrlDetails withUrl(EdgeUrl url) {
|
||||
return this.url == url ? this : new UrlDetails(this.id, this.domainId, url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
|
||||
}
|
||||
|
||||
public UrlDetails withTitle(String title) {
|
||||
return this.title == title ? this : new UrlDetails(this.id, this.domainId, this.url, title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
|
||||
}
|
||||
|
||||
public UrlDetails withDescription(String description) {
|
||||
return this.description == description ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
|
||||
}
|
||||
|
||||
public UrlDetails withFormat(String format) {
|
||||
return this.format == format ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
|
||||
}
|
||||
|
||||
public UrlDetails withFeatures(int features) {
|
||||
return this.features == features ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
|
||||
}
|
||||
|
||||
public UrlDetails withDomainState(DomainIndexingState domainState) {
|
||||
return this.domainState == domainState ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
|
||||
}
|
||||
|
||||
public UrlDetails withTermScore(double termScore) {
|
||||
return this.termScore == termScore ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
|
||||
}
|
||||
|
||||
public UrlDetails withResultsFromSameDomain(int resultsFromSameDomain) {
|
||||
return this.resultsFromSameDomain == resultsFromSameDomain ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
|
||||
}
|
||||
|
||||
public UrlDetails withPositions(String positions) {
|
||||
return this.positions == positions ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, positions, this.positionsCount, this.resultItem, this.keywordScores);
|
||||
}
|
||||
|
||||
public UrlDetails withPositionsCount(int positionsCount) {
|
||||
return this.positionsCount == positionsCount ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, positionsCount, this.resultItem, this.keywordScores);
|
||||
}
|
||||
|
||||
public UrlDetails withResultItem(SearchResultItem resultItem) {
|
||||
return this.resultItem == resultItem ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, resultItem, this.keywordScores);
|
||||
}
|
||||
|
||||
public UrlDetails withKeywordScores(List<SearchResultKeywordScore> keywordScores) {
|
||||
return this.keywordScores == keywordScores ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, keywordScores);
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return "UrlDetails(id=" + this.getId() + ", domainId=" + this.getDomainId() + ", url=" + this.getUrl() + ", title=" + this.getTitle() + ", description=" + this.getDescription() + ", format=" + this.getFormat() + ", features=" + this.getFeatures() + ", domainState=" + this.getDomainState() + ", termScore=" + this.getTermScore() + ", resultsFromSameDomain=" + this.getResultsFromSameDomain() + ", positions=" + this.getPositions() + ", positionsCount=" + this.getPositionsCount() + ", resultItem=" + this.getResultItem() + ", keywordScores=" + this.getKeywordScores() + ")";
|
||||
}
|
||||
|
||||
/**
 * A named problem paired with a human-readable description.
 *
 * @param name        short identifier of the problem
 * @param description longer explanatory text
 */
public record UrlProblem(String name, String description) {
    // Nested records are implicitly static, so the modifier is not repeated here.
}
|
||||
}
|
@ -0,0 +1,27 @@
|
||||
package nu.marginalia.search.results;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.browse.model.BrowseResult;
|
||||
import nu.marginalia.screenshot.ScreenshotService;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
@Singleton
|
||||
public class BrowseResultCleaner {
|
||||
private final ScreenshotService screenshotService;
|
||||
|
||||
@Inject
|
||||
public BrowseResultCleaner(ScreenshotService screenshotService) {
|
||||
this.screenshotService = screenshotService;
|
||||
}
|
||||
|
||||
public Predicate<BrowseResult> shouldRemoveResultPredicateBr() {
|
||||
Set<String> domainHashes = new HashSet<>(100);
|
||||
|
||||
return (res) -> !screenshotService.hasScreenshot(res.domainId())
|
||||
|| !domainHashes.add(res.domainHash());
|
||||
}
|
||||
}
|
@ -0,0 +1,69 @@
|
||||
package nu.marginalia.search.results;
|
||||
|
||||
import gnu.trove.list.TLongList;
|
||||
import gnu.trove.list.array.TLongArrayList;
|
||||
import gnu.trove.map.hash.TObjectIntHashMap;
|
||||
import gnu.trove.set.hash.TIntHashSet;
|
||||
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||
import nu.marginalia.lsh.EasyLSH;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
public class UrlDeduplicator {
|
||||
private final int LSH_SIMILARITY_THRESHOLD = 2;
|
||||
private static final Logger logger = LoggerFactory.getLogger(UrlDeduplicator.class);
|
||||
|
||||
private final TIntHashSet seenSuperficialhashes = new TIntHashSet(200);
|
||||
private final TLongList seehLSHList = new TLongArrayList(200);
|
||||
private final TObjectIntHashMap<String> keyCount = new TObjectIntHashMap<>(200, 0.75f, 0);
|
||||
|
||||
private final int resultsPerKey;
|
||||
public UrlDeduplicator(int resultsPerKey) {
|
||||
this.resultsPerKey = resultsPerKey;
|
||||
}
|
||||
|
||||
public boolean shouldRemove(DecoratedSearchResultItem details) {
|
||||
if (!deduplicateOnSuperficialHash(details))
|
||||
return true;
|
||||
if (!deduplicateOnLSH(details))
|
||||
return true;
|
||||
if (!limitResultsPerDomain(details))
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean shouldRetain(DecoratedSearchResultItem details) {
|
||||
return !shouldRemove(details);
|
||||
}
|
||||
|
||||
private boolean deduplicateOnSuperficialHash(DecoratedSearchResultItem details) {
|
||||
return seenSuperficialhashes.add(Objects.hash(details.url.path, details.title));
|
||||
}
|
||||
|
||||
private boolean deduplicateOnLSH(DecoratedSearchResultItem details) {
|
||||
long thisHash = details.dataHash;
|
||||
|
||||
if (0 == thisHash)
|
||||
return true;
|
||||
|
||||
if (seehLSHList.forEach(otherHash -> EasyLSH.hammingDistance(thisHash, otherHash) >= LSH_SIMILARITY_THRESHOLD))
|
||||
{
|
||||
seehLSHList.add(thisHash);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
|
||||
}
|
||||
|
||||
private boolean limitResultsPerDomain(DecoratedSearchResultItem details) {
|
||||
final var domain = details.getUrl().getDomain();
|
||||
final String key = domain.getDomainKey();
|
||||
|
||||
return keyCount.adjustOrPutValue(key, 1, 1) <= resultsPerKey;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,69 @@
|
||||
package nu.marginalia.search.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.WebsiteUrl;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
import spark.Spark;
|
||||
|
||||
import java.sql.SQLException;
|
||||
|
||||
public class SearchAddToCrawlQueueService {
|
||||
|
||||
private final DbDomainQueries domainQueries;
|
||||
private final WebsiteUrl websiteUrl;
|
||||
private final HikariDataSource dataSource;
|
||||
private final Logger logger = LoggerFactory.getLogger(SearchAddToCrawlQueueService.class);
|
||||
|
||||
@Inject
|
||||
public SearchAddToCrawlQueueService(DbDomainQueries domainQueries,
|
||||
WebsiteUrl websiteUrl,
|
||||
HikariDataSource dataSource) {
|
||||
this.domainQueries = domainQueries;
|
||||
this.websiteUrl = websiteUrl;
|
||||
this.dataSource = dataSource;
|
||||
}
|
||||
|
||||
public Object suggestCrawling(Request request, Response response) throws SQLException {
|
||||
logger.info("{}", request.queryParams());
|
||||
int id = Integer.parseInt(request.queryParams("id"));
|
||||
boolean nomisclick = "on".equals(request.queryParams("nomisclick"));
|
||||
|
||||
String domainName = getDomainName(id);
|
||||
|
||||
if (nomisclick) {
|
||||
logger.info("Adding {} to crawl queue", domainName);
|
||||
addToCrawlQueue(id);
|
||||
}
|
||||
else {
|
||||
logger.info("Nomisclick not set, not adding {} to crawl queue", domainName);
|
||||
}
|
||||
|
||||
response.redirect(websiteUrl.withPath("/site/" + domainName));
|
||||
|
||||
return "";
|
||||
}
|
||||
|
||||
private void addToCrawlQueue(int id) throws SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
INSERT IGNORE INTO CRAWL_QUEUE(DOMAIN_NAME, SOURCE)
|
||||
SELECT DOMAIN_NAME, "user" FROM EC_DOMAIN WHERE ID=?
|
||||
""")) {
|
||||
stmt.setInt(1, id);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
|
||||
private String getDomainName(int id) {
|
||||
var domain = domainQueries.getDomain(id);
|
||||
if (domain.isEmpty())
|
||||
Spark.halt(404);
|
||||
return domain.get().toString();
|
||||
}
|
||||
}
|
||||
|
@ -0,0 +1,87 @@
|
||||
package nu.marginalia.search.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.api.domains.DomainInfoClient;
|
||||
import nu.marginalia.api.domains.model.SimilarDomain;
|
||||
import nu.marginalia.browse.DbBrowseDomainsRandom;
|
||||
import nu.marginalia.browse.model.BrowseResult;
|
||||
import nu.marginalia.browse.model.BrowseResultSet;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.db.DomainBlacklist;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.search.results.BrowseResultCleaner;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
|
||||
import static java.util.Collections.shuffle;
|
||||
|
||||
public class SearchBrowseService {
|
||||
private final DbBrowseDomainsRandom randomDomains;
|
||||
private final DbDomainQueries domainQueries;
|
||||
private final DomainBlacklist blacklist;
|
||||
private final DomainInfoClient domainInfoClient;
|
||||
private final BrowseResultCleaner browseResultCleaner;
|
||||
|
||||
@Inject
|
||||
public SearchBrowseService(DbBrowseDomainsRandom randomDomains,
|
||||
DbDomainQueries domainQueries,
|
||||
DomainBlacklist blacklist,
|
||||
DomainInfoClient domainInfoClient,
|
||||
BrowseResultCleaner browseResultCleaner)
|
||||
{
|
||||
this.randomDomains = randomDomains;
|
||||
this.domainQueries = domainQueries;
|
||||
this.blacklist = blacklist;
|
||||
this.domainInfoClient = domainInfoClient;
|
||||
this.browseResultCleaner = browseResultCleaner;
|
||||
}
|
||||
|
||||
public BrowseResultSet getRandomEntries(int set) {
|
||||
List<BrowseResult> results = randomDomains.getRandomDomains(25, blacklist, set);
|
||||
|
||||
results.removeIf(browseResultCleaner.shouldRemoveResultPredicateBr());
|
||||
|
||||
return new BrowseResultSet(results);
|
||||
}
|
||||
|
||||
public BrowseResultSet getRelatedEntries(String domainName) throws ExecutionException, InterruptedException, TimeoutException {
|
||||
var domain = domainQueries.getDomainId(new EdgeDomain(domainName));
|
||||
|
||||
var neighbors = domainInfoClient.similarDomains(domain, 50)
|
||||
.get(100, TimeUnit.MILLISECONDS);
|
||||
|
||||
neighbors.removeIf(sd -> !sd.screenshot());
|
||||
|
||||
// If the results are very few, supplement with the alternative shitty algorithm
|
||||
if (neighbors.size() < 25) {
|
||||
Set<SimilarDomain> allNeighbors = new HashSet<>(neighbors);
|
||||
allNeighbors.addAll(domainInfoClient
|
||||
.linkedDomains(domain, 50)
|
||||
.get(100, TimeUnit.MILLISECONDS)
|
||||
);
|
||||
|
||||
neighbors.clear();
|
||||
neighbors.addAll(allNeighbors);
|
||||
neighbors.removeIf(sd -> !sd.screenshot());
|
||||
}
|
||||
|
||||
List<BrowseResult> results = new ArrayList<>(neighbors.size());
|
||||
for (SimilarDomain sd : neighbors) {
|
||||
var resultDomain = domainQueries.getDomain(sd.domainId());
|
||||
if (resultDomain.isEmpty())
|
||||
continue;
|
||||
|
||||
results.add(new BrowseResult(resultDomain.get().toRootUrlHttp(), sd.domainId(), 0, sd.screenshot()));
|
||||
}
|
||||
// shuffle the items for a less repetitive experience
|
||||
shuffle(neighbors);
|
||||
|
||||
return new BrowseResultSet(results, domainName);
|
||||
}
|
||||
}
|
@ -0,0 +1,69 @@
|
||||
package nu.marginalia.search.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.renderer.MustacheRenderer;
|
||||
import nu.marginalia.renderer.RendererFactory;
|
||||
import nu.marginalia.search.SearchOperator;
|
||||
import nu.marginalia.search.model.UrlDetails;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
import java.util.List;
|
||||
|
||||
public class SearchCrosstalkService {
|
||||
private static final Logger logger = LoggerFactory.getLogger(SearchCrosstalkService.class);
|
||||
private final SearchOperator searchOperator;
|
||||
private final MustacheRenderer<CrosstalkResult> renderer;
|
||||
|
||||
@Inject
|
||||
public SearchCrosstalkService(SearchOperator searchOperator,
|
||||
RendererFactory rendererFactory) throws IOException
|
||||
{
|
||||
this.searchOperator = searchOperator;
|
||||
this.renderer = rendererFactory.renderer("search/site-info/site-crosstalk");
|
||||
}
|
||||
|
||||
public Object handle(Request request, Response response) throws SQLException {
|
||||
String domains = request.queryParams("domains");
|
||||
String[] parts = StringUtils.split(domains, ',');
|
||||
|
||||
if (parts.length != 2) {
|
||||
throw new IllegalArgumentException("Expected exactly two domains");
|
||||
}
|
||||
|
||||
response.type("text/html");
|
||||
|
||||
for (int i = 0; i < parts.length; i++) {
|
||||
parts[i] = parts[i].trim();
|
||||
}
|
||||
|
||||
var resAtoB = searchOperator.doLinkSearch(parts[0], parts[1]);
|
||||
var resBtoA = searchOperator.doLinkSearch(parts[1], parts[0]);
|
||||
|
||||
var model = new CrosstalkResult(parts[0], parts[1], resAtoB, resBtoA);
|
||||
|
||||
return renderer.render(model);
|
||||
}
|
||||
|
||||
|
||||
|
||||
private record CrosstalkResult(String domainA,
|
||||
String domainB,
|
||||
List<UrlDetails> forward,
|
||||
List<UrlDetails> backward)
|
||||
{
|
||||
|
||||
public boolean isFocusDomain() {
|
||||
return true; // Hack to get the search result templates behave well
|
||||
}
|
||||
public boolean hasBoth() {
|
||||
return !forward.isEmpty() && !backward.isEmpty();
|
||||
}
|
||||
|
||||
}
|
||||
}
|
@ -0,0 +1,47 @@
|
||||
package nu.marginalia.search.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.index.api.IndexMqClient;
|
||||
import nu.marginalia.renderer.MustacheRenderer;
|
||||
import nu.marginalia.renderer.RendererFactory;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
|
||||
public class SearchErrorPageService {
|
||||
private final IndexMqClient indexMqClient;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final MustacheRenderer<Object> renderer;
|
||||
|
||||
@Inject
|
||||
public SearchErrorPageService(IndexMqClient indexMqClient,
|
||||
RendererFactory rendererFactory) throws IOException {
|
||||
|
||||
renderer = rendererFactory.renderer("search/error-page-search");
|
||||
|
||||
this.indexMqClient = indexMqClient;
|
||||
}
|
||||
|
||||
public void serveError(Request request, Response rsp) {
|
||||
rsp.body(renderError(request, "Internal error",
|
||||
"""
|
||||
An error occurred when communicating with the search engine index.
|
||||
<p>
|
||||
This is hopefully a temporary state of affairs. It may be due to
|
||||
an upgrade. The index typically takes a about two or three minutes
|
||||
to reload from a cold restart. Thanks for your patience.
|
||||
"""));
|
||||
}
|
||||
|
||||
private String renderError(Request request, String title, String message) {
|
||||
return renderer.render(Map.of("title", title, "message", message,
|
||||
"profile", request.queryParamOrDefault("profile", ""),
|
||||
"js", request.queryParamOrDefault("js", ""),
|
||||
"query", request.queryParamOrDefault("query", "")
|
||||
));
|
||||
}
|
||||
}
|
@ -0,0 +1,85 @@
|
||||
package nu.marginalia.search.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
/** Service for handling flagging sites. This code has an admin-facing correspondent in
|
||||
* DomainComplaintService in control-service
|
||||
*/
|
||||
public class SearchFlagSiteService {
|
||||
private final HikariDataSource dataSource;
|
||||
|
||||
private final CategoryItem unknownCategory = new CategoryItem("unknown", "Unknown");
|
||||
|
||||
private final List<CategoryItem> categories =
|
||||
List.of(
|
||||
new CategoryItem("spam", "Spam"),
|
||||
new CategoryItem("freebooting", "Reposting Stolen Content"),
|
||||
new CategoryItem("broken", "Broken Website"),
|
||||
new CategoryItem("shock", "Shocking/Offensive"),
|
||||
new CategoryItem("blacklist", "Review Blacklisting"),
|
||||
new CategoryItem("no-random", "Remove from Random Exploration")
|
||||
);
|
||||
|
||||
private final Map<String, CategoryItem> categoryItemMap =
|
||||
categories.stream().collect(Collectors.toMap(CategoryItem::categoryName, Function.identity()));
|
||||
@Inject
|
||||
public SearchFlagSiteService(HikariDataSource dataSource) {
|
||||
this.dataSource = dataSource;
|
||||
}
|
||||
|
||||
public List<CategoryItem> getCategories() {
|
||||
return categories;
|
||||
}
|
||||
|
||||
public List<FlagSiteComplaintModel> getExistingComplaints(int id) throws SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var complaintsStmt = conn.prepareStatement("""
|
||||
SELECT CATEGORY, FILE_DATE, REVIEWED, DECISION
|
||||
FROM DOMAIN_COMPLAINT
|
||||
WHERE DOMAIN_ID=?
|
||||
"""))
|
||||
{
|
||||
List<FlagSiteComplaintModel> complaints = new ArrayList<>();
|
||||
|
||||
complaintsStmt.setInt(1, id);
|
||||
ResultSet rs = complaintsStmt.executeQuery();
|
||||
|
||||
while (rs.next()) {
|
||||
complaints.add(new FlagSiteComplaintModel(
|
||||
categoryItemMap.getOrDefault(rs.getString(1), unknownCategory).categoryDesc,
|
||||
rs.getString(2),
|
||||
rs.getBoolean(3),
|
||||
rs.getString(4)));
|
||||
}
|
||||
|
||||
return complaints;
|
||||
}
|
||||
}
|
||||
|
||||
public void insertComplaint(FlagSiteFormData formData) throws SQLException {
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement(
|
||||
"""
|
||||
INSERT INTO DOMAIN_COMPLAINT(DOMAIN_ID, CATEGORY, DESCRIPTION, SAMPLE) VALUES (?, ?, ?, ?)
|
||||
""")) {
|
||||
stmt.setInt(1, formData.domainId);
|
||||
stmt.setString(2, formData.category);
|
||||
stmt.setString(3, formData.description);
|
||||
stmt.setString(4, formData.sampleQuery);
|
||||
stmt.executeUpdate();
|
||||
}
|
||||
}
|
||||
|
||||
public record CategoryItem(String categoryName, String categoryDesc) {}
|
||||
public record FlagSiteComplaintModel(String category, String submitTime, boolean isReviewed, String decision) {}
|
||||
public record FlagSiteFormData(int domainId, String category, String description, String sampleQuery) {}
|
||||
}
|
@ -0,0 +1,117 @@
|
||||
package nu.marginalia.search.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import com.zaxxer.hikari.HikariDataSource;
|
||||
import nu.marginalia.renderer.MustacheRenderer;
|
||||
import nu.marginalia.renderer.RendererFactory;
|
||||
import nu.marginalia.search.svc.SearchQueryCountService;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
import java.time.LocalDate;
|
||||
import java.time.ZoneId;
|
||||
import java.time.ZonedDateTime;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
/** Renders the front page (index) */
|
||||
@Singleton
|
||||
public class SearchFrontPageService {
|
||||
|
||||
private final MustacheRenderer<IndexModel> template;
|
||||
private final HikariDataSource dataSource;
|
||||
private final SearchQueryCountService searchVisitorCount;
|
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@Inject
|
||||
public SearchFrontPageService(RendererFactory rendererFactory,
|
||||
HikariDataSource dataSource,
|
||||
SearchQueryCountService searchVisitorCount
|
||||
) throws IOException {
|
||||
this.template = rendererFactory.renderer("search/index/index");
|
||||
this.dataSource = dataSource;
|
||||
this.searchVisitorCount = searchVisitorCount;
|
||||
}
|
||||
|
||||
public String render(Request request, Response response) {
|
||||
response.header("Cache-control", "public,max-age=3600");
|
||||
|
||||
return template.render(new IndexModel(
|
||||
getNewsItems(),
|
||||
searchVisitorCount.getQueriesPerMinute()
|
||||
));
|
||||
}
|
||||
|
||||
|
||||
private List<NewsItem> getNewsItems() {
|
||||
List<NewsItem> items = new ArrayList<>();
|
||||
|
||||
try (var conn = dataSource.getConnection();
|
||||
var stmt = conn.prepareStatement("""
|
||||
SELECT TITLE, LINK, SOURCE, LIST_DATE FROM SEARCH_NEWS_FEED ORDER BY LIST_DATE DESC
|
||||
""")) {
|
||||
|
||||
var rep = stmt.executeQuery();
|
||||
|
||||
while (rep.next()) {
|
||||
items.add(new NewsItem(
|
||||
rep.getString(1),
|
||||
rep.getString(2),
|
||||
rep.getString(3),
|
||||
rep.getDate(4).toLocalDate()));
|
||||
}
|
||||
}
|
||||
catch (SQLException ex) {
|
||||
logger.warn("Failed to fetch news items", ex);
|
||||
}
|
||||
|
||||
return items;
|
||||
}
|
||||
|
||||
public Object renderNewsFeed(Request request, Response response) {
|
||||
List<NewsItem> newsItems = getNewsItems();
|
||||
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
sb.append("""
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<rss version="2.0">
|
||||
<channel>
|
||||
<title>Marginalia Search News and Mentions</title>
|
||||
<link>https://search.marginalia.nu/</link>
|
||||
<description>News and Mentions of Marginalia Search</description>
|
||||
<language>en-us</language>
|
||||
<ttl>60</ttl>
|
||||
""");
|
||||
|
||||
sb.append("<lastBuildDate>").append(ZonedDateTime.now().format(DateTimeFormatter.RFC_1123_DATE_TIME)).append("</lastBuildDate>\n");
|
||||
sb.append("<pubDate>").append(ZonedDateTime.now().format(DateTimeFormatter.RFC_1123_DATE_TIME)).append("</pubDate>\n");
|
||||
sb.append("<ttl>60</ttl>\n");
|
||||
for (var item : newsItems) {
|
||||
sb.append("<item>\n");
|
||||
sb.append("<title>").append(item.title()).append("</title>\n");
|
||||
sb.append("<link>").append(item.url()).append("</link>\n");
|
||||
if (item.source != null) {
|
||||
sb.append("<author>").append(item.source()).append("</author>\n");
|
||||
}
|
||||
sb.append("<pubDate>").append(item.date().atStartOfDay().atZone(ZoneId.systemDefault()).format(DateTimeFormatter.RFC_1123_DATE_TIME)).append("</pubDate>\n");
|
||||
sb.append("</item>\n");
|
||||
}
|
||||
sb.append("</channel>\n");
|
||||
sb.append("</rss>\n");
|
||||
|
||||
response.type("application/rss+xml");
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
private record IndexModel(List<NewsItem> news, int searchPerMinute) { }
|
||||
private record NewsItem(String title, String url, String source, LocalDate date) {}
|
||||
}
|
@ -0,0 +1,48 @@
|
||||
package nu.marginalia.search.svc;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.google.inject.Singleton;
|
||||
import java.time.temporal.ChronoUnit;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
/** Keeps per-minute statistics of queries */
|
||||
@Singleton
|
||||
public class SearchQueryCountService {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final AtomicInteger lastMinuteQueries = new AtomicInteger();
|
||||
|
||||
private final TimeUnit minute = TimeUnit.of(ChronoUnit.MINUTES);
|
||||
private volatile int queriesPerMinute;
|
||||
|
||||
public SearchQueryCountService() {
|
||||
Thread updateThread = new Thread(this::updateQueriesPerMinute,
|
||||
"SearchVisitorCountService::updateQueriesPerMinute");
|
||||
updateThread.setDaemon(true);
|
||||
updateThread.start();
|
||||
}
|
||||
|
||||
/** Retreive the number of queries performed the minute before this one */
|
||||
public int getQueriesPerMinute() {
|
||||
return queriesPerMinute;
|
||||
}
|
||||
|
||||
/** Update query statistics for presentation */
|
||||
public void registerQuery() {
|
||||
lastMinuteQueries.incrementAndGet();
|
||||
}
|
||||
|
||||
private void updateQueriesPerMinute() {
|
||||
try {
|
||||
for (;;) {
|
||||
queriesPerMinute = lastMinuteQueries.getAndSet(0);
|
||||
minute.sleep(1);
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
logger.warn("Query counter thread was interrupted");
|
||||
}
|
||||
|
||||
}
|
||||
}
|
@ -0,0 +1,62 @@
|
||||
package nu.marginalia.search.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.WebsiteUrl;
|
||||
import nu.marginalia.search.command.CommandEvaluator;
|
||||
import nu.marginalia.search.command.SearchParameters;
|
||||
import nu.marginalia.search.exceptions.RedirectException;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
|
||||
public class SearchQueryService {
|
||||
|
||||
private final WebsiteUrl websiteUrl;
|
||||
private final SearchErrorPageService errorPageService;
|
||||
private final CommandEvaluator searchCommandEvaulator;
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
|
||||
@Inject
|
||||
public SearchQueryService(
|
||||
WebsiteUrl websiteUrl,
|
||||
SearchErrorPageService errorPageService,
|
||||
CommandEvaluator searchCommandEvaulator) {
|
||||
this.websiteUrl = websiteUrl;
|
||||
this.errorPageService = errorPageService;
|
||||
this.searchCommandEvaulator = searchCommandEvaulator;
|
||||
}
|
||||
|
||||
public Object pathSearch(Request request, Response response) {
|
||||
try {
|
||||
return searchCommandEvaulator.eval(response, parseParameters(request));
|
||||
}
|
||||
catch (RedirectException ex) {
|
||||
response.redirect(ex.newUrl);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
logger.error("Error", ex);
|
||||
errorPageService.serveError(request, response);
|
||||
}
|
||||
|
||||
return "";
|
||||
}
|
||||
|
||||
private SearchParameters parseParameters(Request request) {
|
||||
try {
|
||||
final String queryParam = request.queryParams("query");
|
||||
|
||||
if (null == queryParam || queryParam.isBlank()) {
|
||||
throw new RedirectException(websiteUrl.url());
|
||||
}
|
||||
|
||||
return new SearchParameters(queryParam.trim(), request);
|
||||
}
|
||||
catch (Exception ex) {
|
||||
// Bots keep sending bad requests, suppress the error otherwise it will
|
||||
// fill up the logs.
|
||||
|
||||
throw new RedirectException(websiteUrl.url());
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,416 @@
|
||||
package nu.marginalia.search.svc;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import nu.marginalia.api.domains.DomainInfoClient;
|
||||
import nu.marginalia.api.domains.model.DomainInformation;
|
||||
import nu.marginalia.api.domains.model.SimilarDomain;
|
||||
import nu.marginalia.api.feeds.FeedsClient;
|
||||
import nu.marginalia.api.feeds.RpcFeed;
|
||||
import nu.marginalia.api.feeds.RpcFeedItem;
|
||||
import nu.marginalia.api.livecapture.LiveCaptureClient;
|
||||
import nu.marginalia.db.DbDomainQueries;
|
||||
import nu.marginalia.model.EdgeDomain;
|
||||
import nu.marginalia.renderer.MustacheRenderer;
|
||||
import nu.marginalia.renderer.RendererFactory;
|
||||
import nu.marginalia.screenshot.ScreenshotService;
|
||||
import nu.marginalia.search.SearchOperator;
|
||||
import nu.marginalia.search.model.UrlDetails;
|
||||
import nu.marginalia.search.svc.SearchFlagSiteService.FlagSiteFormData;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import spark.Request;
|
||||
import spark.Response;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.concurrent.Future;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.function.Supplier;
|
||||
|
||||
public class SearchSiteInfoService {
|
||||
private static final Logger logger = LoggerFactory.getLogger(SearchSiteInfoService.class);
|
||||
|
||||
private final SearchOperator searchOperator;
|
||||
private final DomainInfoClient domainInfoClient;
|
||||
private final SearchFlagSiteService flagSiteService;
|
||||
private final DbDomainQueries domainQueries;
|
||||
private final MustacheRenderer<Object> renderer;
|
||||
private final FeedsClient feedsClient;
|
||||
private final LiveCaptureClient liveCaptureClient;
|
||||
private final ScreenshotService screenshotService;
|
||||
|
||||
/**
 * Wires up the collaborators and loads the site-info template.
 *
 * @throws IOException if the template cannot be loaded
 */
@Inject
public SearchSiteInfoService(SearchOperator searchOperator,
                             DomainInfoClient domainInfoClient,
                             RendererFactory rendererFactory,
                             SearchFlagSiteService flagSiteService,
                             DbDomainQueries domainQueries,
                             FeedsClient feedsClient,
                             LiveCaptureClient liveCaptureClient,
                             ScreenshotService screenshotService) throws IOException
{
    this.renderer = rendererFactory.renderer("search/site-info/site-info");

    this.searchOperator = searchOperator;
    this.domainInfoClient = domainInfoClient;
    this.flagSiteService = flagSiteService;
    this.domainQueries = domainQueries;
    this.feedsClient = feedsClient;
    this.liveCaptureClient = liveCaptureClient;
    this.screenshotService = screenshotService;
}
|
||||
|
||||
public Object handle(Request request, Response response) throws SQLException {
|
||||
String domainName = request.params("site");
|
||||
String view = request.queryParamOrDefault("view", "info");
|
||||
|
||||
if (null == domainName || domainName.isBlank()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
var model = switch (view) {
|
||||
case "links" -> listLinks(domainName);
|
||||
case "docs" -> listDocs(domainName);
|
||||
case "info" -> listInfo(domainName);
|
||||
case "report" -> reportSite(domainName);
|
||||
default -> listInfo(domainName);
|
||||
};
|
||||
|
||||
return renderer.render(model);
|
||||
}
|
||||
|
||||
public Object handlePost(Request request, Response response) throws SQLException {
|
||||
String domainName = request.params("site");
|
||||
String view = request.queryParamOrDefault("view", "info");
|
||||
|
||||
if (null == domainName || domainName.isBlank()) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (!view.equals("report"))
|
||||
return null;
|
||||
|
||||
final int domainId = domainQueries.getDomainId(new EdgeDomain(domainName));
|
||||
|
||||
FlagSiteFormData formData = new FlagSiteFormData(
|
||||
domainId,
|
||||
request.queryParams("category"),
|
||||
request.queryParams("description"),
|
||||
request.queryParams("sampleQuery")
|
||||
);
|
||||
flagSiteService.insertComplaint(formData);
|
||||
|
||||
var complaints = flagSiteService.getExistingComplaints(domainId);
|
||||
|
||||
var model = new ReportDomain(domainName, domainId, complaints, List.of(), true);
|
||||
|
||||
return renderer.render(model);
|
||||
}
|
||||
|
||||
private Object reportSite(String domainName) throws SQLException {
|
||||
int domainId = domainQueries.getDomainId(new EdgeDomain(domainName));
|
||||
var existingComplaints = flagSiteService.getExistingComplaints(domainId);
|
||||
|
||||
return new ReportDomain(domainName,
|
||||
domainId,
|
||||
existingComplaints,
|
||||
flagSiteService.getCategories(),
|
||||
false);
|
||||
}
|
||||
|
||||
|
||||
private Backlinks listLinks(String domainName) {
|
||||
return new Backlinks(domainName,
|
||||
domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1),
|
||||
searchOperator.doBacklinkSearch(domainName));
|
||||
}
|
||||
|
||||
private SiteInfoWithContext listInfo(String domainName) {
|
||||
|
||||
final int domainId = domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1);
|
||||
|
||||
final Future<DomainInformation> domainInfoFuture;
|
||||
final Future<List<SimilarDomain>> similarSetFuture;
|
||||
final Future<List<SimilarDomain>> linkingDomainsFuture;
|
||||
final CompletableFuture<RpcFeed> feedItemsFuture;
|
||||
String url = "https://" + domainName + "/";
|
||||
|
||||
boolean hasScreenshot = screenshotService.hasScreenshot(domainId);
|
||||
|
||||
|
||||
if (domainId < 0) {
|
||||
domainInfoFuture = CompletableFuture.failedFuture(new Exception("Unknown Domain ID"));
|
||||
similarSetFuture = CompletableFuture.failedFuture(new Exception("Unknown Domain ID"));
|
||||
linkingDomainsFuture = CompletableFuture.failedFuture(new Exception("Unknown Domain ID"));
|
||||
feedItemsFuture = CompletableFuture.failedFuture(new Exception("Unknown Domain ID"));
|
||||
}
|
||||
else if (!domainInfoClient.isAccepting()) {
|
||||
domainInfoFuture = CompletableFuture.failedFuture(new Exception("Assistant Service Unavailable"));
|
||||
similarSetFuture = CompletableFuture.failedFuture(new Exception("Assistant Service Unavailable"));
|
||||
linkingDomainsFuture = CompletableFuture.failedFuture(new Exception("Assistant Service Unavailable"));
|
||||
feedItemsFuture = CompletableFuture.failedFuture(new Exception("Assistant Service Unavailable"));
|
||||
}
|
||||
else {
|
||||
domainInfoFuture = domainInfoClient.domainInformation(domainId);
|
||||
similarSetFuture = domainInfoClient.similarDomains(domainId, 25);
|
||||
linkingDomainsFuture = domainInfoClient.linkedDomains(domainId, 25);
|
||||
feedItemsFuture = feedsClient.getFeed(domainId);
|
||||
}
|
||||
|
||||
List<UrlDetails> sampleResults = searchOperator.doSiteSearch(domainName, domainId,5);
|
||||
if (!sampleResults.isEmpty()) {
|
||||
url = sampleResults.getFirst().url.withPathAndParam("/", null).toString();
|
||||
}
|
||||
|
||||
var result = new SiteInfoWithContext(domainName,
|
||||
domainId,
|
||||
url,
|
||||
hasScreenshot,
|
||||
waitForFuture(domainInfoFuture, () -> createDummySiteInfo(domainName)),
|
||||
waitForFuture(similarSetFuture, List::of),
|
||||
waitForFuture(linkingDomainsFuture, List::of),
|
||||
waitForFuture(feedItemsFuture.thenApply(FeedItems::new), () -> FeedItems.dummyValue(domainName)),
|
||||
sampleResults
|
||||
);
|
||||
|
||||
requestMissingScreenshots(result);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/** Request missing screenshots for the given site info */
|
||||
private void requestMissingScreenshots(SiteInfoWithContext result) {
|
||||
|
||||
// Always request the main site screenshot, even if we already have it
|
||||
// as this will make the live-capture do a staleness check and update
|
||||
// as needed.
|
||||
liveCaptureClient.requestScreengrab(result.domainId());
|
||||
|
||||
int requests = 1;
|
||||
|
||||
// Request screenshots for similar and linking domains only if they are absent
|
||||
// also throttle the requests to at most 5 per view.
|
||||
|
||||
if (result.similar() != null) {
|
||||
for (var similar : result.similar()) {
|
||||
if (similar.screenshot()) {
|
||||
continue;
|
||||
}
|
||||
if (++requests > 5) {
|
||||
break;
|
||||
}
|
||||
|
||||
liveCaptureClient.requestScreengrab(similar.domainId());
|
||||
}
|
||||
}
|
||||
|
||||
if (result.linking() != null) {
|
||||
for (var linking : result.linking()) {
|
||||
if (linking.screenshot()) {
|
||||
continue;
|
||||
}
|
||||
if (++requests > 5) {
|
||||
break;
|
||||
}
|
||||
|
||||
liveCaptureClient.requestScreengrab(linking.domainId());
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private <T> T waitForFuture(Future<T> future, Supplier<T> fallback) {
|
||||
try {
|
||||
return future.get(250, TimeUnit.MILLISECONDS);
|
||||
} catch (Exception e) {
|
||||
logger.info("Failed to get domain data: {}", e.getMessage());
|
||||
return fallback.get();
|
||||
}
|
||||
}
|
||||
|
||||
private DomainInformation createDummySiteInfo(String domainName) {
|
||||
return DomainInformation.builder()
|
||||
.domain(new EdgeDomain(domainName))
|
||||
.suggestForCrawling(true)
|
||||
.unknownDomain(true)
|
||||
.build();
|
||||
}
|
||||
|
||||
private Docs listDocs(String domainName) {
|
||||
int domainId = domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1);
|
||||
return new Docs(domainName,
|
||||
domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1),
|
||||
searchOperator.doSiteSearch(domainName, domainId, 100));
|
||||
}
|
||||
|
||||
public record Docs(Map<String, Boolean> view,
|
||||
String domain,
|
||||
long domainId,
|
||||
List<UrlDetails> results) {
|
||||
public Docs(String domain, long domainId, List<UrlDetails> results) {
|
||||
this(Map.of("docs", true), domain, domainId, results);
|
||||
}
|
||||
|
||||
public String focusDomain() { return domain; }
|
||||
|
||||
public String query() { return "site:" + domain; }
|
||||
|
||||
public boolean isKnown() {
|
||||
return domainId > 0;
|
||||
}
|
||||
}
|
||||
|
||||
public record Backlinks(Map<String, Boolean> view, String domain, long domainId, List<UrlDetails> results) {
|
||||
public Backlinks(String domain, long domainId, List<UrlDetails> results) {
|
||||
this(Map.of("links", true), domain, domainId, results);
|
||||
}
|
||||
|
||||
public String query() { return "links:" + domain; }
|
||||
|
||||
public boolean isKnown() {
|
||||
return domainId > 0;
|
||||
}
|
||||
}
|
||||
|
||||
public record SiteInfoWithContext(Map<String, Boolean> view,
|
||||
Map<String, Boolean> domainState,
|
||||
String domain,
|
||||
int domainId,
|
||||
String siteUrl,
|
||||
boolean hasScreenshot,
|
||||
DomainInformation domainInformation,
|
||||
List<SimilarDomain> similar,
|
||||
List<SimilarDomain> linking,
|
||||
FeedItems feed,
|
||||
List<UrlDetails> samples
|
||||
) {
|
||||
public SiteInfoWithContext(String domain,
|
||||
int domainId,
|
||||
String siteUrl,
|
||||
boolean hasScreenshot,
|
||||
DomainInformation domainInformation,
|
||||
List<SimilarDomain> similar,
|
||||
List<SimilarDomain> linking,
|
||||
FeedItems feedInfo,
|
||||
List<UrlDetails> samples
|
||||
)
|
||||
{
|
||||
this(Map.of("info", true),
|
||||
Map.of(domainInfoState(domainInformation), true),
|
||||
domain,
|
||||
domainId,
|
||||
siteUrl,
|
||||
hasScreenshot,
|
||||
domainInformation,
|
||||
similar,
|
||||
linking,
|
||||
feedInfo,
|
||||
samples);
|
||||
}
|
||||
|
||||
public String getLayout() {
|
||||
// My CSS is too weak to handle this in CSS alone, so I guess we're doing layout in Java...
|
||||
if (similar != null && similar.size() < 25) {
|
||||
return "lopsided";
|
||||
}
|
||||
else if (feed != null && !feed.items().isEmpty()) {
|
||||
return "lopsided";
|
||||
}
|
||||
else if (samples != null && !samples.isEmpty()) {
|
||||
return "lopsided";
|
||||
}
|
||||
else {
|
||||
return "balanced";
|
||||
}
|
||||
}
|
||||
|
||||
public String query() { return "site:" + domain; }
|
||||
|
||||
private static String domainInfoState(DomainInformation info) {
|
||||
if (info.isBlacklisted()) {
|
||||
return "blacklisted";
|
||||
}
|
||||
if (!info.isUnknownDomain() && info.isSuggestForCrawling()) {
|
||||
return "suggestForCrawling";
|
||||
}
|
||||
if (info.isInCrawlQueue()) {
|
||||
return "inCrawlQueue";
|
||||
}
|
||||
if (info.isUnknownDomain()) {
|
||||
return "unknownDomain";
|
||||
}
|
||||
else {
|
||||
return "indexed";
|
||||
}
|
||||
}
|
||||
|
||||
public boolean isKnown() {
|
||||
return domainId > 0;
|
||||
}
|
||||
}
|
||||
|
||||
public record FeedItem(String title, String date, String description, String url) {
|
||||
|
||||
public FeedItem(RpcFeedItem rpcFeedItem) {
|
||||
this(rpcFeedItem.getTitle(),
|
||||
rpcFeedItem.getDate(),
|
||||
rpcFeedItem.getDescription(),
|
||||
rpcFeedItem.getUrl());
|
||||
}
|
||||
|
||||
public String pubDay() { // Extract the date from an ISO style date string
|
||||
if (date.length() > 10) {
|
||||
return date.substring(0, 10);
|
||||
}
|
||||
return date;
|
||||
}
|
||||
|
||||
public String descriptionSafe() {
|
||||
return description
|
||||
.replace("<", "<")
|
||||
.replace(">", ">");
|
||||
}
|
||||
}
|
||||
|
||||
public record FeedItems(String domain, String feedUrl, String updated, List<FeedItem> items) {
|
||||
|
||||
public static FeedItems dummyValue(String domain) {
|
||||
return new FeedItems(domain, "", "", List.of());
|
||||
}
|
||||
|
||||
public FeedItems(RpcFeed rpcFeedItems) {
|
||||
this(rpcFeedItems.getDomain(),
|
||||
rpcFeedItems.getFeedUrl(),
|
||||
rpcFeedItems.getUpdated(),
|
||||
rpcFeedItems.getItemsList().stream().map(FeedItem::new).toList());
|
||||
}
|
||||
}
|
||||
|
||||
public record ReportDomain(
|
||||
Map<String, Boolean> view,
|
||||
String domain,
|
||||
int domainId,
|
||||
List<SearchFlagSiteService.FlagSiteComplaintModel> complaints,
|
||||
List<SearchFlagSiteService.CategoryItem> category,
|
||||
boolean submitted)
|
||||
{
|
||||
public ReportDomain(String domain,
|
||||
int domainId,
|
||||
List<SearchFlagSiteService.FlagSiteComplaintModel> complaints,
|
||||
List<SearchFlagSiteService.CategoryItem> category,
|
||||
boolean submitted) {
|
||||
this(Map.of("report", true), domain, domainId, complaints, category, submitted);
|
||||
}
|
||||
|
||||
public String query() { return "site:" + domain; }
|
||||
|
||||
public boolean isKnown() {
|
||||
return domainId > 0;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,73 @@
|
||||
package nu.marginalia.search.svc;
|
||||
|
||||
import nu.marginalia.api.math.MathClient;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import javax.annotation.CheckForNull;
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import java.util.Optional;
|
||||
import java.util.concurrent.ExecutionException;
|
||||
import java.util.concurrent.Future;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@Singleton
|
||||
public class SearchUnitConversionService {
|
||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
||||
private final Pattern conversionPattern = Pattern.compile("((\\d+|\\s+|[.()\\-^+%*/]|log[^a-z]|log2[^a-z]|sqrt[^a-z]|log10|cos[^a-z]|sin[^a-z]|tan[^a-z]|log2|pi[^a-z]|e[^a-z]|2pi[^a-z])+)\\s*([a-zA-Z][a-zA-Z^.0-9]*\\s?[a-zA-Z^.0-9]*)\\s+in\\s+([a-zA-Z^.0-9]+\\s?[a-zA-Z^.0-9]*)");
|
||||
private final Predicate<String> evalPredicate = Pattern.compile("(\\d+|\\s+|[.()\\-^+%*/]|log|log2|sqrt|log10|cos|sin|tan|pi|e|2pi)+").asMatchPredicate();
|
||||
|
||||
private final MathClient mathClient;
|
||||
|
||||
@Inject
|
||||
public SearchUnitConversionService(MathClient mathClient) {
|
||||
this.mathClient = mathClient;
|
||||
}
|
||||
|
||||
public Optional<String> tryConversion(String query) {
|
||||
var matcher = conversionPattern.matcher(query);
|
||||
if (!matcher.matches())
|
||||
return Optional.empty();
|
||||
|
||||
String value = matcher.group(1);
|
||||
String from = matcher.group(3);
|
||||
String to = matcher.group(4);
|
||||
|
||||
logger.info("{} -> '{}' '{}' '{}'", query, value, from, to);
|
||||
|
||||
try {
|
||||
var resultFuture = mathClient.unitConversion(value, from, to);
|
||||
return Optional.of(
|
||||
resultFuture.get(250, TimeUnit.MILLISECONDS)
|
||||
);
|
||||
} catch (ExecutionException e) {
|
||||
logger.error("Error in unit conversion", e);
|
||||
} catch (InterruptedException e) {
|
||||
logger.error("Interrupted while waiting for unit conversion", e);
|
||||
} catch (TimeoutException e) {
|
||||
// Ignore
|
||||
}
|
||||
return Optional.empty();
|
||||
}
|
||||
|
||||
|
||||
public @CheckForNull Future<String> tryEval(String query) {
|
||||
if (!evalPredicate.test(query)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
var expr = query.toLowerCase().trim();
|
||||
|
||||
if (expr.chars().allMatch(Character::isDigit)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
logger.info("eval({})", expr);
|
||||
|
||||
return mathClient.evalMath(expr);
|
||||
}
|
||||
}
|
@ -0,0 +1,3 @@
|
||||
# Search Service
|
||||
|
||||
This is the old search service that serves search traffic with the old GUI.
|
Before Width: | Height: | Size: 1.2 KiB After Width: | Height: | Size: 1.2 KiB |
Before Width: | Height: | Size: 891 B After Width: | Height: | Size: 891 B |
@ -280,6 +280,7 @@ public class SearchServicePaperDoll extends AbstractModule {
|
||||
true,
|
||||
true,
|
||||
true,
|
||||
true,
|
||||
SimilarDomain.LinkType.FOWARD
|
||||
));
|
||||
dummyLinks.add(new SimilarDomain(
|
||||
@ -290,6 +291,7 @@ public class SearchServicePaperDoll extends AbstractModule {
|
||||
false,
|
||||
false,
|
||||
true,
|
||||
true,
|
||||
SimilarDomain.LinkType.BACKWARD
|
||||
));
|
||||
dummyLinks.add(new SimilarDomain(
|
||||
@ -300,6 +302,7 @@ public class SearchServicePaperDoll extends AbstractModule {
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
false,
|
||||
SimilarDomain.LinkType.BIDIRECTIONAL
|
||||
));
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user