Merge pull request #127 from MarginaliaSearch/serp-redesign

Web UI redesign
This commit is contained in:
Viktor 2025-01-06 16:08:14 +01:00 committed by GitHub
commit be6382e0d0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
192 changed files with 7750 additions and 628 deletions

1
.gitignore vendored
View File

@ -7,3 +7,4 @@ build/
lombok.config
Dockerfile
run
jte-classes

View File

@ -48,6 +48,7 @@ ext {
dockerImageTag='latest'
dockerImageRegistry='marginalia'
jibVersion = '3.4.3'
}
idea {

View File

@ -28,7 +28,7 @@ public class DbDomainQueries {
}
public Integer getDomainId(EdgeDomain domain) {
public Integer getDomainId(EdgeDomain domain) throws NoSuchElementException {
try (var connection = dataSource.getConnection()) {
return domainIdCache.get(domain, () -> {
@ -42,6 +42,9 @@ public class DbDomainQueries {
throw new NoSuchElementException();
});
}
catch (UncheckedExecutionException ex) {
throw new NoSuchElementException();
}
catch (ExecutionException ex) {
throw new RuntimeException(ex.getCause());
}

View File

@ -42,6 +42,12 @@ dependencies {
implementation libs.bundles.curator
implementation libs.bundles.flyway
libs.bundles.jooby.get().each {
implementation dependencies.create(it) {
exclude group: 'org.slf4j'
}
}
testImplementation libs.bundles.slf4j.test
implementation libs.bundles.mariadb

View File

@ -0,0 +1,178 @@
package nu.marginalia.service.server;
import io.jooby.*;
import io.prometheus.client.Counter;
import nu.marginalia.mq.inbox.MqInboxIf;
import nu.marginalia.service.client.ServiceNotAvailableException;
import nu.marginalia.service.discovery.property.ServiceEndpoint;
import nu.marginalia.service.discovery.property.ServiceKey;
import nu.marginalia.service.discovery.property.ServicePartition;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.service.server.jte.JteModule;
import nu.marginalia.service.server.mq.ServiceMqSubscription;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Marker;
import org.slf4j.MarkerFactory;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
public class JoobyService {
private final Logger logger = LoggerFactory.getLogger(getClass());
// Marker for filtering out sensitive content from the persistent logs
private final Marker httpMarker = MarkerFactory.getMarker("HTTP");
private final Initialization initialization;
private final static Counter request_counter = Counter.build("wmsa_request_counter", "Request Counter")
.labelNames("service", "node")
.register();
private final static Counter request_counter_good = Counter.build("wmsa_request_counter_good", "Good Requests")
.labelNames("service", "node")
.register();
private final static Counter request_counter_bad = Counter.build("wmsa_request_counter_bad", "Bad Requests")
.labelNames("service", "node")
.register();
private final static Counter request_counter_err = Counter.build("wmsa_request_counter_err", "Error Requests")
.labelNames("service", "node")
.register();
private final String serviceName;
private static volatile boolean initialized = false;
protected final MqInboxIf messageQueueInbox;
private final int node;
private GrpcServer grpcServer;
private ServiceConfiguration config;
private final List<MvcExtension> joobyServices;
private final ServiceEndpoint restEndpoint;
/**
 * Wires a Jooby-based service into the shared service infrastructure.
 *
 * <p>In order, the constructor: registers the REST endpoint with the service registry,
 * creates and subscribes to a synchronous message queue inbox, queues the start-up
 * callbacks on {@code params.initialization}, installs a default uncaught exception
 * handler, and finally starts the gRPC server.
 *
 * @param params        shared service parameters (configuration, registry, inbox factory, heartbeat, event log)
 * @param partition     partition passed on to the gRPC server
 * @param grpcServices  gRPC services handed to the {@link GrpcServer}
 * @param joobyServices MVC extensions that {@link #startJooby(Jooby)} later mounts
 * @throws Exception declared by the constructor; NOTE(review): presumably propagated from
 *                   service registration or gRPC start-up, confirm against those APIs
 */
public JoobyService(BaseServiceParams params,
ServicePartition partition,
List<DiscoverableService> grpcServices,
List<MvcExtension> joobyServices
) throws Exception {
this.joobyServices = joobyServices;
this.initialization = params.initialization;
config = params.configuration;
node = config.node();
// The message queue inbox is named after the configured service name
String inboxName = config.serviceName();
logger.info("Inbox name: {}", inboxName);
var serviceRegistry = params.serviceRegistry;
// Register the REST endpoint for this service id and node; the returned endpoint
// supplies the host and port used in startJooby()
restEndpoint = serviceRegistry.registerService(ServiceKey.forRest(config.serviceId(), config.node()),
config.instanceUuid(), config.externalAddress());
var mqInboxFactory = params.messageQueueInboxFactory;
messageQueueInbox = mqInboxFactory.createSynchronousInbox(inboxName, config.node(), config.instanceUuid());
messageQueueInbox.subscribe(new ServiceMqSubscription(this));
// Note: the metrics label comes from the "service-name" system property,
// not from config.serviceName()
serviceName = System.getProperty("service-name");
// Work deferred to the Initialization callbacks
initialization.addCallback(params.heartbeat::start);
initialization.addCallback(messageQueueInbox::start);
initialization.addCallback(() -> params.eventLog.logEvent("SVC-INIT", serviceName + ":" + config.node()));
initialization.addCallback(() -> serviceRegistry.announceInstance(config.instanceUuid()));
// This replaces the JVM-wide default handler; every uncaught exception is counted as an error request
Thread.setDefaultUncaughtExceptionHandler((t, e) -> {
if (e instanceof ServiceNotAvailableException) {
// reduce log spam for this common case
logger.error("Service not available: {}", e.getMessage());
}
else {
logger.error("Uncaught exception", e);
}
request_counter_err.labels(serviceName, Integer.toString(node)).inc();
});
// The static 'initialized' flag ensures the gRPC server is started at most once per JVM,
// and only while initialization has not yet been marked ready
if (!initialization.isReady() && ! initialized ) {
initialized = true;
grpcServer = new GrpcServer(config, serviceRegistry, partition, grpcServices);
grpcServer.start();
}
}
/**
 * Configures the given Jooby application: installs the jte template module and static
 * assets, applies the server options (bind address, port, compression), registers the
 * internal health endpoints, mounts the MVC extensions passed to the constructor, and
 * finally registers the request-auditing before/after handlers.
 *
 * @param jooby the Jooby application to configure
 */
public void startJooby(Jooby jooby) {
logger.info("{} Listening to {}:{} ({})", getClass().getSimpleName(),
restEndpoint.host(),
restEndpoint.port(),
config.externalAddress());
// FIXME: This won't work outside of docker, may need to submit a PR to jooby to allow classpaths here
jooby.install(new JteModule(Path.of("/app/resources/jte"), Path.of("/app/classes/jte-precompiled")));
jooby.assets("/*", Paths.get("/app/resources/static"));
var options = new ServerOptions();
options.setHost(config.bindAddress());
options.setPort(restEndpoint.port());
// Enable gzip compression of response data, but set compression to the lowest level
// since it doesn't really save much more space to dial it up. It's typically a
// single digit percentage difference since HTML already compresses very well with level = 1.
options.setCompressionLevel(1);
jooby.setServerOptions(options);
// Internal health endpoints
jooby.get("/internal/ping", ctx -> "pong");
jooby.get("/internal/started", this::isInitialized);
jooby.get("/internal/ready", this::isReady);
for (var service : joobyServices) {
jooby.mvc(service);
}
// NOTE(review): Jooby applies before/after decorators to routes declared after them;
// since these are registered last, confirm that the routes above are meant to be
// excluded from the request counters.
jooby.before(this::auditRequestIn);
jooby.after(this::auditRequestOut);
}
/**
 * Handler for {@code /internal/started}: answers "ok" once initialization is ready,
 * otherwise sets the response code to FAILED_DEPENDENCY and answers "bad".
 */
private Object isInitialized(Context ctx) {
    if (!initialization.isReady()) {
        ctx.setResponseCode(StatusCode.FAILED_DEPENDENCY_CODE);
        return "bad";
    }

    return "ok";
}
/**
 * Readiness flag reported by the {@code /internal/ready} endpoint.
 * This base implementation always reports ready.
 * NOTE(review): public and non-final, so presumably meant to be overridden by services
 * with real readiness conditions; confirm against subclasses.
 */
public boolean isReady() {
return true;
}
/**
 * Handler for {@code /internal/ready}: answers "ok" when {@link #isReady()} is true,
 * otherwise sets the response code to FAILED_DEPENDENCY and answers "bad".
 */
private String isReady(Context ctx) {
    if (!isReady()) {
        ctx.setResponseCode(StatusCode.FAILED_DEPENDENCY_CODE);
        return "bad";
    }

    return "ok";
}
/** Before-handler: counts every incoming request, labelled by service name and node. */
private void auditRequestIn(Context ctx) {
    final String nodeLabel = Integer.toString(node);

    request_counter.labels(serviceName, nodeLabel).inc();
}
/**
 * After-handler: classifies the finished request for the metrics counters.
 *
 * <p>Responses with a status code below 400 count as "good", everything else as "bad".
 * Independently of the status code, a non-null {@code failure} is logged and counted
 * as an error.
 *
 * @param ctx     the request context
 * @param result  the handler's result (unused)
 * @param failure the exception raised while handling the request, or {@code null}
 */
private void auditRequestOut(Context ctx, Object result, Throwable failure) {
    final String nodeLabel = Integer.toString(node);

    if (ctx.getResponseCode().value() < 400) {
        request_counter_good.labels(serviceName, nodeLabel).inc();
    }
    else {
        request_counter_bad.labels(serviceName, nodeLabel).inc();
    }

    if (failure != null) {
        // Parameterized logging; the trailing Throwable is still logged with its stack trace
        logger.error("Request failed {} {}", ctx.getMethod(), ctx.getRequestURL(), failure);
        request_counter_err.labels(serviceName, nodeLabel).inc();
    }
}
}

View File

@ -16,7 +16,7 @@ import spark.Spark;
import java.util.List;
public class Service {
public class SparkService {
private final Logger logger = LoggerFactory.getLogger(getClass());
// Marker for filtering out sensitive content from the persistent logs
@ -43,10 +43,10 @@ public class Service {
private final int node;
private GrpcServer grpcServer;
public Service(BaseServiceParams params,
Runnable configureStaticFiles,
ServicePartition partition,
List<DiscoverableService> grpcServices) throws Exception {
public SparkService(BaseServiceParams params,
Runnable configureStaticFiles,
ServicePartition partition,
List<DiscoverableService> grpcServices) throws Exception {
this.initialization = params.initialization;
var config = params.configuration;
@ -126,18 +126,18 @@ public class Service {
}
}
public Service(BaseServiceParams params,
ServicePartition partition,
List<DiscoverableService> grpcServices) throws Exception {
public SparkService(BaseServiceParams params,
ServicePartition partition,
List<DiscoverableService> grpcServices) throws Exception {
this(params,
Service::defaultSparkConfig,
SparkService::defaultSparkConfig,
partition,
grpcServices);
}
public Service(BaseServiceParams params) throws Exception {
public SparkService(BaseServiceParams params) throws Exception {
this(params,
Service::defaultSparkConfig,
SparkService::defaultSparkConfig,
ServicePartition.any(),
List.of());
}

View File

@ -0,0 +1,61 @@
package nu.marginalia.service.server.jte;
import edu.umd.cs.findbugs.annotations.NonNull;
import edu.umd.cs.findbugs.annotations.Nullable;
import gg.jte.ContentType;
import gg.jte.TemplateEngine;
import gg.jte.resolve.DirectoryCodeResolver;
import io.jooby.*;
import java.io.File;
import java.nio.file.Path;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Stream;
// Temporary workaround for a bug
// APL-2.0 https://github.com/jooby-project/jooby
/**
 * Jooby extension that makes a jte {@link TemplateEngine} available to the application
 * and registers it as the encoder for HTML responses.
 *
 * <p>The engine is either supplied directly, or created on install from a template source
 * directory and an optional directory of compiled template classes.
 */
public class JteModule implements Extension {
    private Path sourceDirectory;
    private Path classDirectory;
    private TemplateEngine templateEngine;

    /** Creates a module from a template source directory and a class directory; both are required. */
    public JteModule(@NonNull Path sourceDirectory, @NonNull Path classDirectory) {
        this.sourceDirectory = Objects.requireNonNull(sourceDirectory, "Source directory is required.");
        this.classDirectory = Objects.requireNonNull(classDirectory, "Class directory is required.");
    }

    /** Creates a module from a template source directory only. */
    public JteModule(@NonNull Path sourceDirectory) {
        this.sourceDirectory = Objects.requireNonNull(sourceDirectory, "Source directory is required.");
    }

    /** Creates a module around an already configured template engine. */
    public JteModule(@NonNull TemplateEngine templateEngine) {
        this.templateEngine = Objects.requireNonNull(templateEngine, "Template engine is required.");
    }

    /**
     * Creates the engine if none was supplied, publishes it in the application's service
     * registry and registers the HTML encoder backed by it.
     */
    public void install(@NonNull Jooby application) {
        if (templateEngine == null) {
            templateEngine = create(application.getEnvironment(), sourceDirectory, classDirectory);
        }

        application.getServices().put(TemplateEngine.class, templateEngine);
        application.encoder(MediaType.html, new JteTemplateEngine(templateEngine));
    }

    /**
     * Builds a template engine for the given environment.
     *
     * <p>When the "dev" or "test" environment is active, templates are resolved from
     * {@code sourceDirectory}, with classes placed in {@code classDirectory}
     * (or {@code sourceDirectory/jte-classes} when that is null), and the class path is
     * taken from the {@code jooby.run.classpath} system property if it is set.
     * Otherwise a precompiled engine is returned.
     *
     * @param environment     the application environment
     * @param sourceDirectory template source directory; required in dev/test
     * @param classDirectory  directory of compiled template classes, may be null
     * @return the configured template engine
     */
    public static TemplateEngine create(@NonNull Environment environment, @NonNull Path sourceDirectory, @Nullable Path classDirectory) {
        if (!environment.isActive("dev", "test")) {
            return classDirectory == null
                    ? TemplateEngine.createPrecompiled(ContentType.Html)
                    : TemplateEngine.createPrecompiled(classDirectory, ContentType.Html);
        }

        Objects.requireNonNull(sourceDirectory, "Source directory is required.");

        Path compiledClassesDir = classDirectory != null
                ? classDirectory
                : sourceDirectory.resolve("jte-classes");

        TemplateEngine engine = TemplateEngine.create(
                new DirectoryCodeResolver(sourceDirectory),
                compiledClassesDir,
                ContentType.Html,
                environment.getClassLoader());

        Optional<List<String>> runClasspath = Optional
                .ofNullable(System.getProperty("jooby.run.classpath"))
                .map(classpath -> classpath.split(File.pathSeparator))
                .map(Stream::of)
                .map(Stream::toList);
        runClasspath.ifPresent(engine::setClassPath);

        return engine;
    }
}

View File

@ -0,0 +1,48 @@
package nu.marginalia.service.server.jte;
import edu.umd.cs.findbugs.annotations.NonNull;
import gg.jte.TemplateEngine;
import io.jooby.Context;
import io.jooby.MapModelAndView;
import io.jooby.ModelAndView;
import io.jooby.buffer.DataBuffer;
import io.jooby.internal.jte.DataBufferOutput;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.List;
// Temporary workaround for a bug
// APL-2.0 https://github.com/jooby-project/jooby
/**
 * Adapter that lets Jooby render {@link ModelAndView} results with a jte template engine.
 * It handles views with the {@code .jte} and {@code .kte} extensions.
 */
class JteTemplateEngine implements io.jooby.TemplateEngine {
    private final TemplateEngine jte;
    private final List<String> extensions = List.of(".jte", ".kte");

    public JteTemplateEngine(TemplateEngine jte) {
        this.jte = jte;
    }

    @NonNull @Override
    public List<String> extensions() {
        return extensions;
    }

    /**
     * Renders the view into a freshly allocated buffer as UTF-8.
     *
     * <p>For map-based models the context attributes are merged in first, so entries in
     * the model override attributes with the same key. Other models are passed to the
     * template as-is.
     */
    @Override
    public DataBuffer render(Context ctx, ModelAndView modelAndView) {
        var target = ctx.getBufferFactory().allocateBuffer();
        var sink = new DataBufferOutput(target, StandardCharsets.UTF_8);
        var contextAttributes = ctx.getAttributes();

        if (!(modelAndView instanceof MapModelAndView mapView)) {
            jte.render(modelAndView.getView(), modelAndView.getModel(), sink);
            return target;
        }

        var mergedModel = new HashMap<String, Object>(contextAttributes);
        mergedModel.putAll(mapView.getModel());
        jte.render(modelAndView.getView(), mergedModel, sink);

        return target;
    }
}

View File

@ -3,7 +3,6 @@ package nu.marginalia.service.server.mq;
import nu.marginalia.mq.MqMessage;
import nu.marginalia.mq.inbox.MqInboxResponse;
import nu.marginalia.mq.inbox.MqSubscription;
import nu.marginalia.service.server.Service;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -15,10 +14,10 @@ import java.util.Map;
public class ServiceMqSubscription implements MqSubscription {
private static final Logger logger = LoggerFactory.getLogger(ServiceMqSubscription.class);
private final Map<String, Method> requests = new HashMap<>();
private final Service service;
private final Object service;
public ServiceMqSubscription(Service service) {
public ServiceMqSubscription(Object service) {
this.service = service;
/* Wire up all methods annotated with @MqRequest and @MqNotification

View File

@ -6,4 +6,8 @@ public record BrowseResultSet(Collection<BrowseResult> results, String focusDoma
public BrowseResultSet(Collection<BrowseResult> results) {
this(results, "");
}
/** Returns true when a focus domain is set, i.e. it is neither null nor blank. */
public boolean hasFocusDomain() {
    if (focusDomain == null) {
        return false;
    }

    return !focusDomain.isBlank();
}
}

View File

@ -38,6 +38,7 @@ public class DomainsProtobufCodec {
sd.getIndexed(),
sd.getActive(),
sd.getScreenshot(),
sd.getFeed(),
SimilarDomain.LinkType.valueOf(sd.getLinkType().name())
);
}

View File

@ -71,6 +71,23 @@ public class DomainInformation {
return new String(Character.toChars(firstChar)) + new String(Character.toChars(secondChar));
}
/**
 * Returns the flag emoji for the ASN's country code, built from the two Unicode
 * regional indicator symbols that correspond to its letters.
 *
 * <p>"UK" is mapped to "GB". An empty string is returned when the country code is
 * missing or is not exactly two upper-case ASCII letters, since any other input would
 * map to code points outside the regional indicator range.
 *
 * @return the flag emoji, or "" if no valid country code is available
 */
public String getAsnFlag() {
    if (asnCountry == null || asnCountry.length() != 2) {
        return "";
    }

    String country = "UK".equals(asnCountry) ? "GB" : asnCountry;

    char first = country.charAt(0);
    char second = country.charAt(1);
    if (first < 'A' || first > 'Z' || second < 'A' || second > 'Z') {
        return "";
    }

    // U+1F1E6 is REGIONAL INDICATOR SYMBOL LETTER A; the other letters follow consecutively
    int regionalIndicatorA = 0x1F1E6;
    int firstChar = regionalIndicatorA + (first - 'A');
    int secondChar = regionalIndicatorA + (second - 'A');

    return new String(Character.toChars(firstChar)) + new String(Character.toChars(secondChar));
}
public EdgeDomain getDomain() {
return this.domain;
}

View File

@ -9,6 +9,7 @@ public record SimilarDomain(EdgeUrl url,
boolean indexed,
boolean active,
boolean screenshot,
boolean feed,
LinkType linkType) {
public String getRankSymbols() {
@ -52,12 +53,12 @@ public record SimilarDomain(EdgeUrl url,
return NONE;
}
public String toString() {
public String faIcon() {
return switch (this) {
case FOWARD -> "&#8594;";
case BACKWARD -> "&#8592;";
case BIDIRECTIONAL -> "&#8646;";
case NONE -> "-";
case FOWARD -> "fa-solid fa-arrow-right";
case BACKWARD -> "fa-solid fa-arrow-left";
case BIDIRECTIONAL -> "fa-solid fa-arrow-right-arrow-left";
case NONE -> "";
};
}

View File

@ -7,4 +7,8 @@ public record DictionaryResponse(String word, List<DictionaryEntry> entries) {
this.word = word;
this.entries = entries.stream().toList(); // Make an immutable copy
}
/** Returns true when the response contains at least one dictionary entry. */
public boolean hasEntries() {
return !entries.isEmpty();
}
}

View File

@ -11,7 +11,7 @@ import nu.marginalia.api.svc.RateLimiterService;
import nu.marginalia.api.svc.ResponseCache;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.service.server.BaseServiceParams;
import nu.marginalia.service.server.Service;
import nu.marginalia.service.server.SparkService;
import nu.marginalia.service.server.mq.MqRequest;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -21,7 +21,7 @@ import spark.Request;
import spark.Response;
import spark.Spark;
public class ApiService extends Service {
public class ApiService extends SparkService {
private final Logger logger = LoggerFactory.getLogger(getClass());
private final Gson gson = GsonFactory.get();
@ -69,7 +69,7 @@ public class ApiService extends Service {
this.searchOperator = searchOperator;
Spark.get("/api/", (rq, rsp) -> {
rsp.redirect("https://memex.marginalia.nu/projects/edge/api.gmi");
rsp.redirect("https://about.marginalia-search.com/article/api/");
return "";
});

View File

@ -9,7 +9,7 @@ import nu.marginalia.renderer.MustacheRenderer;
import nu.marginalia.renderer.RendererFactory;
import nu.marginalia.screenshot.ScreenshotService;
import nu.marginalia.service.server.BaseServiceParams;
import nu.marginalia.service.server.Service;
import nu.marginalia.service.server.SparkService;
import org.jetbrains.annotations.NotNull;
import spark.Request;
import spark.Response;
@ -18,7 +18,7 @@ import spark.Spark;
import java.util.Map;
import java.util.Optional;
public class DatingService extends Service {
public class DatingService extends SparkService {
private final DomainBlacklist blacklist;
private final DbBrowseDomainsSimilarCosine browseSimilarCosine;
private final DbBrowseDomainsRandom browseRandom;

View File

@ -5,7 +5,7 @@ import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.renderer.MustacheRenderer;
import nu.marginalia.renderer.RendererFactory;
import nu.marginalia.service.server.BaseServiceParams;
import nu.marginalia.service.server.Service;
import nu.marginalia.service.server.SparkService;
import nu.marginalia.service.server.StaticResources;
import org.jetbrains.annotations.NotNull;
import spark.Request;
@ -15,7 +15,7 @@ import spark.Spark;
import java.sql.SQLException;
import java.util.*;
public class ExplorerService extends Service {
public class ExplorerService extends SparkService {
private final MustacheRenderer<Object> renderer;
private final HikariDataSource dataSource;

View File

@ -0,0 +1,94 @@
plugins {
id 'java'
id 'io.freefair.sass-base' version '8.4'
id 'io.freefair.sass-java' version '8.4'
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.3'
}
application {
mainClass = 'nu.marginalia.search.SearchMain'
applicationName = 'search-service-legacy'
}
tasks.distZip.enabled = false
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
// Sass compiler settings: source maps are enabled and embedded, and the CSS is
// written in the expanded (non-minified) output style.
sass {
sourceMapEnabled = true
sourceMapEmbed = true
outputStyle = EXPANDED
}
apply from: "$rootProject.projectDir/srcsets.gradle"
apply from: "$rootProject.projectDir/docker.gradle"
dependencies {
// Project modules
implementation project(':code:common:db')
implementation project(':code:common:model')
implementation project(':code:common:service')
implementation project(':code:common:config')
implementation project(':code:index:query')
implementation project(':code:libraries:easy-lsh')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:braille-block-punch-cards')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:functions:live-capture:api')
implementation project(':code:functions:math:api')
implementation project(':code:functions:domain-info:api')
implementation project(':code:functions:search-query:api')
implementation project(':code:index:api')
implementation project(':code:common:renderer')
implementation project(':code:features-search:screenshots')
implementation project(':code:features-search:random-websites')
// Third-party libraries from the version catalog
implementation libs.bundles.slf4j
implementation libs.roaringbitmap
implementation libs.prometheus
implementation libs.notnull
implementation libs.guava
// Guice is added without its transitive Guava; Guava is declared explicitly above
implementation dependencies.create(libs.guice.get()) {
exclude group: 'com.google.guava'
}
implementation libs.handlebars
// Spark is added without its transitive Jetty; the Jetty bundle below is used instead
implementation dependencies.create(libs.spark.get()) {
exclude group: 'org.eclipse.jetty'
}
implementation libs.bundles.jetty
implementation libs.opencsv
implementation libs.trove
implementation libs.fastutil
implementation libs.bundles.gson
implementation libs.bundles.mariadb
implementation libs.bundles.nlp
// Test dependencies
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
testImplementation libs.commons.codec
testImplementation 'org.testcontainers:mariadb:1.17.4'
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
testImplementation project(':code:libraries:test-helpers')
}
// Registers a separate Test task that runs only the JUnit Platform tests tagged
// "paperdoll", with the runPaperDoll system property set and JVM preview features enabled.
tasks.register('paperDoll', Test) {
useJUnitPlatform {
includeTags "paperdoll"
}
jvmArgs = [ '-DrunPaperDoll=true', '--enable-preview' ]
}

View File

@ -0,0 +1,47 @@
package nu.marginalia.search;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.service.MainClass;
import nu.marginalia.service.discovery.ServiceRegistryIf;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.service.module.ServiceDiscoveryModule;
import nu.marginalia.service.ServiceId;
import nu.marginalia.service.module.ServiceConfigurationModule;
import nu.marginalia.service.module.DatabaseModule;
import nu.marginalia.service.server.Initialization;
import spark.Spark;
/**
 * Entry point of the search service: builds the Guice injector, orchestrates the boot
 * order against the service registry, instantiates the service and marks initialization
 * as ready.
 */
public class SearchMain extends MainClass {
private final SearchService service;
@Inject
public SearchMain(SearchService service) {
this.service = service;
}
public static void main(String... args) {
init(ServiceId.Search, args);
// Static files are served from /static/search/; this is set before the injector is created
Spark.staticFileLocation("/static/search/");
Injector injector = Guice.createInjector(
new SearchModule(),
new ServiceConfigurationModule(ServiceId.Search),
new ServiceDiscoveryModule(),
new DatabaseModule(false)
);
// Orchestrate the boot order for the services
var registry = injector.getInstance(ServiceRegistryIf.class);
var configuration = injector.getInstance(ServiceConfiguration.class);
orchestrateBoot(registry, configuration);
// Requesting SearchMain makes Guice construct the injected SearchService
injector.getInstance(SearchMain.class);
// Only after the service has been constructed is initialization flagged as ready
injector.getInstance(Initialization.class).setReady();
}
}

View File

@ -0,0 +1,20 @@
package nu.marginalia.search;
import com.google.inject.AbstractModule;
import nu.marginalia.LanguageModels;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.WmsaHome;
import nu.marginalia.renderer.config.HandlebarsConfigurator;
/**
 * Guice bindings for the search service: the Handlebars configurator, the language
 * models and the public website URL.
 */
public class SearchModule extends AbstractModule {

    public void configure() {
        bind(HandlebarsConfigurator.class).to(SearchHandlebarsConfigurator.class);
        bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());

        // The public URL can be overridden with the search.websiteUrl system property
        final String websiteUrl = System.getProperty("search.websiteUrl", "https://search.marginalia.nu/");
        bind(WebsiteUrl.class).toInstance(new WebsiteUrl(websiteUrl));
    }
}

View File

@ -0,0 +1,266 @@
package nu.marginalia.search;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.api.math.MathClient;
import nu.marginalia.api.searchquery.QueryClient;
import nu.marginalia.api.searchquery.model.query.QueryResponse;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.bbpc.BrailleBlockPunchCards;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.search.command.SearchParameters;
import nu.marginalia.search.model.ClusteredUrlDetails;
import nu.marginalia.search.model.DecoratedSearchResults;
import nu.marginalia.search.model.SearchFilters;
import nu.marginalia.search.model.UrlDetails;
import nu.marginalia.search.results.UrlDeduplicator;
import nu.marginalia.search.svc.SearchQueryCountService;
import nu.marginalia.search.svc.SearchUnitConversionService;
import org.apache.logging.log4j.util.Strings;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Marker;
import org.slf4j.MarkerFactory;
import javax.annotation.Nullable;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
@Singleton
public class SearchOperator {
private static final Logger logger = LoggerFactory.getLogger(SearchOperator.class);
// Marker for filtering out sensitive content from the persistent logs
private final Marker queryMarker = MarkerFactory.getMarker("QUERY");
private final MathClient mathClient;
private final DbDomainQueries domainQueries;
private final QueryClient queryClient;
private final SearchQueryParamFactory paramFactory;
private final WebsiteUrl websiteUrl;
private final SearchUnitConversionService searchUnitConversionService;
private final SearchQueryCountService searchVisitorCount;
/** Creates the operator from its injected collaborators. */
@Inject
public SearchOperator(MathClient mathClient,
                      DbDomainQueries domainQueries,
                      QueryClient queryClient,
                      SearchQueryParamFactory paramFactory,
                      WebsiteUrl websiteUrl,
                      SearchUnitConversionService searchUnitConversionService,
                      SearchQueryCountService searchVisitorCount)
{
    // Remote clients
    this.mathClient = mathClient;
    this.queryClient = queryClient;

    // Local services and helpers
    this.domainQueries = domainQueries;
    this.paramFactory = paramFactory;
    this.searchUnitConversionService = searchUnitConversionService;
    this.searchVisitorCount = searchVisitorCount;

    this.websiteUrl = websiteUrl;
}
/**
 * Runs a site-restricted search for the given domain.
 *
 * @param domain   the domain name used in the query
 * @param domainId the domain's id, passed to the parameter factory
 * @param count    result count passed to the parameter factory
 * @return the deduplicated search results
 */
public List<UrlDetails> doSiteSearch(String domain, int domainId, int count) {
    return getResultsFromQuery(
            queryClient.search(paramFactory.forSiteSearch(domain, domainId, count)));
}
/** Runs a backlink search for the given domain and returns the deduplicated results. */
public List<UrlDetails> doBacklinkSearch(String domain) {
    return getResultsFromQuery(
            queryClient.search(paramFactory.forBacklinkSearch(domain)));
}
/** Runs a link search from {@code source} to {@code dest} and returns the deduplicated results. */
public List<UrlDetails> doLinkSearch(String source, String dest) {
    return getResultsFromQuery(
            queryClient.search(paramFactory.forLinkSearch(source, dest)));
}
/**
 * Performs the full user-facing search: runs the query, clusters the results, and
 * decorates them with the evaluation result, problem hints, filters, focus domain
 * information and pagination links.
 *
 * @param userParams the search parameters supplied by the user
 * @return the decorated search results to render
 * @throws InterruptedException declared by the signature; NOTE(review): not visibly
 *                              thrown by the code in this method, confirm callee contracts
 */
public DecoratedSearchResults doSearch(SearchParameters userParams) throws InterruptedException {
// The full user-facing search query does additional work to try to evaluate the query
// e.g. as a unit conversion query. This is done in parallel with the regular search.
Future<String> eval = searchUnitConversionService.tryEval(userParams.query());
// Perform the regular search
var queryParams = paramFactory.forRegularSearch(userParams);
QueryResponse queryResponse = queryClient.search(queryParams);
var queryResults = getResultsFromQuery(queryResponse);
// Cluster the results based on the query response
List<ClusteredUrlDetails> clusteredResults = SearchResultClusterer
.selectStrategy(queryResponse)
.clusterResults(queryResults, 25);
// Log the query and results
logger.info(queryMarker, "Human terms: {}", Strings.join(queryResponse.searchTermsHuman(), ','));
logger.info(queryMarker, "Search Result Count: {}", queryResults.size());
// Get the evaluation result and other data to return to the user
// (the future is awaited with a short timeout; "" is used if it is unavailable)
String evalResult = getFutureOrDefault(eval, "");
String focusDomain = queryResponse.domain();
// -1 marks "no focus domain", or a focus domain without a known id
int focusDomainId = focusDomain == null
? -1
: domainQueries.tryGetDomainId(new EdgeDomain(focusDomain)).orElse(-1);
List<String> problems = getProblems(evalResult, queryResults, queryResponse);
// One pagination entry per result page, flagging the page the user is currently on
List<DecoratedSearchResults.Page> resultPages = IntStream.rangeClosed(1, queryResponse.totalPages())
.mapToObj(number -> new DecoratedSearchResults.Page(
number,
number == userParams.page(),
userParams.withPage(number).renderUrl(websiteUrl)
))
.toList();
// Return the results to the user
return DecoratedSearchResults.builder()
.params(userParams)
.problems(problems)
.evalResult(evalResult)
.results(clusteredResults)
.filters(new SearchFilters(websiteUrl, userParams))
.focusDomain(focusDomain)
.focusDomainId(focusDomainId)
.resultPages(resultPages)
.build();
}
/**
 * Converts a query response into the list of results to show: deduplicates the results
 * using the per-domain limit from the query specification, caps them at the total
 * result limit, and maps each remaining item to {@link UrlDetails}.
 *
 * <p>Every call also registers one query with the query counter.
 */
public List<UrlDetails> getResultsFromQuery(QueryResponse queryResponse) {
    final QueryLimits queryLimits = queryResponse.specs().queryLimits;
    final UrlDeduplicator perDomainFilter = new UrlDeduplicator(queryLimits.resultsByDomain());

    // Update the query count (this is what you see on the front page)
    searchVisitorCount.registerQuery();

    return queryResponse.results()
            .stream()
            .filter(item -> perDomainFilter.shouldRetain(item))
            .limit(queryLimits.resultsTotal())
            .map(item -> createDetails(item))
            .toList();
}
/**
 * Maps a decorated search result item to the {@link UrlDetails} model. The URL is passed
 * through {@link #cleanUrl(EdgeUrl)} and the indexing state is always reported as ACTIVE.
 */
private static UrlDetails createDetails(DecoratedSearchResultItem item) {
    var positionMask = item.bestPositions;
    var rawResult = item.rawIndexResult;

    return new UrlDetails(
            item.documentId(),
            item.domainId(),
            cleanUrl(item.url),
            item.title,
            item.description,
            item.format,
            item.features,
            DomainIndexingState.ACTIVE,
            item.rankingScore, // termScore
            item.resultsFromDomain,
            BrailleBlockPunchCards.printBits(positionMask, 64),
            Long.bitCount(positionMask),
            rawResult,
            rawResult.keywordScores
    );
}
/**
 * Replace nuisance domains with replacements where available.
 *
 * <ul>
 *   <li>{@code *.fandom.com/.../wiki/...} is rewritten to breezewiki.com, with the
 *       subdomain as the first path segment.</li>
 *   <li>{@code <sub>.medium.com/<path>} is rewritten to scribe.rip with the same path.</li>
 *   <li>{@code medium.com/<first>/<rest>} is rewritten to scribe.rip with the first
 *       path segment dropped.</li>
 * </ul>
 *
 * Any URL that does not match one of these shapes is returned unchanged. That includes
 * medium.com paths with a single segment, which previously caused a
 * {@link StringIndexOutOfBoundsException}.
 *
 * @param url the result URL
 * @return the replacement URL, or {@code url} itself when no replacement applies
 */
private static EdgeUrl cleanUrl(EdgeUrl url) {
    String topdomain = url.domain.topDomain;
    String subdomain = url.domain.subDomain;
    String path = url.path;

    if (topdomain.equals("fandom.com")) {
        int wikiIndex = path.indexOf("/wiki/");
        if (wikiIndex >= 0) {
            return new EdgeUrl("https", new EdgeDomain("breezewiki.com"), null, "/" + subdomain + path.substring(wikiIndex), null);
        }
    }
    else if (topdomain.equals("medium.com")) {
        if (!subdomain.isBlank()) {
            return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, path, null);
        }

        // Only rewrite when there is a second path segment to keep; indexOf returns -1
        // for paths such as "/" or "/about", which must not be passed to substring()
        int articleIndex = path.indexOf("/", 1);
        if (articleIndex >= 0) {
            return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, path.substring(articleIndex), null);
        }
    }

    return url;
}
/**
 * Collects user-facing hints about the query: the problems reported by the query
 * service, a rephrasing tip and spelling suggestions when there are few results, and a
 * tip about the {@code define:} syntax for dictionary-like queries.
 *
 * <p>Two conditions were corrected relative to the earlier implementation:
 * <ul>
 *   <li>The site-search guard now skips hints when a focus domain IS present, as its
 *       comment describes; a null or blank domain is treated as "not a site search".</li>
 *   <li>A blank evaluation result is treated the same as a missing one, since the
 *       caller passes "" when no evaluation result is available.</li>
 * </ul>
 *
 * @param evalResult   the evaluation result for the query; null or blank when absent
 * @param queryResults the results that will be shown
 * @param response     the query response
 * @return the list of hints, possibly empty
 * @throws InterruptedException declared for interface compatibility
 */
private List<String> getProblems(String evalResult, List<UrlDetails> queryResults, QueryResponse response) throws InterruptedException {

    // We don't debug the query if it's a site search
    String focusDomain = response.domain();
    if (focusDomain != null && !focusDomain.isBlank())
        return List.of();

    final List<String> problems = new ArrayList<>(response.problems());

    boolean hasEvalResult = evalResult != null && !evalResult.isBlank();

    if (queryResults.size() <= 5 && !hasEvalResult) {
        problems.add("Try rephrasing the query, changing the word order or using synonyms to get different results.");

        // Try to spell check the search terms
        var suggestions = getFutureOrDefault(
                mathClient.spellCheck(response.searchTermsHuman()),
                Map.of()
        );

        suggestions.forEach((term, suggestion) -> {
            if (suggestion.size() > 1) {
                String suggestionsStr = "\"%s\" could be spelled %s".formatted(term, suggestion.stream().map(s -> "\"" + s + "\"").collect(Collectors.joining(", ")));
                problems.add(suggestionsStr);
            }
        });
    }

    Set<String> representativeKeywords = response.getAllKeywords();
    if (representativeKeywords.size() > 1 && (representativeKeywords.contains("definition") || representativeKeywords.contains("define") || representativeKeywords.contains("meaning")))
    {
        problems.add("Tip: Try using a query that looks like <tt>define:word</tt> if you want a dictionary definition");
    }

    return problems;
}
/**
 * Waits up to 50 ms for the future's value, falling back to {@code defaultValue}.
 *
 * @see #getFutureOrDefault(Future, Duration, Object)
 */
private <T> T getFutureOrDefault(@Nullable Future<T> fut, T defaultValue) {
    return getFutureOrDefault(fut, Duration.ofMillis(50), defaultValue);
}

/**
 * Waits for the future's value for at most {@code timeout}.
 *
 * <p>This is a best-effort lookup: a null or cancelled future, a timeout, or any
 * failure yields {@code defaultValue}; failures are logged as warnings. If the waiting
 * thread is interrupted, its interrupt flag is restored before returning so that
 * callers further up can still observe the interruption.
 *
 * @param fut          the future to wait for, may be null
 * @param timeout      the maximum time to wait
 * @param defaultValue the value returned when no result is available
 * @return the future's value, or {@code defaultValue}
 */
private <T> T getFutureOrDefault(@Nullable Future<T> fut, Duration timeout, T defaultValue) {
    if (fut == null || fut.isCancelled()) {
        return defaultValue;
    }

    try {
        return fut.get(timeout.toMillis(), TimeUnit.MILLISECONDS);
    }
    catch (InterruptedException ex) {
        // Restore the interrupt flag instead of silently consuming the interruption
        Thread.currentThread().interrupt();
        logger.warn("Error fetching eval result", ex);
        return defaultValue;
    }
    catch (Exception ex) {
        logger.warn("Error fetching eval result", ex);
        return defaultValue;
    }
}
}

View File

@ -0,0 +1,104 @@
package nu.marginalia.search;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.search.command.SearchParameters;
import java.util.List;
/**
 * Builds the {@link QueryParams} for the different kinds of searches issued by the
 * search service.
 */
public class SearchQueryParamFactory {

    /**
     * Parameters for a regular user search. The tacit terms implied by the selected
     * profile, JS setting and adtech setting are collected into a prototype query, and
     * the profile's quality, year and size limits are applied.
     */
    public QueryParams forRegularSearch(SearchParameters userParams) {
        SearchQuery tacitTerms = new SearchQuery();
        var profile = userParams.profile();

        profile.addTacitTerms(tacitTerms);
        userParams.js().addTacitTerms(tacitTerms);
        userParams.adtech().addTacitTerms(tacitTerms);

        return new QueryParams(
                userParams.query(),
                null,
                tacitTerms.searchTermsInclude,
                tacitTerms.searchTermsExclude,
                tacitTerms.searchTermsPriority,
                tacitTerms.searchTermsAdvice,
                profile.getQualityLimit(),
                profile.getYearLimit(),
                profile.getSizeLimit(),
                SpecificationLimit.none(),
                List.of(),
                new QueryLimits(5, 100, 200, 8192),
                profile.searchSetIdentifier.name(),
                userParams.strategy(),
                userParams.temporalBias(),
                userParams.page()
        );
    }

    /** Parameters for a {@code site:} search restricted to the given domain id. */
    public QueryParams forSiteSearch(String domain, int domainId, int count) {
        return unfilteredQuery("site:"+domain,
                List.of(domainId),
                new QueryLimits(count, count, 100, 512));
    }

    /** Parameters for a {@code links:} search listing pages that link to the domain. */
    public QueryParams forBacklinkSearch(String domain) {
        return unfilteredQuery("links:"+domain,
                List.of(),
                new QueryLimits(100, 100, 100, 512));
    }

    /** Parameters for a search for links from {@code sourceDomain} to {@code destDomain}. */
    public QueryParams forLinkSearch(String sourceDomain, String destDomain) {
        return unfilteredQuery("site:" + sourceDomain + " links:" + destDomain,
                List.of(),
                new QueryLimits(100, 100, 100, 512));
    }

    /**
     * Shared shape of the site/backlink/link searches: no extra terms, no specification
     * limits, no search set, automatic strategy, no temporal bias, first page.
     */
    private static QueryParams unfilteredQuery(String humanQuery, List<Integer> domainIds, QueryLimits limits) {
        return new QueryParams(humanQuery,
                null,
                List.of(),
                List.of(),
                List.of(),
                List.of(),
                SpecificationLimit.none(),
                SpecificationLimit.none(),
                SpecificationLimit.none(),
                SpecificationLimit.none(),
                domainIds,
                limits,
                SearchSetIdentifier.NONE.name(),
                QueryStrategy.AUTO,
                ResultRankingParameters.TemporalBias.NONE,
                1
        );
    }
}

View File

@ -0,0 +1,53 @@
package nu.marginalia.search;
import nu.marginalia.api.searchquery.model.query.QueryResponse;
import nu.marginalia.search.model.ClusteredUrlDetails;
import nu.marginalia.search.model.UrlDetails;
import java.util.List;
import java.util.stream.Collectors;
/** Static helpers for grouping search results into clusters; not instantiable. */
public class SearchResultClusterer {
    private SearchResultClusterer() {}

    /** A way of turning a flat list of results into a list of clusters. */
    public interface SearchResultClusterStrategy {
        List<ClusteredUrlDetails> clusterResults(List<UrlDetails> results, int total);
    }

    /**
     * Picks the clustering strategy for a response: results are clustered by domain,
     * unless the response has a non-blank domain, in which case they are left alone.
     */
    public static SearchResultClusterStrategy selectStrategy(QueryResponse response) {
        final String domain = response.domain();

        if (domain == null || domain.isBlank()) {
            return SearchResultClusterer::byDomain;
        }

        return SearchResultClusterer::noOp;
    }

    /** Wraps each result in a cluster of its own, keeping the order; {@code total} is not used. */
    private static List<ClusteredUrlDetails> noOp(List<UrlDetails> results, int total) {
        if (results.isEmpty()) {
            return List.of();
        }

        var singletons = results.stream().map(ClusteredUrlDetails::new);

        return singletons.toList();
    }

    /**
     * Groups the results by domain id and returns at most {@code total} clusters,
     * in the natural order of {@link ClusteredUrlDetails}.
     */
    private static List<ClusteredUrlDetails> byDomain(List<UrlDetails> results, int total) {
        if (results.isEmpty()) {
            return List.of();
        }

        var resultsByDomain = results.stream()
                .collect(Collectors.groupingBy(details -> details.domainId));

        return resultsByDomain.values().stream()
                .map(ClusteredUrlDetails::new)
                .sorted()
                .limit(total)
                .toList();
    }
}

View File

@ -0,0 +1,128 @@
package nu.marginalia.search;
import com.google.inject.Inject;
import io.prometheus.client.Counter;
import io.prometheus.client.Histogram;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.search.svc.*;
import nu.marginalia.service.server.BaseServiceParams;
import nu.marginalia.service.server.SparkService;
import nu.marginalia.service.server.StaticResources;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
import spark.Route;
import spark.Spark;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
/**
 * HTTP entry point of the search service. Registers the Spark routes of the
 * search UI and wraps each of them with request-time metrics.
 */
public class SearchService extends SparkService {

    private final WebsiteUrl websiteUrl;
    private final StaticResources staticResources;

    private static final Logger logger = LoggerFactory.getLogger(SearchService.class);

    private static final Histogram wmsa_search_service_request_time = Histogram.build()
            .name("wmsa_search_service_request_time")
            .linearBuckets(0.05, 0.05, 15)
            .labelNames("matchedPath", "method")
            .help("Search service request time (seconds)")
            .register();

    private static final Counter wmsa_search_service_error_count = Counter.build()
            .name("wmsa_search_service_error_count")
            .labelNames("matchedPath", "method")
            .help("Search service error count")
            .register();

    /**
     * Registers all routes and the error handler, then blocks until Spark has
     * finished initializing.
     */
    @Inject
    public SearchService(BaseServiceParams params,
                         WebsiteUrl websiteUrl,
                         StaticResources staticResources,
                         SearchFrontPageService frontPageService,
                         SearchErrorPageService errorPageService,
                         SearchAddToCrawlQueueService addToCrawlQueueService,
                         SearchSiteInfoService siteInfoService,
                         SearchCrosstalkService crosstalkService,
                         SearchQueryService searchQueryService)
            throws Exception
    {
        super(params);

        this.websiteUrl = websiteUrl;
        this.staticResources = staticResources;

        Spark.staticFiles.expireTime(600);

        // The registration order below is kept as-is, in case route matching depends on it
        SearchServiceMetrics.get("/search", searchQueryService::pathSearch);
        SearchServiceMetrics.get("/", frontPageService::render);
        SearchServiceMetrics.get("/news.xml", frontPageService::renderNewsFeed);
        SearchServiceMetrics.get("/:resource", this::serveStatic);

        SearchServiceMetrics.post("/site/suggest/", addToCrawlQueueService::suggestCrawling);

        SearchServiceMetrics.get("/site-search/:site/*", this::siteSearchRedir);

        SearchServiceMetrics.get("/site/:site", siteInfoService::handle);
        SearchServiceMetrics.post("/site/:site", siteInfoService::handlePost);

        SearchServiceMetrics.get("/crosstalk/", crosstalkService::handle);

        Spark.exception(Exception.class, (e,p,q) -> {
            logger.error("Error during processing", e);
            // NOTE(review): this labels the counter with the raw request path rather than
            // the matched route, which may create many label values -- confirm this is wanted.
            wmsa_search_service_error_count.labels(p.pathInfo(), p.requestMethod()).inc();
            errorPageService.serveError(p, q);
        });

        Spark.awaitInitialization();
    }

    /** Wraps a route with a timer and a counter */
    private static class SearchServiceMetrics implements Route {
        private final Route delegatedRoute;

        static void get(String path, Route route) {
            Spark.get(path, new SearchServiceMetrics(route));
        }

        static void post(String path, Route route) {
            Spark.post(path, new SearchServiceMetrics(route));
        }

        private SearchServiceMetrics(Route delegatedRoute) {
            this.delegatedRoute = delegatedRoute;
        }

        @Override
        public Object handle(Request request, Response response) throws Exception {
            return wmsa_search_service_request_time
                    .labels(request.matchedPath(), request.requestMethod())
                    .time(() -> delegatedRoute.handle(request, response));
        }
    }

    /** Serves the static resource named by the {@code :resource} path parameter. */
    private Object serveStatic(Request request, Response response) {
        String resource = request.params("resource");
        staticResources.serveStatic("search", resource, request, response);
        return "";
    }

    /**
     * Redirects {@code /site-search/:site/<terms>} to the regular search page,
     * with a query of the form {@code <terms> site:<site>}.
     */
    private Object siteSearchRedir(Request request, Response response) {
        final String site = request.params("site");

        final String[] splat = request.splat();
        final String searchTerms = (splat.length == 0) ? "" : splat[0];

        // The trim has to happen before encoding: afterwards, the surrounding spaces
        // have already been turned into '+' characters and trimming has no effect.
        final String query = URLEncoder.encode(
                String.format("%s site:%s", searchTerms, site).trim(),
                StandardCharsets.UTF_8);

        // The profile is user-supplied, so it needs encoding before it can
        // safely be made part of the redirect URL.
        final String profile = URLEncoder.encode(
                request.queryParamOrDefault("profile", "yolo"),
                StandardCharsets.UTF_8);

        response.redirect(websiteUrl.withPath("search?query="+query+"&profile="+profile));

        return "";
    }
}

View File

@ -0,0 +1,43 @@
package nu.marginalia.search.command;
import com.google.inject.Inject;
import nu.marginalia.search.command.commands.*;
import spark.Response;
import java.util.ArrayList;
import java.util.List;
/**
 * Dispatches a search request to the first command that is willing to handle it,
 * with the regular search as the fallback.
 */
public class CommandEvaluator {

    private final List<SearchCommandInterface> specialCommands;
    private final SearchCommand defaultCommand;

    @Inject
    public CommandEvaluator(
            BrowseCommand browse,
            ConvertCommand convert,
            DefinitionCommand define,
            BangCommand bang,
            SiteRedirectCommand siteRedirect,
            SearchCommand search
    ) {
        // The special commands are consulted in this order
        this.specialCommands = new ArrayList<>();
        this.specialCommands.add(browse);
        this.specialCommands.add(convert);
        this.specialCommands.add(define);
        this.specialCommands.add(bang);
        this.specialCommands.add(siteRedirect);

        this.defaultCommand = search;
    }

    /**
     * Evaluates the request against each special command in turn and returns the first
     * result produced. If none of them produces a result, the default search command
     * is used, and an empty string is returned should that yield nothing either.
     */
    public Object eval(Response response, SearchParameters parameters) {
        for (SearchCommandInterface command : specialCommands) {
            var outcome = command.process(response, parameters);

            if (outcome.isEmpty()) {
                continue;
            }

            return outcome.get();
        }

        var fallback = defaultCommand.process(response, parameters);
        return fallback.orElse("");
    }
}

View File

@ -0,0 +1,29 @@
package nu.marginalia.search.command;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import javax.annotation.Nullable;
import java.util.Arrays;
/** The "adtech" search option, along with the search terms it implicitly excludes. */
public enum SearchAdtechParameter {
    DEFAULT("default"),
    REDUCE("reduce", "special:ads", "special:affiliate");

    /** The value used to represent this option as a request parameter */
    public final String value;
    /** Search terms that get added to the query's exclude list when this option is selected */
    public final String[] implictExcludeSearchTerms;

    SearchAdtechParameter(String value, String... implictExcludeSearchTerms) {
        this.value = value;
        this.implictExcludeSearchTerms = implictExcludeSearchTerms;
    }

    /** Returns {@link #REDUCE} if the value matches it, and {@link #DEFAULT} for anything else, null included. */
    public static SearchAdtechParameter parse(@Nullable String value) {
        return REDUCE.value.equals(value) ? REDUCE : DEFAULT;
    }

    /** Appends this option's implicit exclude terms to the query's exclude list. */
    public void addTacitTerms(SearchQuery subquery) {
        for (String term : implictExcludeSearchTerms) {
            subquery.searchTermsExclude.add(term);
        }
    }
}

View File

@ -0,0 +1,10 @@
package nu.marginalia.search.command;
import spark.Response;
import java.util.Optional;
/**
 * A handler for one kind of search request, such as a bang redirect,
 * a unit conversion or a regular search.
 */
public interface SearchCommandInterface {
/**
 * Attempts to handle the request described by {@code parameters}.
 *
 * @return the response body if this command handled the request,
 *         or an empty Optional if the request is not for this command.
 *         A command may also end the request by throwing, as the bang
 *         command does in order to issue a redirect.
 */
Optional<Object> process(Response response, SearchParameters parameters);
}

View File

@ -0,0 +1,31 @@
package nu.marginalia.search.command;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import javax.annotation.Nullable;
import java.util.Arrays;
/** The javascript search option, along with the search terms it implicitly excludes. */
public enum SearchJsParameter {
    DEFAULT("default"),
    DENY_JS("no-js", "js:true"),
    REQUIRE_JS("yes-js", "js:false");

    /** The value used to represent this option as a request parameter */
    public final String value;
    /** Search terms that get added to the query's exclude list when this option is selected */
    public final String[] implictExcludeSearchTerms;

    SearchJsParameter(String value, String... implictExcludeSearchTerms) {
        this.value = value;
        this.implictExcludeSearchTerms = implictExcludeSearchTerms;
    }

    /** Returns the option with the given value, or {@link #DEFAULT} for anything unknown, null included. */
    public static SearchJsParameter parse(@Nullable String value) {
        for (SearchJsParameter option : values()) {
            if (option.value.equals(value)) {
                return option;
            }
        }

        return DEFAULT;
    }

    /** Appends this option's implicit exclude terms to the query's exclude list. */
    public void addTacitTerms(SearchQuery subquery) {
        for (String term : implictExcludeSearchTerms) {
            subquery.searchTermsExclude.add(term);
        }
    }
}

View File

@ -0,0 +1,106 @@
package nu.marginalia.search.command;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.search.model.SearchProfile;
import spark.Request;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Objects;
import static nu.marginalia.search.command.SearchRecentParameter.RECENT;
/**
 * The user-controlled parameters of a search request.
 * <p>
 * Instances are immutable; the {@code with...} methods return modified copies,
 * which are used together with {@link #renderUrl} to build links to variations
 * of the current search.
 */
public record SearchParameters(String query,
                               SearchProfile profile,
                               SearchJsParameter js,
                               SearchRecentParameter recent,
                               SearchTitleParameter searchTitle,
                               SearchAdtechParameter adtech,
                               boolean newFilter,
                               int page
                               ) {

    /**
     * Reads the parameters from the query string of a request. Parameters that
     * are missing or unrecognized fall back to their defaults.
     */
    public SearchParameters(String queryString, Request request) {
        this(
                queryString,
                SearchProfile.getSearchProfile(request.queryParams("profile")),
                SearchJsParameter.parse(request.queryParams("js")),
                SearchRecentParameter.parse(request.queryParams("recent")),
                SearchTitleParameter.parse(request.queryParams("searchTitle")),
                SearchAdtechParameter.parse(request.queryParams("adtech")),
                "true".equals(request.queryParams("newfilter")),
                parsePage(request.queryParams("page"))
        );
    }

    /**
     * Parses the user-supplied page number leniently: a missing or malformed value,
     * or a value below 1, yields the first page rather than failing the request.
     */
    private static int parsePage(String pageParam) {
        try {
            return Math.max(1, Integer.parseInt(Objects.requireNonNullElse(pageParam, "1")));
        }
        catch (NumberFormatException ex) {
            return 1;
        }
    }

    /** The filter id of the selected profile. */
    public String profileStr() {
        return profile.filterId;
    }

    // Changing a filter setting marks the copy as having a new filter,
    // whereas changing the page clears that mark

    public SearchParameters withProfile(SearchProfile profile) {
        return new SearchParameters(query, profile, js, recent, searchTitle, adtech, true, page);
    }

    public SearchParameters withJs(SearchJsParameter js) {
        return new SearchParameters(query, profile, js, recent, searchTitle, adtech, true, page);
    }

    public SearchParameters withAdtech(SearchAdtechParameter adtech) {
        return new SearchParameters(query, profile, js, recent, searchTitle, adtech, true, page);
    }

    public SearchParameters withRecent(SearchRecentParameter recent) {
        return new SearchParameters(query, profile, js, recent, searchTitle, adtech, true, page);
    }

    public SearchParameters withTitle(SearchTitleParameter title) {
        return new SearchParameters(query, profile, js, recent, title, adtech, true, page);
    }

    public SearchParameters withPage(int page) {
        return new SearchParameters(query, profile, js, recent, searchTitle, adtech, false, page);
    }

    /** Renders these parameters as a search URL relative to {@code baseUrl}, URL-encoding each value. */
    public String renderUrl(WebsiteUrl baseUrl) {
        String path = String.format("/search?query=%s&profile=%s&js=%s&adtech=%s&recent=%s&searchTitle=%s&newfilter=%s&page=%d",
                URLEncoder.encode(query, StandardCharsets.UTF_8),
                URLEncoder.encode(profile.filterId, StandardCharsets.UTF_8),
                URLEncoder.encode(js.value, StandardCharsets.UTF_8),
                URLEncoder.encode(adtech.value, StandardCharsets.UTF_8),
                URLEncoder.encode(recent.value, StandardCharsets.UTF_8),
                URLEncoder.encode(searchTitle.value, StandardCharsets.UTF_8),
                newFilter,
                page
                );

        return baseUrl.withPath(path);
    }

    /**
     * The temporal bias for ranking: recent if the "recent" option is set,
     * otherwise old for the vintage profile, and none in all other cases.
     */
    public ResultRankingParameters.TemporalBias temporalBias() {
        if (recent == RECENT) {
            return ResultRankingParameters.TemporalBias.RECENT;
        }
        else if (profile == SearchProfile.VINTAGE) {
            return ResultRankingParameters.TemporalBias.OLD;
        }

        return ResultRankingParameters.TemporalBias.NONE;
    }

    /** Requires title matches when the title option is set, otherwise lets the strategy be chosen automatically. */
    public QueryStrategy strategy() {
        if (searchTitle == SearchTitleParameter.TITLE) {
            return QueryStrategy.REQUIRE_FIELD_TITLE;
        }

        return QueryStrategy.AUTO;
    }

    /** Limits results to years after 2018 when the "recent" option is set, otherwise defers to the profile. */
    public SpecificationLimit yearLimit() {
        if (recent == RECENT)
            return SpecificationLimit.greaterThan(2018);

        return profile.getYearLimit();
    }
}

View File

@ -0,0 +1,21 @@
package nu.marginalia.search.command;
import javax.annotation.Nullable;
/** The "recent results" search option. */
public enum SearchRecentParameter {
    DEFAULT("default"),
    RECENT("recent");

    /** The value used to represent this option as a request parameter */
    public final String value;

    SearchRecentParameter(String value) {
        this.value = value;
    }

    /** Returns {@link #RECENT} if the value matches it, and {@link #DEFAULT} for anything else, null included. */
    public static SearchRecentParameter parse(@Nullable String value) {
        return RECENT.value.equals(value) ? RECENT : DEFAULT;
    }
}

View File

@ -0,0 +1,21 @@
package nu.marginalia.search.command;
import javax.annotation.Nullable;
/** The "search in titles" search option. */
public enum SearchTitleParameter {
    DEFAULT("default"),
    TITLE("title");

    /** The value used to represent this option as a request parameter */
    public final String value;

    SearchTitleParameter(String value) {
        this.value = value;
    }

    /** Returns {@link #TITLE} if the value matches it, and {@link #DEFAULT} for anything else, null included. */
    public static SearchTitleParameter parse(@Nullable String value) {
        return TITLE.value.equals(value) ? TITLE : DEFAULT;
    }
}

View File

@ -0,0 +1,104 @@
package nu.marginalia.search.command.commands;
import com.google.inject.Inject;
import nu.marginalia.search.command.SearchCommandInterface;
import nu.marginalia.search.command.SearchParameters;
import nu.marginalia.search.exceptions.RedirectException;
import spark.Response;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
/**
 * Handles "bang" queries such as {@code !g some query} by redirecting
 * the search to another search engine.
 */
public class BangCommand implements SearchCommandInterface {
    /** Maps each bang to a format string for the redirect URL, where %s is the URL-encoded query */
    private final Map<String, String> bangsToPattern = new HashMap<>();

    @Inject
    public BangCommand()
    {
        bangsToPattern.put("!g", "https://www.google.com/search?q=%s");
        bangsToPattern.put("!ddg", "https://duckduckgo.com/?q=%s");
        bangsToPattern.put("!w", "https://search.marginalia.nu/search?query=%s+site:en.wikipedia.org&profile=wiki");
    }

    /**
     * Looks for a known bang in the query.
     *
     * @return always an empty Optional, as a successful match is signalled by an exception instead
     * @throws RedirectException carrying the redirect URL, if the query contains a known bang
     */
    @Override
    public Optional<Object> process(Response response, SearchParameters parameters) {
        for (Map.Entry<String, String> bang : bangsToPattern.entrySet()) {
            Optional<String> remainingQuery = matchBangPattern(parameters.query(), bang.getKey());

            if (remainingQuery.isEmpty()) {
                continue;
            }

            String encodedQuery = URLEncoder.encode(remainingQuery.get(), StandardCharsets.UTF_8);
            String redirectUrl = String.format(bang.getValue(), encodedQuery);

            throw new RedirectException(redirectUrl);
        }

        return Optional.empty();
    }

    /** If the query contains the bang pattern bangKey, return the query with the bang pattern removed. */
    Optional<String> matchBangPattern(String query, String bangKey) {
        final int keyLength = bangKey.length();
        final BangMatcher matcher = new BangMatcher(query);

        while (matcher.findNext(bangKey)) {
            // The bang must stand on its own, with a space (or the end of the string) on either side
            boolean standsAlone = matcher.isRelativeSpaceOrInvalid(-1)
                               && matcher.isRelativeSpaceOrInvalid(keyLength);

            if (standsAlone) {
                String before = matcher.prefix().trim();
                String after = matcher.suffix(keyLength).trim();
                String remainder = (before + " " + after).trim();

                // Only the first free-standing occurrence is considered, even if nothing remains
                return remainder.isBlank() ? Optional.empty() : Optional.of(remainder);
            }
        }

        return Optional.empty();
    }

    /** Steps through the occurrences of a pattern in a string, keeping track of the current position. */
    private static class BangMatcher {
        private final String str;
        private int pos;

        public BangMatcher(String str) {
            this.str = str;
            this.pos = -1;
        }

        /** The part of the string before the current position */
        public String prefix() {
            return str.substring(0, pos);
        }

        /** The part of the string from offset characters past the current position, or "" if that is past the end */
        public String suffix(int offset) {
            final int start = pos + offset;

            return (start < str.length()) ? str.substring(start) : "";
        }

        /** Moves to the next occurrence of the pattern after the current position, returning false if there is none */
        public boolean findNext(String pattern) {
            final int searchFrom = pos + 1;

            if (searchFrom >= str.length()) {
                return false;
            }

            pos = str.indexOf(pattern, searchFrom);

            return pos >= 0;
        }

        /** True if the character at the given offset from the current position is a space, or is outside the string */
        public boolean isRelativeSpaceOrInvalid(int offset) {
            final int index = offset + pos;

            if (index < 0 || index >= str.length()) {
                return true;
            }

            return Character.isSpaceChar(str.charAt(index));
        }
    }
}

View File

@ -0,0 +1,36 @@
package nu.marginalia.search.command.commands;
import com.google.inject.Inject;
import nu.marginalia.renderer.MustacheRenderer;
import nu.marginalia.renderer.RendererFactory;
import nu.marginalia.search.command.SearchCommandInterface;
import nu.marginalia.search.command.SearchParameters;
import nu.marginalia.search.svc.SearchUnitConversionService;
import spark.Response;
import java.io.IOException;
import java.util.Map;
import java.util.Optional;
/** Handles queries that the unit conversion service is able to convert, rendering the converted result. */
public class ConvertCommand implements SearchCommandInterface {
    private final SearchUnitConversionService searchUnitConversionService;
    private final MustacheRenderer<Map<String, String>> conversionRenderer;

    @Inject
    public ConvertCommand(SearchUnitConversionService searchUnitConversionService, RendererFactory rendererFactory) throws IOException {
        this.searchUnitConversionService = searchUnitConversionService;
        this.conversionRenderer = rendererFactory.renderer("search/conversion-results");
    }

    /**
     * @return the rendered conversion result, or an empty Optional
     *         if the query could not be converted
     */
    @Override
    public Optional<Object> process(Response response, SearchParameters parameters) {
        var conversion = searchUnitConversionService.tryConversion(parameters.query());

        if (conversion.isEmpty()) {
            return Optional.empty();
        }

        Map<String, String> model = Map.of(
                "query", parameters.query(),
                "result", conversion.get(),
                "profile", parameters.profileStr());

        return Optional.ofNullable(conversionRenderer.render(model));
    }
}

View File

@ -0,0 +1,70 @@
package nu.marginalia.search.command.commands;
import com.google.inject.Inject;
import nu.marginalia.api.math.MathClient;
import nu.marginalia.api.math.model.DictionaryResponse;
import nu.marginalia.renderer.MustacheRenderer;
import nu.marginalia.search.command.SearchCommandInterface;
import nu.marginalia.search.command.SearchParameters;
import nu.marginalia.renderer.RendererFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Response;
import java.io.IOException;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import java.util.function.Predicate;
import java.util.regex.Pattern;
/** Handles {@code define:word} queries by looking the word up through the math client's dictionary lookup. */
public class DefinitionCommand implements SearchCommandInterface {
    private final Logger logger = LoggerFactory.getLogger(getClass());

    private final MustacheRenderer<DictionaryResponse> dictionaryRenderer;
    private final MathClient mathClient;

    /** Matches queries consisting of "define:" followed by letters, digits, whitespace and dashes */
    private final Predicate<String> queryPatternPredicate = Pattern.compile("^define:[A-Za-z\\s-0-9]+$").asPredicate();

    @Inject
    public DefinitionCommand(RendererFactory rendererFactory, MathClient mathClient)
            throws IOException
    {
        dictionaryRenderer = rendererFactory.renderer("search/dictionary-results");
        this.mathClient = mathClient;
    }

    /**
     * @return the rendered dictionary results, or an empty Optional
     *         if the query is not a definition query
     * @throws RuntimeException if the dictionary lookup fails or times out
     */
    @Override
    public Optional<Object> process(Response response, SearchParameters parameters) {
        if (!queryPatternPredicate.test(parameters.query())) {
            return Optional.empty();
        }

        var results = lookupDefinition(parameters.query());

        return Optional.of(dictionaryRenderer.render(results,
                Map.of("query", parameters.query(),
                        "profile", parameters.profileStr())
        ));
    }

    /**
     * Looks up the word following the "define:" prefix, waiting at most 250 ms for the answer.
     *
     * @throws RuntimeException wrapping the cause, if the lookup fails, times out or is interrupted
     */
    private DictionaryResponse lookupDefinition(String humanQuery) {
        String definePrefix = "define:";
        String word = humanQuery.substring(definePrefix.length()).toLowerCase();

        try {
            return mathClient
                    .dictionaryLookup(word)
                    .get(250, TimeUnit.MILLISECONDS);
        }
        catch (Exception e) {
            if (e instanceof InterruptedException) {
                // Restore the interrupt flag, as it is cleared when the exception is thrown
                // and wrapping the exception would otherwise hide the interruption
                Thread.currentThread().interrupt();
            }

            logger.error("Failed to lookup definition for word: {}", word, e);
            throw new RuntimeException(e);
        }
    }
}

View File

@ -0,0 +1,39 @@
package nu.marginalia.search.command.commands;
import com.google.inject.Inject;
import nu.marginalia.renderer.MustacheRenderer;
import nu.marginalia.renderer.RendererFactory;
import nu.marginalia.search.SearchOperator;
import nu.marginalia.search.command.SearchCommandInterface;
import nu.marginalia.search.command.SearchParameters;
import nu.marginalia.search.model.DecoratedSearchResults;
import spark.Response;
import java.io.IOException;
import java.util.Optional;
/** The regular search: runs the query through the search operator and renders the results page. */
public class SearchCommand implements SearchCommandInterface {
    private final SearchOperator searchOperator;
    private final MustacheRenderer<DecoratedSearchResults> searchResultsRenderer;

    @Inject
    public SearchCommand(SearchOperator searchOperator,
                         RendererFactory rendererFactory) throws IOException {
        this.searchOperator = searchOperator;
        this.searchResultsRenderer = rendererFactory.renderer("search/search-results");
    }

    /**
     * @return the rendered search results, or an empty Optional if the
     *         search was interrupted (the interrupt flag is restored)
     */
    @Override
    public Optional<Object> process(Response response, SearchParameters parameters) {
        final DecoratedSearchResults results;

        try {
            results = searchOperator.doSearch(parameters);
        }
        catch (InterruptedException ex) {
            Thread.currentThread().interrupt();
            return Optional.empty();
        }

        return Optional.of(searchResultsRenderer.render(results));
    }
}

View File

@ -0,0 +1,50 @@
package nu.marginalia.search.command.commands;
import com.google.inject.Inject;
import nu.marginalia.search.command.SearchCommandInterface;
import nu.marginalia.search.command.SearchParameters;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Response;
import java.util.Optional;
import java.util.function.Predicate;
import java.util.regex.Pattern;
/**
 * Handles queries consisting of nothing but {@code site:domain} or {@code links:domain}
 * by redirecting to the site page of that domain.
 */
public class SiteRedirectCommand implements SearchCommandInterface {
    private final Logger logger = LoggerFactory.getLogger(getClass());

    /** Matches a lone site: or links: term; the domain part is limited to letters, digits, dots and dashes */
    private final Predicate<String> queryPatternPredicate = Pattern.compile("^(site|links):[.A-Za-z\\-0-9]+$").asPredicate();

    @Inject
    public SiteRedirectCommand() {
    }

    /**
     * @return a small HTML document that redirects to the site page,
     *         or an empty Optional if the query is not a lone site: or links: term
     */
    @Override
    public Optional<Object> process(Response response, SearchParameters parameters) {
        final String query = parameters.query();

        if (!queryPatternPredicate.test(query)) {
            return Optional.empty();
        }

        final int separator = query.indexOf(':');
        final String prefix = query.substring(0, separator);
        final String domain = query.substring(separator + 1).toLowerCase();

        final String view = "links".equals(prefix) ? "links" : "info";

        // Use an HTML redirect here, so we can use relative URLs
        final String redirectPage = """
                <!DOCTYPE html>
                <html lang="en">
                <meta charset="UTF-8">
                <title>Redirecting...</title>
                <meta http-equiv="refresh" content="0; url=/site/%s?view=%s">
                """.formatted(domain, view);

        return Optional.of(redirectPage);
    }
}

View File

@ -0,0 +1,66 @@
package nu.marginalia.search.db;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import java.sql.ResultSet;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;
/** Database query for finding the domains neighboring a given domain. */
public class DbNearDomainsQuery {

    private final HikariDataSource dataSource;

    @Inject
    public DbNearDomainsQuery(HikariDataSource dataSource) {
        this.dataSource = dataSource;
    }

    /**
     * Finds the ids of the domains related to the domain named {@code term}.
     * <p>
     * The list starts with the id of the domain itself, if it is known, followed by
     * those of its neighbors that are indexed and in one of the accepted states.
     *
     * @param term      the domain name to look up
     * @param onProblem receives a message if no domains were found at all
     * @return the ids found, possibly empty
     * @throws RuntimeException wrapping any error that occurs while querying
     */
    public List<Integer> getRelatedDomains(String term, Consumer<String> onProblem) {
        final List<Integer> domainIds = new ArrayList<>();

        try (var conn = dataSource.getConnection();
             var selfStmt = conn.prepareStatement("""
                     SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?
                     """);
             var neighborStmt = conn.prepareStatement("""
                     SELECT NEIGHBOR_ID, ND.INDEXED, ND.STATE FROM EC_DOMAIN_NEIGHBORS_2
                     INNER JOIN EC_DOMAIN ND ON ND.ID=NEIGHBOR_ID
                     WHERE DOMAIN_ID=?
                     """))
        {
            // Look up the domain itself; if unknown, the id stays -1 and the neighbor query is still run
            int selfId = -1;

            selfStmt.setString(1, term);
            ResultSet selfRows = selfStmt.executeQuery();
            if (selfRows.next()) {
                selfId = selfRows.getInt(1);
                domainIds.add(selfId);
            }

            neighborStmt.setInt(1, selfId);
            ResultSet neighborRows = neighborStmt.executeQuery();
            while (neighborRows.next()) {
                int neighborId = neighborRows.getInt(1);
                int indexed = neighborRows.getInt(2);
                String state = neighborRows.getString(3);

                if (indexed > 0 && isAcceptedState(state)) {
                    domainIds.add(neighborId);
                }
            }
        }
        catch (Exception ex) {
            throw new RuntimeException(ex);
        }

        if (domainIds.isEmpty()) {
            onProblem.accept("Could not find domains adjacent " + term);
        }

        return domainIds;
    }

    /** True for the domain states whose neighbors are included in the result (case-insensitive). */
    private static boolean isAcceptedState(String state) {
        return "ACTIVE".equalsIgnoreCase(state)
            || "SOCIAL_MEDIA".equalsIgnoreCase(state)
            || "SPECIAL".equalsIgnoreCase(state);
    }
}

View File

@ -0,0 +1,102 @@
package nu.marginalia.search.model;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.idx.WordFlags;
import org.jetbrains.annotations.NotNull;
import java.util.*;
/** A group of UrlDetails from the same domain, made up of a main result ("first")
 * and any additional results ("rest") that are shown in summary form. */
public class ClusteredUrlDetails implements Comparable<ClusteredUrlDetails> {

    @NotNull
    public final UrlDetails first;

    @NotNull
    public final List<UrlDetails> rest;

    /** Create a new ClusteredUrlDetails from a collection of UrlDetails.
     * The details are sorted in their natural order; the first one becomes
     * "first", and of the others, those that pass the filter become the "rest".
     *
     * @param details A collection of UrlDetails, which must not be empty.
     * @throws IllegalArgumentException if the collection is empty
     */
    public ClusteredUrlDetails(Collection<UrlDetails> details) {
        var sorted = new ArrayList<>(details);

        if (sorted.isEmpty())
            throw new IllegalArgumentException("Empty list of details");

        Collections.sort(sorted);

        this.first = sorted.remove(0);
        this.rest = sorted;

        final double scoreLimit = Math.min(4.0, first.termScore * 1.25);

        this.rest.removeIf(candidate -> isExpendable(candidate, scoreLimit));
    }

    /** Create a cluster with only a single result, and no additional results. */
    public ClusteredUrlDetails(@NotNull UrlDetails onlyFirst) {
        this.first = onlyFirst;
        this.rest = Collections.emptyList();
    }

    /** Decides whether an additional result should be dropped from the cluster.
     * A result is kept if its term score exceeds the limit, or if any of its
     * non-special keywords has one of the flags tested below.
     */
    private static boolean isExpendable(UrlDetails candidate, double scoreLimit) {
        // NOTE(review): results scoring *above* the limit are kept unconditionally;
        // confirm that this is the intended direction of the comparison.
        if (candidate.termScore > scoreLimit)
            return false;

        for (var keywordScore : candidate.resultItem.keywordScores) {
            if (keywordScore.isKeywordSpecial())
                continue;

            boolean prominentMatch = keywordScore.hasTermFlag(WordFlags.Title)
                    || keywordScore.hasTermFlag(WordFlags.ExternalLink)
                    || keywordScore.hasTermFlag(WordFlags.UrlDomain)
                    || keywordScore.hasTermFlag(WordFlags.UrlPath)
                    || keywordScore.hasTermFlag(WordFlags.Subjects);

            if (prominentMatch)
                return false;
        }

        return true;
    }

    // For renderer use, do not remove
    public @NotNull UrlDetails getFirst() {
        return first;
    }

    // For renderer use, do not remove
    public @NotNull List<UrlDetails> getRest() {
        return rest;
    }

    /** The domain of the main result */
    public EdgeDomain getDomain() {
        return first.url.getDomain();
    }

    /** True if there are additional results besides the main one */
    public boolean hasMultiple() {
        return !rest.isEmpty();
    }

    /** Returns the total number of results from the same domain,
     * including such results that are not included here. */
    public int totalCount() {
        return first.resultsFromSameDomain;
    }

    /** The number of results from the same domain that are not included in this cluster */
    public int remainingCount() {
        final int included = 1 + rest.size();

        return totalCount() - included;
    }

    /** Clusters are ordered by their main result */
    @Override
    public int compareTo(@NotNull ClusteredUrlDetails o) {
        return Objects.compare(first, o.first, UrlDetails::compareTo);
    }
}

View File

@ -0,0 +1,186 @@
package nu.marginalia.search.model;
import nu.marginalia.search.command.SearchParameters;
import java.util.List;
/**
 * The model behind the search results page: the search results themselves,
 * along with the parameters, filters and paging information needed by the
 * templating engine to render the page around them.
 */
public class DecoratedSearchResults {
    private final SearchParameters params;
    private final List<String> problems;
    private final String evalResult;

    public final List<ClusteredUrlDetails> results;

    private final String focusDomain;
    private final int focusDomainId;
    private final SearchFilters filters;
    private final List<Page> resultPages;

    public DecoratedSearchResults(SearchParameters params,
                                  List<String> problems,
                                  String evalResult,
                                  List<ClusteredUrlDetails> results,
                                  String focusDomain,
                                  int focusDomainId,
                                  SearchFilters filters,
                                  List<Page> resultPages) {
        this.params = params;
        this.problems = problems;
        this.evalResult = evalResult;
        this.results = results;
        this.focusDomain = focusDomain;
        this.focusDomainId = focusDomainId;
        this.filters = filters;
        this.resultPages = resultPages;
    }

    public static DecoratedSearchResultsBuilder builder() {
        return new DecoratedSearchResultsBuilder();
    }

    /** A link to a page of results; "current" marks the page being shown */
    public record Page(int number, boolean current, String href) {
    }

    public SearchParameters getParams() {
        return params;
    }

    public List<String> getProblems() {
        return problems;
    }

    public String getEvalResult() {
        return evalResult;
    }

    public List<ClusteredUrlDetails> getResults() {
        return results;
    }

    public String getFocusDomain() {
        return focusDomain;
    }

    public int getFocusDomainId() {
        return focusDomainId;
    }

    public SearchFilters getFilters() {
        return filters;
    }

    public List<Page> getResultPages() {
        return resultPages;
    }

    /** True if there is more than one page of results */
    public boolean isMultipage() {
        return resultPages.size() > 1;
    }

    // These are used by the search form, they look unused in the IDE but are used by the mustache template,
    // DO NOT REMOVE THEM

    public int getResultCount() {
        return results.size();
    }

    public String getQuery() {
        return params.query();
    }

    public String getProfile() {
        return params.profile().filterId;
    }

    public String getJs() {
        return params.js().value;
    }

    public String getAdtech() {
        return params.adtech().value;
    }

    public String getRecent() {
        return params.recent().value;
    }

    public String getSearchTitle() {
        return params.searchTitle().value;
    }

    public int page() {
        return params.page();
    }

    public Boolean isNewFilter() {
        return params.newFilter();
    }

    /** Builder for DecoratedSearchResults; anything that is not set is left at its default (null, or 0) */
    public static class DecoratedSearchResultsBuilder {
        private SearchParameters params;
        private List<String> problems;
        private String evalResult;
        private List<ClusteredUrlDetails> results;
        private String focusDomain;
        private int focusDomainId;
        private SearchFilters filters;
        private List<Page> resultPages;

        DecoratedSearchResultsBuilder() {
        }

        public DecoratedSearchResultsBuilder params(SearchParameters params) {
            this.params = params;
            return this;
        }

        public DecoratedSearchResultsBuilder problems(List<String> problems) {
            this.problems = problems;
            return this;
        }

        public DecoratedSearchResultsBuilder evalResult(String evalResult) {
            this.evalResult = evalResult;
            return this;
        }

        public DecoratedSearchResultsBuilder results(List<ClusteredUrlDetails> results) {
            this.results = results;
            return this;
        }

        public DecoratedSearchResultsBuilder focusDomain(String focusDomain) {
            this.focusDomain = focusDomain;
            return this;
        }

        public DecoratedSearchResultsBuilder focusDomainId(int focusDomainId) {
            this.focusDomainId = focusDomainId;
            return this;
        }

        public DecoratedSearchResultsBuilder filters(SearchFilters filters) {
            this.filters = filters;
            return this;
        }

        public DecoratedSearchResultsBuilder resultPages(List<Page> resultPages) {
            this.resultPages = resultPages;
            return this;
        }

        public DecoratedSearchResults build() {
            return new DecoratedSearchResults(
                    params,
                    problems,
                    evalResult,
                    results,
                    focusDomain,
                    focusDomainId,
                    filters,
                    resultPages);
        }

        @Override
        public String toString() {
            return "DecoratedSearchResults.DecoratedSearchResultsBuilder("
                    + "params=" + params
                    + ", problems=" + problems
                    + ", evalResult=" + evalResult
                    + ", results=" + results
                    + ", focusDomain=" + focusDomain
                    + ", focusDomainId=" + focusDomainId
                    + ", filters=" + filters
                    + ", resultPages=" + resultPages
                    + ")";
        }
    }
}

View File

@ -0,0 +1,223 @@
package nu.marginalia.search.model;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.search.command.*;
import java.util.List;
/** Models the search filters displayed next to the search results */
public class SearchFilters {
private final WebsiteUrl url;
public final String currentFilter;
// These are necessary for the renderer to access the data
public final RemoveJsOption removeJsOption;
public final ReduceAdtechOption reduceAdtechOption;
public final ShowRecentOption showRecentOption;
public final SearchTitleOption searchTitleOption;
public final List<List<Filter>> filterGroups;
// Getters are for the renderer to access the data
/** The filter id of the profile that is currently selected */
public String getCurrentFilter() {
return currentFilter;
}
/** The toggle for the javascript removal option */
public RemoveJsOption getRemoveJsOption() {
return removeJsOption;
}
/** The toggle for the adtech reduction option */
public ReduceAdtechOption getReduceAdtechOption() {
return reduceAdtechOption;
}
/** The toggle for the recent results option */
public ShowRecentOption getShowRecentOption() {
return showRecentOption;
}
/** The toggle for the title search option */
public SearchTitleOption getSearchTitleOption() {
return searchTitleOption;
}
/** The available profile filters, arranged in groups */
public List<List<Filter>> getFilterGroups() {
return filterGroups;
}
public SearchFilters(WebsiteUrl url, SearchParameters parameters) {
this.url = url;
removeJsOption = new RemoveJsOption(parameters);
reduceAdtechOption = new ReduceAdtechOption(parameters);
showRecentOption = new ShowRecentOption(parameters);
searchTitleOption = new SearchTitleOption(parameters);
currentFilter = parameters.profile().filterId;
filterGroups = List.of(
List.of(
new Filter("No Filter", SearchProfile.NO_FILTER, parameters),
// new Filter("Popular", SearchProfile.POPULAR, parameters),
new Filter("Small Web", SearchProfile.SMALLWEB, parameters),
new Filter("Blogosphere", SearchProfile.BLOGOSPHERE, parameters),
new Filter("Academia", SearchProfile.ACADEMIA, parameters)
),
List.of(
new Filter("Vintage", SearchProfile.VINTAGE, parameters),
new Filter("Plain Text", SearchProfile.PLAIN_TEXT, parameters),
new Filter("~tilde", SearchProfile.TILDE, parameters)
),
List.of(
new Filter("Wiki", SearchProfile.WIKI, parameters),
new Filter("Forum", SearchProfile.FORUM, parameters),
new Filter("Docs", SearchProfile.DOCS, parameters),
new Filter("Recipes", SearchProfile.FOOD, parameters)
)
);
}
public class RemoveJsOption {
private final SearchJsParameter value;
public final String url;
public String getUrl() {
return url;
}
public boolean isSet() {
return value.equals(SearchJsParameter.DENY_JS);
}
public String name() {
return "Remove Javascript";
}
public RemoveJsOption(SearchParameters parameters) {
this.value = parameters.js();
var toggledValue = switch (parameters.js()) {
case DENY_JS -> SearchJsParameter.DEFAULT;
default -> SearchJsParameter.DENY_JS;
};
this.url = parameters.withJs(toggledValue).renderUrl(SearchFilters.this.url);
}
}
public class ReduceAdtechOption {
private final SearchAdtechParameter value;
public final String url;
public String getUrl() {
return url;
}
public boolean isSet() {
return value.equals(SearchAdtechParameter.REDUCE);
}
public String name() {
return "Reduce Adtech";
}
public ReduceAdtechOption(SearchParameters parameters) {
this.value = parameters.adtech();
var toggledValue = switch (parameters.adtech()) {
case REDUCE -> SearchAdtechParameter.DEFAULT;
default -> SearchAdtechParameter.REDUCE;
};
this.url = parameters.withAdtech(toggledValue).renderUrl(SearchFilters.this.url);
}
}
public class ShowRecentOption {
private final SearchRecentParameter value;
public final String url;
public String getUrl() {
return url;
}
public boolean isSet() {
return value.equals(SearchRecentParameter.RECENT);
}
public String name() {
return "Recent Results";
}
public ShowRecentOption(SearchParameters parameters) {
this.value = parameters.recent();
var toggledValue = switch (parameters.recent()) {
case RECENT -> SearchRecentParameter.DEFAULT;
default -> SearchRecentParameter.RECENT;
};
this.url = parameters.withRecent(toggledValue).renderUrl(SearchFilters.this.url);
}
}
public class SearchTitleOption {
private final SearchTitleParameter value;
public final String url;
public String getUrl() {
return url;
}
public boolean isSet() {
return value.equals(SearchTitleParameter.TITLE);
}
public String name() {
return "Search In Title";
}
public SearchTitleOption(SearchParameters parameters) {
this.value = parameters.searchTitle();
var toggledValue = switch (parameters.searchTitle()) {
case TITLE -> SearchTitleParameter.DEFAULT;
default -> SearchTitleParameter.TITLE;
};
this.url = parameters.withTitle(toggledValue).renderUrl(SearchFilters.this.url);
}
}
public class Filter {
public final SearchProfile profile;
public final String displayName;
public final boolean current;
public final String url;
public Filter(String displayName, SearchProfile profile, SearchParameters parameters) {
this.displayName = displayName;
this.profile = profile;
this.current = profile.equals(parameters.profile());
this.url = parameters.withProfile(profile).renderUrl(SearchFilters.this.url);
}
public String getDisplayName() {
return displayName;
}
public boolean isCurrent() {
return current;
}
public String getUrl() {
return url;
}
}
}

View File

@ -0,0 +1,105 @@
package nu.marginalia.search.model;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
import java.util.Objects;
/**
 * The search profiles a query can run under. Each profile pairs the filter id
 * used in request parameters with a search set identifier, and may contribute
 * extra query terms and year/size/quality limits.
 */
public enum SearchProfile {
    POPULAR("default", SearchSetIdentifier.POPULAR),
    SMALLWEB("modern", SearchSetIdentifier.SMALLWEB),
    BLOGOSPHERE("blogosphere", SearchSetIdentifier.BLOGS),
    NO_FILTER("corpo", SearchSetIdentifier.NONE),
    VINTAGE("vintage", SearchSetIdentifier.NONE),
    TILDE("tilde", SearchSetIdentifier.NONE),
    CORPO_CLEAN("corpo-clean", SearchSetIdentifier.NONE),
    ACADEMIA("academia", SearchSetIdentifier.NONE),
    PLAIN_TEXT("plain-text", SearchSetIdentifier.NONE),
    FOOD("food", SearchSetIdentifier.POPULAR),
    FORUM("forum", SearchSetIdentifier.NONE),
    WIKI("wiki", SearchSetIdentifier.NONE),
    DOCS("docs", SearchSetIdentifier.NONE),
    ;

    public final String filterId;
    public final SearchSetIdentifier searchSetIdentifier;

    SearchProfile(String filterId, SearchSetIdentifier searchSetIdentifier) {
        this.filterId = filterId;
        this.searchSetIdentifier = searchSetIdentifier;
    }

    // Cached once so lookups do not clone the constants array on every call
    private final static SearchProfile[] values = values();

    /**
     * Resolves a filter id to its profile.
     *
     * @param param the filter id, may be null
     * @return the profile whose filterId equals {@code param}, or NO_FILTER
     *         when {@code param} is null or matches no profile
     */
    public static SearchProfile getSearchProfile(String param) {
        if (param != null) {
            for (SearchProfile candidate : values) {
                if (Objects.equals(candidate.filterId, param)) {
                    return candidate;
                }
            }
        }
        return NO_FILTER;
    }

    /** Adds the implicit search terms this profile implies to the given query; profiles without any add nothing. */
    public void addTacitTerms(SearchQuery subquery) {
        switch (this) {
            case ACADEMIA -> subquery.searchTermsAdvice.add("special:academia");
            case VINTAGE -> {
                subquery.searchTermsPriority.add("format:html123");
                subquery.searchTermsPriority.add("js:false");
            }
            case TILDE -> subquery.searchTermsAdvice.add("special:tilde");
            case PLAIN_TEXT -> subquery.searchTermsAdvice.add("format:plain");
            case WIKI -> subquery.searchTermsAdvice.add("generator:wiki");
            case FORUM -> subquery.searchTermsAdvice.add("generator:forum");
            case DOCS -> subquery.searchTermsAdvice.add("generator:docs");
            case FOOD -> {
                subquery.searchTermsAdvice.add(HtmlFeature.CATEGORY_FOOD.getKeyword());
                subquery.searchTermsExclude.add("special:ads");
            }
            default -> { }
        }
    }

    /** Year limit for this profile: after 2015 for SMALLWEB, before 2003 for VINTAGE, otherwise none. */
    public SpecificationLimit getYearLimit() {
        return switch (this) {
            case SMALLWEB -> SpecificationLimit.greaterThan(2015);
            case VINTAGE -> SpecificationLimit.lessThan(2003);
            default -> SpecificationLimit.none();
        };
    }

    /** Size limit for this profile: less than 500 for SMALLWEB, otherwise none. */
    public SpecificationLimit getSizeLimit() {
        return this == SMALLWEB
                ? SpecificationLimit.lessThan(500)
                : SpecificationLimit.none();
    }

    /** Quality limit for this profile: less than 5 for SMALLWEB, otherwise none. */
    public SpecificationLimit getQualityLimit() {
        return this == SMALLWEB
                ? SpecificationLimit.lessThan(5)
                : SpecificationLimit.none();
    }
}

View File

@ -0,0 +1,293 @@
package nu.marginalia.search.model;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.crawl.HtmlFeature;
import java.util.ArrayList;
import java.util.List;
/**
 * A class to hold details about a single search result.
 * <p>
 * Identity ({@link #equals(Object)} / {@link #hashCode()}) is based on {@code id} alone,
 * while {@link #compareTo(UrlDetails)} orders by term score first and id second.
 * The {@code withX} methods return a copy with one value replaced.
 */
public class UrlDetails implements Comparable<UrlDetails> {
public long id;
public int domainId;
public EdgeUrl url;
public String title;
public String description;
public String format;
// Bit mask of HtmlFeature bits; decoded by the is*() methods below
public int features;
public DomainIndexingState domainState;
public double termScore;
public int resultsFromSameDomain;
public String positions;
public int positionsCount;
public SearchResultItem resultItem;
public List<SearchResultKeywordScore> keywordScores;
/** Creates a fully populated instance; arguments are positional and follow the field declaration order. */
public UrlDetails(long id, int domainId, EdgeUrl url, String title, String description, String format, int features, DomainIndexingState domainState, double termScore, int resultsFromSameDomain, String positions, int positionsCount, SearchResultItem resultItem, List<SearchResultKeywordScore> keywordScores) {
this.id = id;
this.domainId = domainId;
this.url = url;
this.title = title;
this.description = description;
this.format = format;
this.features = features;
this.domainState = domainState;
this.termScore = termScore;
this.resultsFromSameDomain = resultsFromSameDomain;
this.positions = positions;
this.positionsCount = positionsCount;
this.resultItem = resultItem;
this.keywordScores = keywordScores;
}
/** Creates an instance with every field at its Java default (null / 0). */
public UrlDetails() {
}
/** True when more than one result came from the same domain. */
public boolean hasMoreResults() {
return resultsFromSameDomain > 1;
}
/** Returns a display label for the stored format code, or "?" when the code is null or not recognised. */
public String getFormat() {
if (null == format) {
return "?";
}
switch (format) {
case "HTML123":
return "HTML 1-3";
case "HTML4":
return "HTML 4";
case "XHTML":
return "XHTML";
case "HTML5":
return "HTML 5";
case "PLAIN":
return "Plain Text";
default:
return "?";
}
}
/** Hash of {@code id} only, matching {@link #equals(Object)}. */
public int hashCode() {
return Long.hashCode(id);
}
/**
 * Orders by ascending term score, with ties broken by ascending id.
 * Two instances with the same id but different term scores are equal per
 * {@link #equals(Object)} yet compare as non-zero here.
 */
@Override
public int compareTo(UrlDetails other) {
int result = Double.compare(getTermScore(), other.getTermScore());
if (result == 0) result = Long.compare(getId(), other.getId());
return result;
}
/** Two instances are equal when they have the same {@code id}; no other field is compared. */
public boolean equals(Object other) {
if (other == null) {
return false;
}
if (other == this) {
return true;
}
if (other instanceof UrlDetails) {
return ((UrlDetails) other).id == id;
}
return false;
}
/**
 * Returns the title, falling back to the URL's string form when the title is null or blank.
 * NOTE(review): the fallback dereferences {@code url}, so an instance with a blank title and a
 * null url (e.g. one from the no-arg constructor) throws NullPointerException here, and
 * therefore also in {@link #toString()}.
 */
public String getTitle() {
if (title == null || title.isBlank()) {
return url.toString();
}
return title;
}
public boolean isPlainText() {
return "PLAIN".equals(format);
}
/**
 * Counts how many of the JS, COOKIES, TRACKING, AFFILIATE_LINK, TRACKING_ADTECH and
 * ADVERTISEMENT feature bits are set.
 * NOTE(review): ADVERTISEMENT is part of this mask but {@link #getProblems()} has no entry
 * for it, so the count can exceed the size of that list; confirm whether this is intended.
 */
public int getProblemCount() {
int mask = HtmlFeature.JS.getFeatureBit()
| HtmlFeature.COOKIES.getFeatureBit()
| HtmlFeature.TRACKING.getFeatureBit()
| HtmlFeature.AFFILIATE_LINK.getFeatureBit()
| HtmlFeature.TRACKING_ADTECH.getFeatureBit()
| HtmlFeature.ADVERTISEMENT.getFeatureBit();
return Integer.bitCount(features & mask);
}
/** Builds a new mutable list with one short-name/description entry per detected problem; empty when none apply. */
public List<UrlProblem> getProblems() {
List<UrlProblem> problems = new ArrayList<>();
if (isScripts()) {
problems.add(new UrlProblem("Js", "The page uses Javascript"));
}
if (isCookies()) {
problems.add(new UrlProblem("Co", "The page uses Cookies"));
}
if (isTracking()) {
problems.add(new UrlProblem("Tr", "The page uses Tracking/Analytics"));
}
if (isAffiliate()) {
problems.add(new UrlProblem("Af", "The page may use Affiliate Linking"));
}
if (isAds()) {
problems.add(new UrlProblem("Ad", "The page uses Ads/Adtech Tracking"));
}
return problems;
}
// Feature-bit tests against the features mask
public boolean isScripts() {
return HtmlFeature.hasFeature(features, HtmlFeature.JS);
}
public boolean isTracking() {
return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING);
}
public boolean isAffiliate() {
return HtmlFeature.hasFeature(features, HtmlFeature.AFFILIATE_LINK);
}
public boolean isMedia() {
return HtmlFeature.hasFeature(features, HtmlFeature.MEDIA);
}
public boolean isCookies() {
return HtmlFeature.hasFeature(features, HtmlFeature.COOKIES);
}
/** Tests the TRACKING_ADTECH bit (not ADVERTISEMENT). */
public boolean isAds() {
return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING_ADTECH);
}
/** Buckets the term score into 1, 2, 3, 5 or 10 using the upper bounds 1, 2, 3 and 5. */
public int getMatchRank() {
if (termScore <= 1) return 1;
if (termScore <= 2) return 2;
if (termScore <= 3) return 3;
if (termScore <= 5) return 5;
return 10;
}
public long getId() {
return this.id;
}
public int getDomainId() {
return this.domainId;
}
public EdgeUrl getUrl() {
return this.url;
}
public String getDescription() {
return this.description;
}
public int getFeatures() {
return this.features;
}
public DomainIndexingState getDomainState() {
return this.domainState;
}
public double getTermScore() {
return this.termScore;
}
public int getResultsFromSameDomain() {
return this.resultsFromSameDomain;
}
public String getPositions() {
return this.positions;
}
public int getPositionsCount() {
return this.positionsCount;
}
public SearchResultItem getResultItem() {
return this.resultItem;
}
public List<SearchResultKeywordScore> getKeywordScores() {
return this.keywordScores;
}
// Copy-with methods: each returns this instance when the new value is the same as the
// current one (compared with ==, i.e. by reference for object-typed fields), and
// otherwise a new instance with only that one value replaced.
public UrlDetails withId(long id) {
return this.id == id ? this : new UrlDetails(id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
}
public UrlDetails withDomainId(int domainId) {
return this.domainId == domainId ? this : new UrlDetails(this.id, domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
}
public UrlDetails withUrl(EdgeUrl url) {
return this.url == url ? this : new UrlDetails(this.id, this.domainId, url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
}
public UrlDetails withTitle(String title) {
return this.title == title ? this : new UrlDetails(this.id, this.domainId, this.url, title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
}
public UrlDetails withDescription(String description) {
return this.description == description ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
}
public UrlDetails withFormat(String format) {
return this.format == format ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
}
public UrlDetails withFeatures(int features) {
return this.features == features ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
}
public UrlDetails withDomainState(DomainIndexingState domainState) {
return this.domainState == domainState ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
}
public UrlDetails withTermScore(double termScore) {
return this.termScore == termScore ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
}
public UrlDetails withResultsFromSameDomain(int resultsFromSameDomain) {
return this.resultsFromSameDomain == resultsFromSameDomain ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
}
public UrlDetails withPositions(String positions) {
return this.positions == positions ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, positions, this.positionsCount, this.resultItem, this.keywordScores);
}
public UrlDetails withPositionsCount(int positionsCount) {
return this.positionsCount == positionsCount ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, positionsCount, this.resultItem, this.keywordScores);
}
public UrlDetails withResultItem(SearchResultItem resultItem) {
return this.resultItem == resultItem ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, resultItem, this.keywordScores);
}
public UrlDetails withKeywordScores(List<SearchResultKeywordScore> keywordScores) {
return this.keywordScores == keywordScores ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, keywordScores);
}
/** Diagnostic string built from the getters, so title and format appear in their display forms. */
public String toString() {
return "UrlDetails(id=" + this.getId() + ", domainId=" + this.getDomainId() + ", url=" + this.getUrl() + ", title=" + this.getTitle() + ", description=" + this.getDescription() + ", format=" + this.getFormat() + ", features=" + this.getFeatures() + ", domainState=" + this.getDomainState() + ", termScore=" + this.getTermScore() + ", resultsFromSameDomain=" + this.getResultsFromSameDomain() + ", positions=" + this.getPositions() + ", positionsCount=" + this.getPositionsCount() + ", resultItem=" + this.getResultItem() + ", keywordScores=" + this.getKeywordScores() + ")";
}
/** A short tag plus a human-readable description of one problem detected on the page. */
public static record UrlProblem(String name, String description) {
}
}

View File

@ -0,0 +1,27 @@
package nu.marginalia.search.results;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.browse.model.BrowseResult;
import nu.marginalia.screenshot.ScreenshotService;
import java.util.HashSet;
import java.util.Set;
import java.util.function.Predicate;
@Singleton
public class BrowseResultCleaner {
    private final ScreenshotService screenshotService;

    @Inject
    public BrowseResultCleaner(ScreenshotService screenshotService) {
        this.screenshotService = screenshotService;
    }

    /**
     * Creates a fresh, stateful predicate that flags browse results for removal.
     * A result is flagged when its domain has no screenshot, or when a result with
     * the same domain hash has already passed the screenshot check on this predicate
     * instance. Each call returns a predicate with its own, initially empty, memory.
     */
    public Predicate<BrowseResult> shouldRemoveResultPredicateBr() {
        final Set<String> seenDomainHashes = new HashSet<>(100);

        return candidate -> {
            if (!screenshotService.hasScreenshot(candidate.domainId())) {
                // Removed without being remembered, exactly like the short-circuiting ||
                return true;
            }
            // add() returns false when the hash was already present, i.e. a duplicate
            boolean firstOccurrence = seenDomainHashes.add(candidate.domainHash());
            return !firstOccurrence;
        };
    }
}

View File

@ -0,0 +1,69 @@
package nu.marginalia.search.results;
import gnu.trove.list.TLongList;
import gnu.trove.list.array.TLongArrayList;
import gnu.trove.map.hash.TObjectIntHashMap;
import gnu.trove.set.hash.TIntHashSet;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.lsh.EasyLSH;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Objects;
/**
 * Stateful filter that decides which search results to drop as duplicates.
 * <p>
 * A result is removed when it repeats an already seen (url path, title) pair,
 * when its data hash is within the similarity threshold of a previously
 * accepted hash, or when its domain key has already reached the configured
 * number of results. Every call updates the internal state, so the order in
 * which results are offered matters.
 */
public class UrlDeduplicator {
    /** Hashes with a hamming distance below this value are treated as near-duplicates. */
    private static final int LSH_SIMILARITY_THRESHOLD = 2;
    private static final Logger logger = LoggerFactory.getLogger(UrlDeduplicator.class);

    private final TIntHashSet seenSuperficialHashes = new TIntHashSet(200);
    private final TLongList seenLSHList = new TLongArrayList(200);
    private final TObjectIntHashMap<String> keyCount = new TObjectIntHashMap<>(200, 0.75f, 0);

    private final int resultsPerKey;

    /**
     * @param resultsPerKey the maximum number of results retained per domain key
     */
    public UrlDeduplicator(int resultsPerKey) {
        this.resultsPerKey = resultsPerKey;
    }

    /**
     * Runs the three checks in order and stops at the first one that rejects the result.
     * Checks that ran before the rejecting one have already recorded the result in their state.
     *
     * @return true if the result should be dropped
     */
    public boolean shouldRemove(DecoratedSearchResultItem details) {
        boolean keep = deduplicateOnSuperficialHash(details)
                && deduplicateOnLSH(details)
                && limitResultsPerDomain(details);

        return !keep;
    }

    /** The negation of {@link #shouldRemove(DecoratedSearchResultItem)}; updates the same state. */
    public boolean shouldRetain(DecoratedSearchResultItem details) {
        return !shouldRemove(details);
    }

    /** @return true if this (url path, title) combination has not been seen before */
    private boolean deduplicateOnSuperficialHash(DecoratedSearchResultItem details) {
        return seenSuperficialHashes.add(Objects.hash(details.url.path, details.title));
    }

    /** @return true if the data hash is zero or is not a near-duplicate of any accepted hash */
    private boolean deduplicateOnLSH(DecoratedSearchResultItem details) {
        long thisHash = details.dataHash;

        // A zero hash is let through without being compared or recorded
        if (0 == thisHash)
            return true;

        // forEach stops and yields false as soon as one stored hash is closer than the threshold
        if (seenLSHList.forEach(otherHash -> EasyLSH.hammingDistance(thisHash, otherHash) >= LSH_SIMILARITY_THRESHOLD))
        {
            seenLSHList.add(thisHash);
            return true;
        }

        return false;
    }

    /** @return true while the domain key has not exceeded the per-key limit; the count is bumped on every call */
    private boolean limitResultsPerDomain(DecoratedSearchResultItem details) {
        final var domain = details.getUrl().getDomain();
        final String key = domain.getDomainKey();

        return keyCount.adjustOrPutValue(key, 1, 1) <= resultsPerKey;
    }
}

View File

@ -0,0 +1,69 @@
package nu.marginalia.search.svc;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.db.DbDomainQueries;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
import spark.Spark;
import java.sql.SQLException;
/**
 * Handles the request where a user suggests that a known domain be added to
 * the crawl queue.
 */
public class SearchAddToCrawlQueueService {
    private final DbDomainQueries domainQueries;
    private final WebsiteUrl websiteUrl;
    private final HikariDataSource dataSource;
    private final Logger logger = LoggerFactory.getLogger(SearchAddToCrawlQueueService.class);

    @Inject
    public SearchAddToCrawlQueueService(DbDomainQueries domainQueries,
                                        WebsiteUrl websiteUrl,
                                        HikariDataSource dataSource) {
        this.domainQueries = domainQueries;
        this.websiteUrl = websiteUrl;
        this.dataSource = dataSource;
    }

    /**
     * Adds the domain identified by the {@code id} query parameter to the crawl
     * queue when the {@code nomisclick} parameter is "on", then redirects to the
     * domain's site page either way.
     * <p>
     * Halts with 400 when {@code id} is missing or not an integer, and with 404
     * when no domain has that id.
     *
     * @return an empty body; the response is a redirect
     * @throws SQLException if inserting into the crawl queue fails
     */
    public Object suggestCrawling(Request request, Response response) throws SQLException {
        logger.info("{}", request.queryParams());

        int id = parseDomainId(request.queryParams("id"));
        boolean nomisclick = "on".equals(request.queryParams("nomisclick"));

        String domainName = getDomainName(id);

        if (nomisclick) {
            logger.info("Adding {} to crawl queue", domainName);
            addToCrawlQueue(id);
        }
        else {
            logger.info("Nomisclick not set, not adding {} to crawl queue", domainName);
        }

        response.redirect(websiteUrl.withPath("/site/" + domainName));

        return "";
    }

    /** Parses the raw {@code id} parameter, turning a missing or malformed value into a 400 response. */
    private int parseDomainId(String rawId) {
        try {
            return Integer.parseInt(rawId);
        }
        catch (NumberFormatException ex) {
            Spark.halt(400);
            throw ex; // not reached: Spark.halt aborts the request by throwing
        }
    }

    /** Inserts the domain with the given id into CRAWL_QUEUE, ignoring the insert if the row already exists. */
    private void addToCrawlQueue(int id) throws SQLException {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("""
                     INSERT IGNORE INTO CRAWL_QUEUE(DOMAIN_NAME, SOURCE)
                     SELECT DOMAIN_NAME, "user" FROM EC_DOMAIN WHERE ID=?
                     """)) {
            stmt.setInt(1, id);
            stmt.executeUpdate();
        }
    }

    /** Looks up the domain name for an id, halting the request with 404 when the id is unknown. */
    private String getDomainName(int id) {
        var domain = domainQueries.getDomain(id);

        if (domain.isEmpty()) {
            // halt() throws, so the get() below only runs for a present value
            Spark.halt(404);
        }

        return domain.get().toString();
    }
}

View File

@ -0,0 +1,87 @@
package nu.marginalia.search.svc;
import com.google.inject.Inject;
import nu.marginalia.api.domains.DomainInfoClient;
import nu.marginalia.api.domains.model.SimilarDomain;
import nu.marginalia.browse.DbBrowseDomainsRandom;
import nu.marginalia.browse.model.BrowseResult;
import nu.marginalia.browse.model.BrowseResultSet;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.db.DomainBlacklist;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.search.results.BrowseResultCleaner;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import static java.util.Collections.shuffle;
/**
 * Produces the domain listings for the browse views: a random selection of
 * domains, or domains related to a given one.
 */
public class SearchBrowseService {
    private final DbBrowseDomainsRandom randomDomains;
    private final DbDomainQueries domainQueries;
    private final DomainBlacklist blacklist;
    private final DomainInfoClient domainInfoClient;
    private final BrowseResultCleaner browseResultCleaner;

    @Inject
    public SearchBrowseService(DbBrowseDomainsRandom randomDomains,
                               DbDomainQueries domainQueries,
                               DomainBlacklist blacklist,
                               DomainInfoClient domainInfoClient,
                               BrowseResultCleaner browseResultCleaner)
    {
        this.randomDomains = randomDomains;
        this.domainQueries = domainQueries;
        this.blacklist = blacklist;
        this.domainInfoClient = domainInfoClient;
        this.browseResultCleaner = browseResultCleaner;
    }

    /**
     * Fetches up to 25 random domains from the given set and drops the entries
     * rejected by the browse result cleaner.
     */
    public BrowseResultSet getRandomEntries(int set) {
        List<BrowseResult> results = randomDomains.getRandomDomains(25, blacklist, set);

        results.removeIf(browseResultCleaner.shouldRemoveResultPredicateBr());

        return new BrowseResultSet(results);
    }

    /**
     * Finds domains related to {@code domainName} that have a screenshot, and
     * returns them in random order.
     * <p>
     * Similar domains are tried first; when fewer than 25 of those have a
     * screenshot, linked domains are merged in as well. Each remote lookup is
     * given 100 ms to complete.
     *
     * @throws TimeoutException     if a lookup does not complete in time
     * @throws ExecutionException   if a lookup fails
     * @throws InterruptedException if interrupted while waiting for a lookup
     */
    public BrowseResultSet getRelatedEntries(String domainName) throws ExecutionException, InterruptedException, TimeoutException {
        var domain = domainQueries.getDomainId(new EdgeDomain(domainName));

        var neighbors = domainInfoClient.similarDomains(domain, 50)
                .get(100, TimeUnit.MILLISECONDS);

        neighbors.removeIf(sd -> !sd.screenshot());

        // If the results are very few, supplement with the linked-domains lookup
        if (neighbors.size() < 25) {
            // The set removes entries returned by both lookups
            Set<SimilarDomain> allNeighbors = new HashSet<>(neighbors);

            allNeighbors.addAll(domainInfoClient
                    .linkedDomains(domain, 50)
                    .get(100, TimeUnit.MILLISECONDS)
            );

            neighbors.clear();
            neighbors.addAll(allNeighbors);
            neighbors.removeIf(sd -> !sd.screenshot());
        }

        List<BrowseResult> results = new ArrayList<>(neighbors.size());
        for (SimilarDomain sd : neighbors) {
            var resultDomain = domainQueries.getDomain(sd.domainId());
            if (resultDomain.isEmpty())
                continue;

            results.add(new BrowseResult(resultDomain.get().toRootUrlHttp(), sd.domainId(), 0, sd.screenshot()));
        }

        // Shuffle the items for a less repetitive experience. This has to act on the
        // list that is returned; shuffling `neighbors` at this point would have no effect.
        shuffle(results);

        return new BrowseResultSet(results, domainName);
    }
}

View File

@ -0,0 +1,69 @@
package nu.marginalia.search.svc;
import com.google.inject.Inject;
import nu.marginalia.renderer.MustacheRenderer;
import nu.marginalia.renderer.RendererFactory;
import nu.marginalia.search.SearchOperator;
import nu.marginalia.search.model.UrlDetails;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
import java.io.IOException;
import java.sql.SQLException;
import java.util.List;
/**
 * Renders the "crosstalk" view for a pair of domains: the link search results
 * from the first domain to the second, and from the second to the first.
 */
public class SearchCrosstalkService {
    private static final Logger logger = LoggerFactory.getLogger(SearchCrosstalkService.class);
    private final SearchOperator searchOperator;
    private final MustacheRenderer<CrosstalkResult> renderer;

    @Inject
    public SearchCrosstalkService(SearchOperator searchOperator,
                                  RendererFactory rendererFactory) throws IOException
    {
        this.searchOperator = searchOperator;
        this.renderer = rendererFactory.renderer("search/site-info/site-crosstalk");
    }

    /**
     * Handles a crosstalk request. The {@code domains} query parameter must hold
     * exactly two comma-separated domain names; surrounding whitespace is trimmed.
     *
     * @return the rendered HTML
     * @throws IllegalArgumentException if {@code domains} is missing or does not
     *                                  contain exactly two entries
     */
    public Object handle(Request request, Response response) throws SQLException {
        String domains = request.queryParams("domains");
        String[] parts = StringUtils.split(domains, ',');

        // StringUtils.split returns null for a null input, i.e. when the parameter is absent
        if (parts == null || parts.length != 2) {
            throw new IllegalArgumentException("Expected exactly two domains");
        }

        response.type("text/html");

        for (int i = 0; i < parts.length; i++) {
            parts[i] = parts[i].trim();
        }

        var resAtoB = searchOperator.doLinkSearch(parts[0], parts[1]);
        var resBtoA = searchOperator.doLinkSearch(parts[1], parts[0]);

        var model = new CrosstalkResult(parts[0], parts[1], resAtoB, resBtoA);

        return renderer.render(model);
    }

    /** Template model: the two domains plus the link search results in each direction. */
    private record CrosstalkResult(String domainA,
                                   String domainB,
                                   List<UrlDetails> forward,
                                   List<UrlDetails> backward)
    {
        public boolean isFocusDomain() {
            return true; // Hack to get the search result templates behave well
        }

        /** True when both directions produced at least one result. */
        public boolean hasBoth() {
            return !forward.isEmpty() && !backward.isEmpty();
        }
    }
}

View File

@ -0,0 +1,47 @@
package nu.marginalia.search.svc;
import com.google.inject.Inject;
import nu.marginalia.index.api.IndexMqClient;
import nu.marginalia.renderer.MustacheRenderer;
import nu.marginalia.renderer.RendererFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
import java.io.IOException;
import java.util.Map;
/** Renders the error page shown when the search service cannot reach the index. */
public class SearchErrorPageService {
    private final IndexMqClient indexMqClient;
    private final Logger logger = LoggerFactory.getLogger(getClass());
    private final MustacheRenderer<Object> renderer;

    @Inject
    public SearchErrorPageService(IndexMqClient indexMqClient,
                                  RendererFactory rendererFactory) throws IOException {
        this.renderer = rendererFactory.renderer("search/error-page-search");
        this.indexMqClient = indexMqClient;
    }

    /** Sets the response body to the rendered "Internal error" page for the given request. */
    public void serveError(Request request, Response rsp) {
        final String explanation = """
                An error occurred when communicating with the search engine index.
                <p>
                This is hopefully a temporary state of affairs. It may be due to
                an upgrade. The index typically takes a about two or three minutes
                to reload from a cold restart. Thanks for your patience.
                """;

        final String page = renderError(request, "Internal error", explanation);

        rsp.body(page);
    }

    /**
     * Renders the error template with the given title and message, echoing back the
     * request's profile, js and query parameters (empty string when absent).
     */
    private String renderError(Request request, String title, String message) {
        String profile = request.queryParamOrDefault("profile", "");
        String js = request.queryParamOrDefault("js", "");
        String query = request.queryParamOrDefault("query", "");

        Map<String, String> model = Map.of(
                "title", title,
                "message", message,
                "profile", profile,
                "js", js,
                "query", query);

        return renderer.render(model);
    }
}

View File

@ -0,0 +1,85 @@
package nu.marginalia.search.svc;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;
/** Service for handling flagging sites. This code has an admin-facing correspondent in
* DomainComplaintService in control-service
*/
public class SearchFlagSiteService {
    private final HikariDataSource dataSource;

    // Stand-in used when a stored complaint has a category name that is not in the list below
    private final CategoryItem unknownCategory = new CategoryItem("unknown", "Unknown");

    private final List<CategoryItem> categories =
            List.of(
                    new CategoryItem("spam", "Spam"),
                    new CategoryItem("freebooting", "Reposting Stolen Content"),
                    new CategoryItem("broken", "Broken Website"),
                    new CategoryItem("shock", "Shocking/Offensive"),
                    new CategoryItem("blacklist", "Review Blacklisting"),
                    new CategoryItem("no-random", "Remove from Random Exploration")
            );

    // Category name -> category, for translating stored names into descriptions
    private final Map<String, CategoryItem> categoryItemMap =
            categories.stream()
                    .collect(Collectors.toMap(CategoryItem::categoryName, Function.identity()));

    @Inject
    public SearchFlagSiteService(HikariDataSource dataSource) {
        this.dataSource = dataSource;
    }

    /** The complaint categories a user can choose from. */
    public List<CategoryItem> getCategories() {
        return categories;
    }

    /**
     * Loads every complaint filed against the given domain id.
     * The stored category name is replaced by its description, or by "Unknown"
     * when the name is not one of the known categories.
     *
     * @throws SQLException if the query fails
     */
    public List<FlagSiteComplaintModel> getExistingComplaints(int id) throws SQLException {
        try (var conn = dataSource.getConnection();
             var complaintsStmt = conn.prepareStatement("""
                     SELECT CATEGORY, FILE_DATE, REVIEWED, DECISION
                     FROM DOMAIN_COMPLAINT
                     WHERE DOMAIN_ID=?
                     """))
        {
            complaintsStmt.setInt(1, id);

            List<FlagSiteComplaintModel> found = new ArrayList<>();
            ResultSet rows = complaintsStmt.executeQuery();

            while (rows.next()) {
                CategoryItem category = categoryItemMap.getOrDefault(rows.getString(1), unknownCategory);
                String submitTime = rows.getString(2);
                boolean reviewed = rows.getBoolean(3);
                String decision = rows.getString(4);

                found.add(new FlagSiteComplaintModel(category.categoryDesc(), submitTime, reviewed, decision));
            }

            return found;
        }
    }

    /**
     * Stores a new complaint built from the submitted form data.
     *
     * @throws SQLException if the insert fails
     */
    public void insertComplaint(FlagSiteFormData formData) throws SQLException {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement(
                     """
                     INSERT INTO DOMAIN_COMPLAINT(DOMAIN_ID, CATEGORY, DESCRIPTION, SAMPLE) VALUES (?, ?, ?, ?)
                     """)) {
            stmt.setInt(1, formData.domainId());
            stmt.setString(2, formData.category());
            stmt.setString(3, formData.description());
            stmt.setString(4, formData.sampleQuery());
            stmt.executeUpdate();
        }
    }

    /** A complaint category: its stored name and its human-readable description. */
    public record CategoryItem(String categoryName, String categoryDesc) {}

    /** One existing complaint as presented to the user. */
    public record FlagSiteComplaintModel(String category, String submitTime, boolean isReviewed, String decision) {}

    /** The data submitted through the flag-site form. */
    public record FlagSiteFormData(int domainId, String category, String description, String sampleQuery) {}
}

View File

@ -0,0 +1,117 @@
package nu.marginalia.search.svc;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.renderer.MustacheRenderer;
import nu.marginalia.renderer.RendererFactory;
import nu.marginalia.search.svc.SearchQueryCountService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
import java.io.IOException;
import java.sql.SQLException;
import java.time.LocalDate;
import java.time.ZoneId;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
/** Renders the front page (index) */
@Singleton
public class SearchFrontPageService {
private final MustacheRenderer<IndexModel> template;
private final HikariDataSource dataSource;
private final SearchQueryCountService searchVisitorCount;
private final Logger logger = LoggerFactory.getLogger(getClass());
/**
 * Wires up the front page service and loads the "search/index/index" template.
 *
 * @throws IOException if the renderer factory fails to provide the template
 */
@Inject
public SearchFrontPageService(RendererFactory rendererFactory,
                              HikariDataSource dataSource,
                              SearchQueryCountService searchVisitorCount
) throws IOException {
    this.dataSource = dataSource;
    this.searchVisitorCount = searchVisitorCount;
    this.template = rendererFactory.renderer("search/index/index");
}
public String render(Request request, Response response) {
response.header("Cache-control", "public,max-age=3600");
return template.render(new IndexModel(
getNewsItems(),
searchVisitorCount.getQueriesPerMinute()
));
}
private List<NewsItem> getNewsItems() {
List<NewsItem> items = new ArrayList<>();
try (var conn = dataSource.getConnection();
var stmt = conn.prepareStatement("""
SELECT TITLE, LINK, SOURCE, LIST_DATE FROM SEARCH_NEWS_FEED ORDER BY LIST_DATE DESC
""")) {
var rep = stmt.executeQuery();
while (rep.next()) {
items.add(new NewsItem(
rep.getString(1),
rep.getString(2),
rep.getString(3),
rep.getDate(4).toLocalDate()));
}
}
catch (SQLException ex) {
logger.warn("Failed to fetch news items", ex);
}
return items;
}
public Object renderNewsFeed(Request request, Response response) {
List<NewsItem> newsItems = getNewsItems();
StringBuilder sb = new StringBuilder();
sb.append("""
<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>Marginalia Search News and Mentions</title>
<link>https://search.marginalia.nu/</link>
<description>News and Mentions of Marginalia Search</description>
<language>en-us</language>
<ttl>60</ttl>
""");
sb.append("<lastBuildDate>").append(ZonedDateTime.now().format(DateTimeFormatter.RFC_1123_DATE_TIME)).append("</lastBuildDate>\n");
sb.append("<pubDate>").append(ZonedDateTime.now().format(DateTimeFormatter.RFC_1123_DATE_TIME)).append("</pubDate>\n");
sb.append("<ttl>60</ttl>\n");
for (var item : newsItems) {
sb.append("<item>\n");
sb.append("<title>").append(item.title()).append("</title>\n");
sb.append("<link>").append(item.url()).append("</link>\n");
if (item.source != null) {
sb.append("<author>").append(item.source()).append("</author>\n");
}
sb.append("<pubDate>").append(item.date().atStartOfDay().atZone(ZoneId.systemDefault()).format(DateTimeFormatter.RFC_1123_DATE_TIME)).append("</pubDate>\n");
sb.append("</item>\n");
}
sb.append("</channel>\n");
sb.append("</rss>\n");
response.type("application/rss+xml");
return sb.toString();
}
private record IndexModel(List<NewsItem> news, int searchPerMinute) { }
private record NewsItem(String title, String url, String source, LocalDate date) {}
}

View File

@ -0,0 +1,48 @@
package nu.marginalia.search.svc;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.inject.Singleton;
import java.time.temporal.ChronoUnit;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Keeps per-minute statistics of queries.
 *
 * <p>Queries are tallied through {@link #registerQuery()}. A daemon thread started
 * by the constructor publishes the tally once a minute and resets it to zero, so
 * {@link #getQueriesPerMinute()} reports the count of the most recently completed interval.
 */
@Singleton
public class SearchQueryCountService {
    private final Logger logger = LoggerFactory.getLogger(getClass());

    /** Running tally for the interval currently being counted. */
    private final AtomicInteger lastMinuteQueries = new AtomicInteger();

    /** Tally of the last completed interval; written only by the update thread. */
    private volatile int queriesPerMinute;

    public SearchQueryCountService() {
        Thread updateThread = new Thread(this::updateQueriesPerMinute,
                "SearchVisitorCountService::updateQueriesPerMinute");
        updateThread.setDaemon(true);
        updateThread.start();
    }

    /** Retrieve the number of queries performed the minute before this one */
    public int getQueriesPerMinute() {
        return queriesPerMinute;
    }

    /** Update query statistics for presentation */
    public void registerQuery() {
        lastMinuteQueries.incrementAndGet();
    }

    /**
     * Body of the update thread: publish-and-reset the tally, then sleep a minute,
     * until the thread is interrupted.
     */
    private void updateQueriesPerMinute() {
        try {
            for (;;) {
                queriesPerMinute = lastMinuteQueries.getAndSet(0);
                TimeUnit.MINUTES.sleep(1);
            }
        } catch (InterruptedException e) {
            logger.warn("Query counter thread was interrupted");
            // Restore the interrupt status so it stays observable after the loop exits
            Thread.currentThread().interrupt();
        }
    }
}

View File

@ -0,0 +1,62 @@
package nu.marginalia.search.svc;
import com.google.inject.Inject;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.search.command.CommandEvaluator;
import nu.marginalia.search.command.SearchParameters;
import nu.marginalia.search.exceptions.RedirectException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
/**
 * Entry point for search requests: extracts the query from the request and
 * hands it to the command evaluator, translating failures into a redirect
 * or an error page.
 */
public class SearchQueryService {

    private final Logger logger = LoggerFactory.getLogger(getClass());

    private final WebsiteUrl websiteUrl;
    private final SearchErrorPageService errorPageService;
    private final CommandEvaluator commandEvaluator;

    @Inject
    public SearchQueryService(
            WebsiteUrl websiteUrl,
            SearchErrorPageService errorPageService,
            CommandEvaluator searchCommandEvaulator) {
        this.websiteUrl = websiteUrl;
        this.errorPageService = errorPageService;
        this.commandEvaluator = searchCommandEvaulator;
    }

    /**
     * Evaluates the search request.
     *
     * @return the evaluator's result, or an empty string when the request
     *         ended in a redirect or in the error page
     */
    public Object pathSearch(Request request, Response response) {
        try {
            SearchParameters parameters = parseParameters(request);
            return commandEvaluator.eval(response, parameters);
        }
        catch (RedirectException redirect) {
            response.redirect(redirect.newUrl);
            return "";
        }
        catch (Exception failure) {
            logger.error("Error", failure);
            errorPageService.serveError(request, response);
            return "";
        }
    }

    /**
     * Builds the search parameters from the "query" request parameter.
     *
     * @throws RedirectException pointing at the website URL when the query is
     *         missing or blank, or when anything goes wrong while parsing
     */
    private SearchParameters parseParameters(Request request) {
        try {
            final String rawQuery = request.queryParams("query");
            final boolean hasQuery = rawQuery != null && !rawQuery.isBlank();

            if (hasQuery) {
                return new SearchParameters(rawQuery.trim(), request);
            }

            throw new RedirectException(websiteUrl.url());
        }
        catch (Exception ex) {
            // Bad requests from bots are frequent; they are deliberately not logged,
            // since doing so would flood the logs. Send the client to the front page.
            throw new RedirectException(websiteUrl.url());
        }
    }
}

View File

@ -0,0 +1,416 @@
package nu.marginalia.search.svc;
import com.google.inject.Inject;
import nu.marginalia.api.domains.DomainInfoClient;
import nu.marginalia.api.domains.model.DomainInformation;
import nu.marginalia.api.domains.model.SimilarDomain;
import nu.marginalia.api.feeds.FeedsClient;
import nu.marginalia.api.feeds.RpcFeed;
import nu.marginalia.api.feeds.RpcFeedItem;
import nu.marginalia.api.livecapture.LiveCaptureClient;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.renderer.MustacheRenderer;
import nu.marginalia.renderer.RendererFactory;
import nu.marginalia.screenshot.ScreenshotService;
import nu.marginalia.search.SearchOperator;
import nu.marginalia.search.model.UrlDetails;
import nu.marginalia.search.svc.SearchFlagSiteService.FlagSiteFormData;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
import java.io.IOException;
import java.sql.SQLException;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.function.Supplier;
/**
 * Builds and renders the per-domain site pages.
 *
 * <p>The {@code view} query parameter selects between the "info" (default),
 * "links", "docs" and "report" views; all of them are rendered with the
 * {@code search/site-info/site-info} template.
 */
public class SearchSiteInfoService {
    private static final Logger logger = LoggerFactory.getLogger(SearchSiteInfoService.class);

    /** Upper bound on screenshot requests issued while building one info view. */
    private static final int MAX_SCREENSHOT_REQUESTS_PER_VIEW = 5;

    private final SearchOperator searchOperator;
    private final DomainInfoClient domainInfoClient;
    private final SearchFlagSiteService flagSiteService;
    private final DbDomainQueries domainQueries;
    private final MustacheRenderer<Object> renderer;
    private final FeedsClient feedsClient;
    private final LiveCaptureClient liveCaptureClient;
    private final ScreenshotService screenshotService;

    @Inject
    public SearchSiteInfoService(SearchOperator searchOperator,
                                 DomainInfoClient domainInfoClient,
                                 RendererFactory rendererFactory,
                                 SearchFlagSiteService flagSiteService,
                                 DbDomainQueries domainQueries,
                                 FeedsClient feedsClient,
                                 LiveCaptureClient liveCaptureClient,
                                 ScreenshotService screenshotService) throws IOException
    {
        this.searchOperator = searchOperator;
        this.domainInfoClient = domainInfoClient;
        this.flagSiteService = flagSiteService;
        this.domainQueries = domainQueries;

        this.renderer = rendererFactory.renderer("search/site-info/site-info");

        this.feedsClient = feedsClient;
        this.liveCaptureClient = liveCaptureClient;
        this.screenshotService = screenshotService;
    }

    /**
     * Renders the requested view for the domain given by the {@code site} path parameter.
     *
     * @return the rendered page, or {@code null} if no domain name was given
     * @throws SQLException if the report view fails to read complaint data
     */
    public Object handle(Request request, Response response) throws SQLException {
        String domainName = request.params("site");
        String view = request.queryParamOrDefault("view", "info");

        if (null == domainName || domainName.isBlank()) {
            return null;
        }

        // "info" and any unrecognized view both fall through to the info view
        var model = switch (view) {
            case "links" -> listLinks(domainName);
            case "docs" -> listDocs(domainName);
            case "report" -> reportSite(domainName);
            default -> listInfo(domainName);
        };

        return renderer.render(model);
    }

    /**
     * Accepts a submitted complaint (only for {@code view=report}), stores it and
     * renders the report view with the updated complaint list.
     *
     * @return the rendered page, or {@code null} if the domain name is missing
     *         or the view is not "report"
     * @throws SQLException if storing or reading complaints fails
     */
    public Object handlePost(Request request, Response response) throws SQLException {
        String domainName = request.params("site");
        String view = request.queryParamOrDefault("view", "info");

        if (null == domainName || domainName.isBlank()) {
            return null;
        }

        if (!view.equals("report")) {
            return null;
        }

        // NOTE(review): getDomainId is declared to throw NoSuchElementException;
        // presumably that happens for domains not in the database -- confirm the
        // caller turns that into a sensible response.
        final int domainId = domainQueries.getDomainId(new EdgeDomain(domainName));

        FlagSiteFormData formData = new FlagSiteFormData(
                domainId,
                request.queryParams("category"),
                request.queryParams("description"),
                request.queryParams("sampleQuery")
        );
        flagSiteService.insertComplaint(formData);

        var complaints = flagSiteService.getExistingComplaints(domainId);

        var model = new ReportDomain(domainName, domainId, complaints, List.of(), true);

        return renderer.render(model);
    }

    /** Builds the model for the report view: existing complaints plus the available categories. */
    private Object reportSite(String domainName) throws SQLException {
        int domainId = domainQueries.getDomainId(new EdgeDomain(domainName));
        var existingComplaints = flagSiteService.getExistingComplaints(domainId);

        return new ReportDomain(domainName,
                domainId,
                existingComplaints,
                flagSiteService.getCategories(),
                false);
    }

    /** Builds the model for the links view; the domain id is -1 if the lookup yields nothing. */
    private Backlinks listLinks(String domainName) {
        return new Backlinks(domainName,
                domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1),
                searchOperator.doBacklinkSearch(domainName));
    }

    /**
     * Builds the model for the info view.
     *
     * <p>Domain information, similar domains, linking domains and feed items are
     * requested as futures; when the domain id is unknown or the domain info client
     * is not accepting requests, pre-failed futures are used instead so that the
     * fallback values apply.
     */
    private SiteInfoWithContext listInfo(String domainName) {

        final int domainId = domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1);

        final Future<DomainInformation> domainInfoFuture;
        final Future<List<SimilarDomain>> similarSetFuture;
        final Future<List<SimilarDomain>> linkingDomainsFuture;
        final CompletableFuture<RpcFeed> feedItemsFuture;

        String url = "https://" + domainName + "/";

        boolean hasScreenshot = screenshotService.hasScreenshot(domainId);

        if (domainId < 0) {
            domainInfoFuture = CompletableFuture.failedFuture(new Exception("Unknown Domain ID"));
            similarSetFuture = CompletableFuture.failedFuture(new Exception("Unknown Domain ID"));
            linkingDomainsFuture = CompletableFuture.failedFuture(new Exception("Unknown Domain ID"));
            feedItemsFuture = CompletableFuture.failedFuture(new Exception("Unknown Domain ID"));
        }
        else if (!domainInfoClient.isAccepting()) {
            domainInfoFuture = CompletableFuture.failedFuture(new Exception("Assistant Service Unavailable"));
            similarSetFuture = CompletableFuture.failedFuture(new Exception("Assistant Service Unavailable"));
            linkingDomainsFuture = CompletableFuture.failedFuture(new Exception("Assistant Service Unavailable"));
            feedItemsFuture = CompletableFuture.failedFuture(new Exception("Assistant Service Unavailable"));
        }
        else {
            domainInfoFuture = domainInfoClient.domainInformation(domainId);
            similarSetFuture = domainInfoClient.similarDomains(domainId, 25);
            linkingDomainsFuture = domainInfoClient.linkedDomains(domainId, 25);
            feedItemsFuture = feedsClient.getFeed(domainId);
        }

        List<UrlDetails> sampleResults = searchOperator.doSiteSearch(domainName, domainId, 5);
        if (!sampleResults.isEmpty()) {
            // Derive the site URL from the first sample result, with path "/" and no parameters
            url = sampleResults.getFirst().url.withPathAndParam("/", null).toString();
        }

        var result = new SiteInfoWithContext(domainName,
                domainId,
                url,
                hasScreenshot,
                waitForFuture(domainInfoFuture, () -> createDummySiteInfo(domainName)),
                waitForFuture(similarSetFuture, List::of),
                waitForFuture(linkingDomainsFuture, List::of),
                waitForFuture(feedItemsFuture.thenApply(FeedItems::new), () -> FeedItems.dummyValue(domainName)),
                sampleResults
        );

        requestMissingScreenshots(result);

        return result;
    }

    /** Request missing screenshots for the given site info */
    private void requestMissingScreenshots(SiteInfoWithContext result) {

        // Always request the main site screenshot, even if we already have it
        // as this will make the live-capture do a staleness check and update
        // as needed.
        // NOTE(review): this is also issued when domainId is -1 (unknown domain);
        // confirm the live-capture client tolerates that.
        liveCaptureClient.requestScreengrab(result.domainId());

        int requests = 1;

        // Request screenshots for similar and linking domains only if they are absent
        // also throttle the requests to at most 5 per view.
        requests = requestAbsentScreenshots(result.similar(), requests);
        requestAbsentScreenshots(result.linking(), requests);
    }

    /**
     * Requests screenshots for the domains in the list that lack one, stopping once
     * the per-view request budget is exhausted.
     *
     * @param domains      the candidate domains, may be null
     * @param requestsSoFar number of requests already counted for this view
     * @return the updated request count
     */
    private int requestAbsentScreenshots(List<SimilarDomain> domains, int requestsSoFar) {
        int requests = requestsSoFar;

        if (domains == null) {
            return requests;
        }

        for (var candidate : domains) {
            if (candidate.screenshot()) {
                continue;
            }
            if (++requests > MAX_SCREENSHOT_REQUESTS_PER_VIEW) {
                break;
            }
            liveCaptureClient.requestScreengrab(candidate.domainId());
        }

        return requests;
    }

    /**
     * Waits up to 250 ms for the future and returns its value, or the fallback
     * value if the future fails, times out, or the wait is interrupted.
     */
    private <T> T waitForFuture(Future<T> future, Supplier<T> fallback) {
        try {
            return future.get(250, TimeUnit.MILLISECONDS);
        } catch (Exception e) {
            if (e instanceof InterruptedException) {
                // Keep the interrupt visible to the rest of the request handling
                Thread.currentThread().interrupt();
            }
            // toString() rather than getMessage(): a TimeoutException has no message
            logger.info("Failed to get domain data: {}", e.toString());
            return fallback.get();
        }
    }

    /** Placeholder domain information used when the real information is unavailable. */
    private DomainInformation createDummySiteInfo(String domainName) {
        return DomainInformation.builder()
                .domain(new EdgeDomain(domainName))
                .suggestForCrawling(true)
                .unknownDomain(true)
                .build();
    }

    /** Builds the model for the docs view; the domain id is -1 if the lookup yields nothing. */
    private Docs listDocs(String domainName) {
        int domainId = domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1);

        // Reuse the id resolved above instead of looking it up a second time
        return new Docs(domainName,
                domainId,
                searchOperator.doSiteSearch(domainName, domainId, 100));
    }

    /** Model for the docs view. */
    public record Docs(Map<String, Boolean> view,
                       String domain,
                       long domainId,
                       List<UrlDetails> results) {
        public Docs(String domain, long domainId, List<UrlDetails> results) {
            this(Map.of("docs", true), domain, domainId, results);
        }

        public String focusDomain() { return domain; }

        public String query() { return "site:" + domain; }

        public boolean isKnown() {
            return domainId > 0;
        }
    }

    /** Model for the links view. */
    public record Backlinks(Map<String, Boolean> view, String domain, long domainId, List<UrlDetails> results) {
        public Backlinks(String domain, long domainId, List<UrlDetails> results) {
            this(Map.of("links", true), domain, domainId, results);
        }

        public String query() { return "links:" + domain; }

        public boolean isKnown() {
            return domainId > 0;
        }
    }

    /** Model for the info view. */
    public record SiteInfoWithContext(Map<String, Boolean> view,
                                      Map<String, Boolean> domainState,
                                      String domain,
                                      int domainId,
                                      String siteUrl,
                                      boolean hasScreenshot,
                                      DomainInformation domainInformation,
                                      List<SimilarDomain> similar,
                                      List<SimilarDomain> linking,
                                      FeedItems feed,
                                      List<UrlDetails> samples
    ) {
        public SiteInfoWithContext(String domain,
                                   int domainId,
                                   String siteUrl,
                                   boolean hasScreenshot,
                                   DomainInformation domainInformation,
                                   List<SimilarDomain> similar,
                                   List<SimilarDomain> linking,
                                   FeedItems feedInfo,
                                   List<UrlDetails> samples
        )
        {
            this(Map.of("info", true),
                    Map.of(domainInfoState(domainInformation), true),
                    domain,
                    domainId,
                    siteUrl,
                    hasScreenshot,
                    domainInformation,
                    similar,
                    linking,
                    feedInfo,
                    samples);
        }

        public String getLayout() {
            // My CSS is too weak to handle this in CSS alone, so I guess we're doing layout in Java...
            if (similar != null && similar.size() < 25) {
                return "lopsided";
            }
            else if (feed != null && !feed.items().isEmpty()) {
                return "lopsided";
            }
            else if (samples != null && !samples.isEmpty()) {
                return "lopsided";
            }
            else {
                return "balanced";
            }
        }

        public String query() { return "site:" + domain; }

        /** Maps the domain information to a single state key; the first matching rule wins. */
        private static String domainInfoState(DomainInformation info) {
            if (info.isBlacklisted()) {
                return "blacklisted";
            }
            if (!info.isUnknownDomain() && info.isSuggestForCrawling()) {
                return "suggestForCrawling";
            }
            if (info.isInCrawlQueue()) {
                return "inCrawlQueue";
            }
            if (info.isUnknownDomain()) {
                return "unknownDomain";
            }
            else {
                return "indexed";
            }
        }

        public boolean isKnown() {
            return domainId > 0;
        }
    }

    /** A single feed entry in the form used by the info view. */
    public record FeedItem(String title, String date, String description, String url) {
        public FeedItem(RpcFeedItem rpcFeedItem) {
            this(rpcFeedItem.getTitle(),
                    rpcFeedItem.getDate(),
                    rpcFeedItem.getDescription(),
                    rpcFeedItem.getUrl());
        }

        public String pubDay() { // Extract the date from an ISO style date string
            if (date.length() > 10) {
                return date.substring(0, 10);
            }
            return date;
        }

        /** The description with angle brackets replaced by their HTML entities. */
        public String descriptionSafe() {
            return description
                    .replace("<", "&lt;")
                    .replace(">", "&gt;");
        }
    }

    /** The feed of a domain in the form used by the info view. */
    public record FeedItems(String domain, String feedUrl, String updated, List<FeedItem> items) {

        /** An empty feed for the given domain, used when no feed data is available. */
        public static FeedItems dummyValue(String domain) {
            return new FeedItems(domain, "", "", List.of());
        }

        public FeedItems(RpcFeed rpcFeedItems) {
            this(rpcFeedItems.getDomain(),
                    rpcFeedItems.getFeedUrl(),
                    rpcFeedItems.getUpdated(),
                    rpcFeedItems.getItemsList().stream().map(FeedItem::new).toList());
        }
    }

    /** Model for the report view. */
    public record ReportDomain(
            Map<String, Boolean> view,
            String domain,
            int domainId,
            List<SearchFlagSiteService.FlagSiteComplaintModel> complaints,
            List<SearchFlagSiteService.CategoryItem> category,
            boolean submitted)
    {
        public ReportDomain(String domain,
                            int domainId,
                            List<SearchFlagSiteService.FlagSiteComplaintModel> complaints,
                            List<SearchFlagSiteService.CategoryItem> category,
                            boolean submitted) {
            this(Map.of("report", true), domain, domainId, complaints, category, submitted);
        }

        public String query() { return "site:" + domain; }

        public boolean isKnown() {
            return domainId > 0;
        }
    }
}

View File

@ -0,0 +1,73 @@
package nu.marginalia.search.svc;
import nu.marginalia.api.math.MathClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.CheckForNull;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import java.util.Optional;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.function.Predicate;
import java.util.regex.Pattern;
/**
 * Recognizes unit-conversion queries ("&lt;expression&gt; &lt;unit&gt; in &lt;unit&gt;") and plain
 * math expressions, and delegates their evaluation to the math client.
 */
@Singleton
public class SearchUnitConversionService {
    private final Logger logger = LoggerFactory.getLogger(getClass());

    /** Matches "value from-unit in to-unit"; groups 1, 3 and 4 are value, source unit and target unit. */
    private final Pattern conversionPattern = Pattern.compile("((\\d+|\\s+|[.()\\-^+%*/]|log[^a-z]|log2[^a-z]|sqrt[^a-z]|log10|cos[^a-z]|sin[^a-z]|tan[^a-z]|log2|pi[^a-z]|e[^a-z]|2pi[^a-z])+)\\s*([a-zA-Z][a-zA-Z^.0-9]*\\s?[a-zA-Z^.0-9]*)\\s+in\\s+([a-zA-Z^.0-9]+\\s?[a-zA-Z^.0-9]*)");

    /** Accepts strings made up solely of digits, whitespace, operators and the listed function/constant names. */
    private final Predicate<String> evalPredicate = Pattern.compile("(\\d+|\\s+|[.()\\-^+%*/]|log|log2|sqrt|log10|cos|sin|tan|pi|e|2pi)+").asMatchPredicate();

    private final MathClient mathClient;

    @Inject
    public SearchUnitConversionService(MathClient mathClient) {
        this.mathClient = mathClient;
    }

    /**
     * Attempts to interpret the query as a unit conversion.
     *
     * @param query the raw search query
     * @return the conversion result, or empty if the query is not a conversion,
     *         or the conversion failed, timed out (250 ms) or was interrupted
     */
    public Optional<String> tryConversion(String query) {
        var matcher = conversionPattern.matcher(query);
        if (!matcher.matches()) {
            return Optional.empty();
        }

        String value = matcher.group(1);
        String from = matcher.group(3);
        String to = matcher.group(4);

        logger.info("{} -> '{}' '{}' '{}'", query, value, from, to);

        try {
            var resultFuture = mathClient.unitConversion(value, from, to);

            return Optional.of(
                    resultFuture.get(250, TimeUnit.MILLISECONDS)
            );
        } catch (ExecutionException e) {
            logger.error("Error in unit conversion", e);
        } catch (InterruptedException e) {
            logger.error("Interrupted while waiting for unit conversion", e);
            // Restore the interrupt status so the calling thread can still observe it
            Thread.currentThread().interrupt();
        } catch (TimeoutException ignored) {
            // A slow conversion is not an error; the search simply proceeds without it
        }

        return Optional.empty();
    }

    /**
     * Attempts to interpret the query as a math expression.
     *
     * @param query the raw search query
     * @return a future for the evaluated expression, or {@code null} if the query
     *         is not an expression or consists only of digits
     */
    public @CheckForNull Future<String> tryEval(String query) {
        if (!evalPredicate.test(query)) {
            return null;
        }

        var expr = query.toLowerCase().trim();

        // A bare number is not worth evaluating
        if (expr.chars().allMatch(Character::isDigit)) {
            return null;
        }

        logger.info("eval({})", expr);

        return mathClient.evalMath(expr);
    }
}

View File

@ -0,0 +1,3 @@
# Search Service
This is the old search service, which serves search traffic using the old GUI.

View File

@ -280,6 +280,7 @@ public class SearchServicePaperDoll extends AbstractModule {
true,
true,
true,
true,
SimilarDomain.LinkType.FOWARD
));
dummyLinks.add(new SimilarDomain(
@ -290,6 +291,7 @@ public class SearchServicePaperDoll extends AbstractModule {
false,
false,
true,
true,
SimilarDomain.LinkType.BACKWARD
));
dummyLinks.add(new SimilarDomain(
@ -300,6 +302,7 @@ public class SearchServicePaperDoll extends AbstractModule {
false,
false,
false,
false,
SimilarDomain.LinkType.BIDIRECTIONAL
));

Some files were not shown because too many files have changed in this diff Show More