From 1b27c5cf068076621b1337ca97daff938a9a9fa0 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 2 Jan 2025 18:02:17 +0100 Subject: [PATCH] (search) Add a copy of the old UI as a separate service, `search-service-legacy` --- .../search-service-legacy/build.gradle | 94 ++ .../search/SearchHandlebarsConfigurator.java | 12 + .../java/nu/marginalia/search/SearchMain.java | 47 + .../nu/marginalia/search/SearchModule.java | 20 + .../nu/marginalia/search/SearchOperator.java | 266 ++++++ .../search/SearchQueryParamFactory.java | 104 +++ .../search/SearchResultClusterer.java | 53 ++ .../nu/marginalia/search/SearchService.java | 128 +++ .../search/command/CommandEvaluator.java | 43 + .../search/command/SearchAdtechParameter.java | 29 + .../command/SearchCommandInterface.java | 10 + .../search/command/SearchJsParameter.java | 31 + .../search/command/SearchParameters.java | 106 +++ .../search/command/SearchRecentParameter.java | 21 + .../search/command/SearchTitleParameter.java | 21 + .../search/command/commands/BangCommand.java | 104 +++ .../command/commands/BrowseCommand.java | 78 ++ .../command/commands/ConvertCommand.java | 36 + .../command/commands/DefinitionCommand.java | 70 ++ .../command/commands/SearchCommand.java | 39 + .../command/commands/SiteRedirectCommand.java | 50 ++ .../search/db/DbNearDomainsQuery.java | 66 ++ .../search/exceptions/RedirectException.java | 14 + .../search/model/ClusteredUrlDetails.java | 102 +++ .../search/model/DecoratedSearchResults.java | 186 ++++ .../search/model/SearchFilters.java | 223 +++++ .../search/model/SearchProfile.java | 105 +++ .../marginalia/search/model/UrlDetails.java | 293 ++++++ .../search/results/BrowseResultCleaner.java | 27 + .../search/results/UrlDeduplicator.java | 69 ++ .../svc/SearchAddToCrawlQueueService.java | 69 ++ .../search/svc/SearchBrowseService.java | 87 ++ .../search/svc/SearchCrosstalkService.java | 69 ++ .../search/svc/SearchErrorPageService.java | 47 + .../search/svc/SearchFlagSiteService.java | 85 
++ .../search/svc/SearchFrontPageService.java | 117 +++ .../search/svc/SearchQueryCountService.java | 48 + .../search/svc/SearchQueryIndexService.java | 0 .../search/svc/SearchQueryService.java | 62 ++ .../search/svc/SearchSiteInfoService.java | 416 +++++++++ .../svc/SearchUnitConversionService.java | 73 ++ .../search-service-legacy/readme.md | 3 + .../resources/static/search/crawler-ips.txt | 14 + .../resources/static/search/favicon.ico | Bin 0 -> 1211 bytes .../resources/static/search/main.js | 13 + .../resources/static/search/menu.js | 91 ++ .../resources/static/search/opensearch.xml | 15 + .../resources/static/search/robots.txt | 8 + .../resources/static/search/rss.svg | 17 + .../resources/static/search/serp.scss | 831 ++++++++++++++++++ .../resources/static/search/theme.js | 57 ++ .../resources/static/search/tts.js | 112 +++ .../templates/search/browse-result.hdb | 12 + .../templates/search/browse-results.hdb | 34 + .../templates/search/conversion-results.hdb | 23 + .../templates/search/dictionary-results.hdb | 40 + .../templates/search/error-page-search.hdb | 24 + .../resources/templates/search/error-page.hdb | 20 + .../templates/search/index/index-about.hdb | 22 + .../templates/search/index/index-news.hdb | 17 + .../templates/search/index/index-redesign.hdb | 14 + .../templates/search/index/index-tips.hdb | 21 + .../templates/search/index/index.hdb | 31 + .../templates/search/parts/search-filters.hdb | 46 + .../templates/search/parts/search-footer.hdb | 124 +++ .../templates/search/parts/search-form.hdb | 18 + .../templates/search/parts/search-header.hdb | 21 + .../search/parts/search-result-rest.hdb | 32 + .../templates/search/parts/search-result.hdb | 22 + .../templates/search/search-results.hdb | 75 ++ .../search/site-info/site-crosstalk.hdb | 40 + .../search/site-info/site-info-feed.hdb | 22 + .../site-info/site-info-index-blacklisted.hdb | 8 + .../site-info/site-info-index-indexed.hdb | 13 + .../site-info/site-info-index-suggest.hdb | 12 + 
.../site-info/site-info-index-unknown.hdb | 9 + .../search/site-info/site-info-index.hdb | 23 + .../search/site-info/site-info-links.hdb | 7 + .../search/site-info/site-info-report.hdb | 60 ++ .../search/site-info/site-info-summary.hdb | 124 +++ .../templates/search/site-info/site-info.hdb | 58 ++ .../command/commands/BangCommandTest.java | 52 ++ .../paperdoll/SearchServicePaperDoll.java | 359 ++++++++ .../marginalia/util/TestLanguageModels.java | 37 + .../nu/marginalia/search/SearchService.java | 83 +- settings.gradle | 1 + 86 files changed, 6103 insertions(+), 82 deletions(-) create mode 100644 code/services-application/search-service-legacy/build.gradle create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/SearchHandlebarsConfigurator.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/SearchMain.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/SearchModule.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/SearchOperator.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/SearchQueryParamFactory.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/SearchResultClusterer.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/SearchService.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/command/CommandEvaluator.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/command/SearchAdtechParameter.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/command/SearchCommandInterface.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/command/SearchJsParameter.java create mode 
100644 code/services-application/search-service-legacy/java/nu/marginalia/search/command/SearchParameters.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/command/SearchRecentParameter.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/command/SearchTitleParameter.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/command/commands/BangCommand.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/command/commands/BrowseCommand.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/command/commands/ConvertCommand.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/command/commands/DefinitionCommand.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/command/commands/SearchCommand.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/command/commands/SiteRedirectCommand.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/db/DbNearDomainsQuery.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/exceptions/RedirectException.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/model/ClusteredUrlDetails.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/model/DecoratedSearchResults.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/model/SearchFilters.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/model/SearchProfile.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/model/UrlDetails.java 
create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/results/BrowseResultCleaner.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/results/UrlDeduplicator.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchAddToCrawlQueueService.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchBrowseService.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchCrosstalkService.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchErrorPageService.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchFlagSiteService.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchFrontPageService.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchQueryCountService.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchQueryIndexService.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchQueryService.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchSiteInfoService.java create mode 100644 code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchUnitConversionService.java create mode 100644 code/services-application/search-service-legacy/readme.md create mode 100644 code/services-application/search-service-legacy/resources/static/search/crawler-ips.txt create mode 100644 code/services-application/search-service-legacy/resources/static/search/favicon.ico create mode 100644 
code/services-application/search-service-legacy/resources/static/search/main.js create mode 100644 code/services-application/search-service-legacy/resources/static/search/menu.js create mode 100644 code/services-application/search-service-legacy/resources/static/search/opensearch.xml create mode 100644 code/services-application/search-service-legacy/resources/static/search/robots.txt create mode 100644 code/services-application/search-service-legacy/resources/static/search/rss.svg create mode 100644 code/services-application/search-service-legacy/resources/static/search/serp.scss create mode 100644 code/services-application/search-service-legacy/resources/static/search/theme.js create mode 100644 code/services-application/search-service-legacy/resources/static/search/tts.js create mode 100644 code/services-application/search-service-legacy/resources/templates/search/browse-result.hdb create mode 100644 code/services-application/search-service-legacy/resources/templates/search/browse-results.hdb create mode 100644 code/services-application/search-service-legacy/resources/templates/search/conversion-results.hdb create mode 100644 code/services-application/search-service-legacy/resources/templates/search/dictionary-results.hdb create mode 100644 code/services-application/search-service-legacy/resources/templates/search/error-page-search.hdb create mode 100644 code/services-application/search-service-legacy/resources/templates/search/error-page.hdb create mode 100644 code/services-application/search-service-legacy/resources/templates/search/index/index-about.hdb create mode 100644 code/services-application/search-service-legacy/resources/templates/search/index/index-news.hdb create mode 100644 code/services-application/search-service-legacy/resources/templates/search/index/index-redesign.hdb create mode 100644 code/services-application/search-service-legacy/resources/templates/search/index/index-tips.hdb create mode 100644 
code/services-application/search-service-legacy/resources/templates/search/index/index.hdb create mode 100644 code/services-application/search-service-legacy/resources/templates/search/parts/search-filters.hdb create mode 100644 code/services-application/search-service-legacy/resources/templates/search/parts/search-footer.hdb create mode 100644 code/services-application/search-service-legacy/resources/templates/search/parts/search-form.hdb create mode 100644 code/services-application/search-service-legacy/resources/templates/search/parts/search-header.hdb create mode 100644 code/services-application/search-service-legacy/resources/templates/search/parts/search-result-rest.hdb create mode 100644 code/services-application/search-service-legacy/resources/templates/search/parts/search-result.hdb create mode 100644 code/services-application/search-service-legacy/resources/templates/search/search-results.hdb create mode 100644 code/services-application/search-service-legacy/resources/templates/search/site-info/site-crosstalk.hdb create mode 100644 code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-feed.hdb create mode 100644 code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-index-blacklisted.hdb create mode 100644 code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-index-indexed.hdb create mode 100644 code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-index-suggest.hdb create mode 100644 code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-index-unknown.hdb create mode 100644 code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-index.hdb create mode 100644 code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-links.hdb create mode 100644 
code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-report.hdb create mode 100644 code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-summary.hdb create mode 100644 code/services-application/search-service-legacy/resources/templates/search/site-info/site-info.hdb create mode 100644 code/services-application/search-service-legacy/test/nu/marginalia/search/command/commands/BangCommandTest.java create mode 100644 code/services-application/search-service-legacy/test/nu/marginalia/search/paperdoll/SearchServicePaperDoll.java create mode 100644 code/services-application/search-service-legacy/test/nu/marginalia/util/TestLanguageModels.java diff --git a/code/services-application/search-service-legacy/build.gradle b/code/services-application/search-service-legacy/build.gradle new file mode 100644 index 00000000..d71bb6f9 --- /dev/null +++ b/code/services-application/search-service-legacy/build.gradle @@ -0,0 +1,94 @@ +plugins { + id 'java' + id 'io.freefair.sass-base' version '8.4' + id 'io.freefair.sass-java' version '8.4' + id 'application' + id 'jvm-test-suite' + + id 'com.google.cloud.tools.jib' version '3.4.3' +} + +application { + mainClass = 'nu.marginalia.search.SearchMain' + applicationName = 'search-service-legacy' +} + +tasks.distZip.enabled = false + + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) + } +} +sass { + sourceMapEnabled = true + sourceMapEmbed = true + outputStyle = EXPANDED +} + +apply from: "$rootProject.projectDir/srcsets.gradle" +apply from: "$rootProject.projectDir/docker.gradle" + +dependencies { + implementation project(':code:common:db') + implementation project(':code:common:model') + implementation project(':code:common:service') + implementation project(':code:common:config') + implementation project(':code:index:query') + + implementation project(':code:libraries:easy-lsh') + implementation 
project(':code:libraries:language-processing') + implementation project(':code:libraries:braille-block-punch-cards') + implementation project(':code:libraries:term-frequency-dict') + + implementation project(':code:functions:live-capture:api') + implementation project(':code:functions:math:api') + implementation project(':code:functions:domain-info:api') + implementation project(':code:functions:search-query:api') + + + implementation project(':code:index:api') + implementation project(':code:common:renderer') + + implementation project(':code:features-search:screenshots') + implementation project(':code:features-search:random-websites') + + implementation libs.bundles.slf4j + + implementation libs.roaringbitmap + implementation libs.prometheus + implementation libs.notnull + implementation libs.guava + implementation dependencies.create(libs.guice.get()) { + exclude group: 'com.google.guava' + } + implementation libs.handlebars + implementation dependencies.create(libs.spark.get()) { + exclude group: 'org.eclipse.jetty' + } + implementation libs.bundles.jetty + implementation libs.opencsv + implementation libs.trove + implementation libs.fastutil + implementation libs.bundles.gson + implementation libs.bundles.mariadb + implementation libs.bundles.nlp + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito + + testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4') + testImplementation libs.commons.codec + testImplementation 'org.testcontainers:mariadb:1.17.4' + testImplementation 'org.testcontainers:junit-jupiter:1.17.4' + testImplementation project(':code:libraries:test-helpers') +} + +tasks.register('paperDoll', Test) { + useJUnitPlatform { + includeTags "paperdoll" + } + jvmArgs = [ '-DrunPaperDoll=true', '--enable-preview' ] +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/SearchHandlebarsConfigurator.java 
b/code/services-application/search-service-legacy/java/nu/marginalia/search/SearchHandlebarsConfigurator.java new file mode 100644 index 00000000..8e9f38eb --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/SearchHandlebarsConfigurator.java @@ -0,0 +1,12 @@ +package nu.marginalia.search; + +import com.github.jknack.handlebars.Handlebars; +import nu.marginalia.renderer.config.HandlebarsConfigurator; + +public class SearchHandlebarsConfigurator implements HandlebarsConfigurator { + + @Override + public void configure(Handlebars handlebars) { + + } +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/SearchMain.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/SearchMain.java new file mode 100644 index 00000000..37b9893d --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/SearchMain.java @@ -0,0 +1,47 @@ +package nu.marginalia.search; + +import com.google.inject.Guice; +import com.google.inject.Inject; +import com.google.inject.Injector; +import nu.marginalia.service.MainClass; +import nu.marginalia.service.discovery.ServiceRegistryIf; +import nu.marginalia.service.module.ServiceConfiguration; +import nu.marginalia.service.module.ServiceDiscoveryModule; +import nu.marginalia.service.ServiceId; +import nu.marginalia.service.module.ServiceConfigurationModule; +import nu.marginalia.service.module.DatabaseModule; +import nu.marginalia.service.server.Initialization; +import spark.Spark; + +public class SearchMain extends MainClass { + private final SearchService service; + + @Inject + public SearchMain(SearchService service) { + this.service = service; + } + + public static void main(String... 
args) { + + init(ServiceId.Search, args); + + Spark.staticFileLocation("/static/search/"); + + Injector injector = Guice.createInjector( + new SearchModule(), + new ServiceConfigurationModule(ServiceId.Search), + new ServiceDiscoveryModule(), + new DatabaseModule(false) + ); + + + // Orchestrate the boot order for the services + var registry = injector.getInstance(ServiceRegistryIf.class); + var configuration = injector.getInstance(ServiceConfiguration.class); + orchestrateBoot(registry, configuration); + + injector.getInstance(SearchMain.class); + injector.getInstance(Initialization.class).setReady(); + + } +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/SearchModule.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/SearchModule.java new file mode 100644 index 00000000..5e55aa2a --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/SearchModule.java @@ -0,0 +1,20 @@ +package nu.marginalia.search; + +import com.google.inject.AbstractModule; +import nu.marginalia.LanguageModels; +import nu.marginalia.WebsiteUrl; +import nu.marginalia.WmsaHome; +import nu.marginalia.renderer.config.HandlebarsConfigurator; + +public class SearchModule extends AbstractModule { + + public void configure() { + bind(HandlebarsConfigurator.class).to(SearchHandlebarsConfigurator.class); + + bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels()); + + bind(WebsiteUrl.class).toInstance(new WebsiteUrl( + System.getProperty("search.websiteUrl", "https://search.marginalia.nu/"))); + } + +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/SearchOperator.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/SearchOperator.java new file mode 100644 index 00000000..9a86db64 --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/SearchOperator.java @@ -0,0 +1,266 @@ +package 
nu.marginalia.search; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.WebsiteUrl; +import nu.marginalia.api.math.MathClient; +import nu.marginalia.api.searchquery.QueryClient; +import nu.marginalia.api.searchquery.model.query.QueryResponse; +import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; +import nu.marginalia.bbpc.BrailleBlockPunchCards; +import nu.marginalia.db.DbDomainQueries; +import nu.marginalia.index.query.limit.QueryLimits; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.DomainIndexingState; +import nu.marginalia.search.command.SearchParameters; +import nu.marginalia.search.model.ClusteredUrlDetails; +import nu.marginalia.search.model.DecoratedSearchResults; +import nu.marginalia.search.model.SearchFilters; +import nu.marginalia.search.model.UrlDetails; +import nu.marginalia.search.results.UrlDeduplicator; +import nu.marginalia.search.svc.SearchQueryCountService; +import nu.marginalia.search.svc.SearchUnitConversionService; +import org.apache.logging.log4j.util.Strings; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.slf4j.Marker; +import org.slf4j.MarkerFactory; + +import javax.annotation.Nullable; +import java.time.Duration; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; +import java.util.stream.IntStream; + +@Singleton +public class SearchOperator { + + private static final Logger logger = LoggerFactory.getLogger(SearchOperator.class); + + // Marker for filtering out sensitive content from the persistent logs + private final Marker queryMarker = MarkerFactory.getMarker("QUERY"); + + private final MathClient mathClient; + private final DbDomainQueries domainQueries; + private final QueryClient queryClient; + private final 
SearchQueryParamFactory paramFactory; + private final WebsiteUrl websiteUrl; + private final SearchUnitConversionService searchUnitConversionService; + private final SearchQueryCountService searchVisitorCount; + + + @Inject + public SearchOperator(MathClient mathClient, + DbDomainQueries domainQueries, + QueryClient queryClient, + SearchQueryParamFactory paramFactory, + WebsiteUrl websiteUrl, + SearchUnitConversionService searchUnitConversionService, + SearchQueryCountService searchVisitorCount + ) + { + + this.mathClient = mathClient; + this.domainQueries = domainQueries; + this.queryClient = queryClient; + this.paramFactory = paramFactory; + this.websiteUrl = websiteUrl; + this.searchUnitConversionService = searchUnitConversionService; + this.searchVisitorCount = searchVisitorCount; + } + + public List doSiteSearch(String domain, + int domainId, + int count) { + + var queryParams = paramFactory.forSiteSearch(domain, domainId, count); + var queryResponse = queryClient.search(queryParams); + + return getResultsFromQuery(queryResponse); + } + + public List doBacklinkSearch(String domain) { + + var queryParams = paramFactory.forBacklinkSearch(domain); + var queryResponse = queryClient.search(queryParams); + + return getResultsFromQuery(queryResponse); + } + + public List doLinkSearch(String source, String dest) { + var queryParams = paramFactory.forLinkSearch(source, dest); + var queryResponse = queryClient.search(queryParams); + + return getResultsFromQuery(queryResponse); + } + + public DecoratedSearchResults doSearch(SearchParameters userParams) throws InterruptedException { + // The full user-facing search query does additional work to try to evaluate the query + // e.g. as a unit conversion query. This is done in parallel with the regular search. 
+ + Future eval = searchUnitConversionService.tryEval(userParams.query()); + + // Perform the regular search + + var queryParams = paramFactory.forRegularSearch(userParams); + QueryResponse queryResponse = queryClient.search(queryParams); + var queryResults = getResultsFromQuery(queryResponse); + + // Cluster the results based on the query response + List clusteredResults = SearchResultClusterer + .selectStrategy(queryResponse) + .clusterResults(queryResults, 25); + + // Log the query and results + + logger.info(queryMarker, "Human terms: {}", Strings.join(queryResponse.searchTermsHuman(), ',')); + logger.info(queryMarker, "Search Result Count: {}", queryResults.size()); + + // Get the evaluation result and other data to return to the user + String evalResult = getFutureOrDefault(eval, ""); + + String focusDomain = queryResponse.domain(); + int focusDomainId = focusDomain == null + ? -1 + : domainQueries.tryGetDomainId(new EdgeDomain(focusDomain)).orElse(-1); + + List problems = getProblems(evalResult, queryResults, queryResponse); + + List resultPages = IntStream.rangeClosed(1, queryResponse.totalPages()) + .mapToObj(number -> new DecoratedSearchResults.Page( + number, + number == userParams.page(), + userParams.withPage(number).renderUrl(websiteUrl) + )) + .toList(); + + // Return the results to the user + return DecoratedSearchResults.builder() + .params(userParams) + .problems(problems) + .evalResult(evalResult) + .results(clusteredResults) + .filters(new SearchFilters(websiteUrl, userParams)) + .focusDomain(focusDomain) + .focusDomainId(focusDomainId) + .resultPages(resultPages) + .build(); + } + + + public List getResultsFromQuery(QueryResponse queryResponse) { + final QueryLimits limits = queryResponse.specs().queryLimits; + final UrlDeduplicator deduplicator = new UrlDeduplicator(limits.resultsByDomain()); + + // Update the query count (this is what you see on the front page) + searchVisitorCount.registerQuery(); + + return queryResponse.results().stream() 
+                .filter(deduplicator::shouldRetain)
+                .limit(limits.resultsTotal())
+                .map(SearchOperator::createDetails)
+                .toList();
+    }
+
+    private static UrlDetails createDetails(DecoratedSearchResultItem item) { // adapts one result item to UrlDetails; the constructor is positional, so argument order matters
+        return new UrlDetails(
+                item.documentId(),
+                item.domainId(),
+                cleanUrl(item.url), // nuisance domains are swapped for alternative front-ends
+                item.title,
+                item.description,
+                item.format,
+                item.features,
+                DomainIndexingState.ACTIVE, // every result is reported as ACTIVE here
+                item.rankingScore, // termScore
+                item.resultsFromDomain,
+                BrailleBlockPunchCards.printBits(item.bestPositions, 64),
+                Long.bitCount(item.bestPositions),
+                item.rawIndexResult,
+                item.rawIndexResult.keywordScores
+        );
+    }
+
+    /** Replace nuisance domains with replacements where available; a URL with no usable replacement is returned unchanged */
+    private static EdgeUrl cleanUrl(EdgeUrl url) {
+        String topdomain = url.domain.topDomain;
+        String subdomain = url.domain.subDomain;
+        String path = url.path;
+
+        if (topdomain.equals("fandom.com")) {
+            int wikiIndex = path.indexOf("/wiki/");
+            if (wikiIndex >= 0) {
+                return new EdgeUrl("https", new EdgeDomain("breezewiki.com"), null, "/" + subdomain + path.substring(wikiIndex), null);
+            }
+        }
+        else if (topdomain.equals("medium.com")) {
+            if (!subdomain.isBlank()) {
+                return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, path, null);
+            }
+            else if (path.indexOf("/", 1) >= 0) { // guard: with no second '/', substring(-1) would throw and fail the whole result list
+                String article = path.substring(path.indexOf("/", 1)); // drop the first path segment, keep the rest
+                return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, article, null);
+            }
+
+        }
+        return url;
+    }
+
+    private List<String> getProblems(String evalResult, List<UrlDetails> queryResults, QueryResponse response) throws InterruptedException {
+        // Builds the list of hints shown with the results: problems reported in the response, plus rephrasing, spelling and define: tips
+        // We don't debug the query if it's a site search
+        if (response.domain() != null && !response.domain().isBlank())
+            return List.of();
+
+        final List<String> problems = new ArrayList<>(response.problems());
+
+        if (queryResults.size() <= 5 && (evalResult == null || evalResult.isBlank())) { // few hits and no eval answer; doSearch passes "" when there is none
+            problems.add("Try rephrasing the query, changing the word order or using synonyms to get different results.");
+
+            // Try to spell check the search terms
+            var suggestions = getFutureOrDefault(
+                    mathClient.spellCheck(response.searchTermsHuman()),
+                    Map.of()
+            );
+
+            suggestions.forEach((term, suggestion) -> {
+                if (suggestion.size() > 1) {
+                    String suggestionsStr = "\"%s\" could be spelled %s".formatted(term, suggestion.stream().map(s -> "\"" + s + "\"").collect(Collectors.joining(", ")));
+                    problems.add(suggestionsStr);
+                }
+            });
+        }
+
+        Set<String> representativeKeywords = response.getAllKeywords();
+        if (representativeKeywords.size() > 1 && (representativeKeywords.contains("definition") || representativeKeywords.contains("define") || representativeKeywords.contains("meaning")))
+        {
+            problems.add("Tip: Try using a query that looks like define:word if you want a dictionary definition");
+        }
+
+        return problems;
+    }
+
+    private <T> T getFutureOrDefault(@Nullable Future<T> fut, T defaultValue) { // convenience overload with a 50 ms wait
+        return getFutureOrDefault(fut, Duration.ofMillis(50), defaultValue);
+    }
+
+    private <T> T getFutureOrDefault(@Nullable Future<T> fut, Duration timeout, T defaultValue) { // bounded wait; never throws
+        if (fut == null || fut.isCancelled()) {
+            return defaultValue;
+        }
+        try {
+            return fut.get(timeout.toMillis(), TimeUnit.MILLISECONDS);
+        }
+        catch (Exception ex) { // any failure from get(), including a timeout, falls back to the default
+            logger.warn("Error fetching eval result", ex);
+            return defaultValue;
+        }
+    }
+
+}
diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/SearchQueryParamFactory.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/SearchQueryParamFactory.java
new file mode 100644
index 00000000..6852423a
--- /dev/null
+++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/SearchQueryParamFactory.java
@@ -0,0 +1,104 @@
+package nu.marginalia.search;
+
+import nu.marginalia.api.searchquery.model.query.QueryParams;
+import nu.marginalia.api.searchquery.model.query.SearchQuery;
+import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
+import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
+import nu.marginalia.index.query.limit.QueryLimits;
+import
nu.marginalia.index.query.limit.QueryStrategy; +import nu.marginalia.index.query.limit.SpecificationLimit; +import nu.marginalia.search.command.SearchParameters; + +import java.util.List; + +public class SearchQueryParamFactory { + + public QueryParams forRegularSearch(SearchParameters userParams) { + SearchQuery prototype = new SearchQuery(); + var profile = userParams.profile(); + + profile.addTacitTerms(prototype); + userParams.js().addTacitTerms(prototype); + userParams.adtech().addTacitTerms(prototype); + + return new QueryParams( + userParams.query(), + null, + prototype.searchTermsInclude, + prototype.searchTermsExclude, + prototype.searchTermsPriority, + prototype.searchTermsAdvice, + profile.getQualityLimit(), + profile.getYearLimit(), + profile.getSizeLimit(), + SpecificationLimit.none(), + List.of(), + new QueryLimits(5, 100, 200, 8192), + profile.searchSetIdentifier.name(), + userParams.strategy(), + userParams.temporalBias(), + userParams.page() + ); + + } + + public QueryParams forSiteSearch(String domain, int domainId, int count) { + return new QueryParams("site:"+domain, + null, + List.of(), + List.of(), + List.of(), + List.of(), + SpecificationLimit.none(), + SpecificationLimit.none(), + SpecificationLimit.none(), + SpecificationLimit.none(), + List.of(domainId), + new QueryLimits(count, count, 100, 512), + SearchSetIdentifier.NONE.name(), + QueryStrategy.AUTO, + ResultRankingParameters.TemporalBias.NONE, + 1 + ); + } + + public QueryParams forBacklinkSearch(String domain) { + return new QueryParams("links:"+domain, + null, + List.of(), + List.of(), + List.of(), + List.of(), + SpecificationLimit.none(), + SpecificationLimit.none(), + SpecificationLimit.none(), + SpecificationLimit.none(), + List.of(), + new QueryLimits(100, 100, 100, 512), + SearchSetIdentifier.NONE.name(), + QueryStrategy.AUTO, + ResultRankingParameters.TemporalBias.NONE, + 1 + ); + } + + public QueryParams forLinkSearch(String sourceDomain, String destDomain) { + return new 
QueryParams("site:" + sourceDomain + " links:" + destDomain, + null, + List.of(), + List.of(), + List.of(), + List.of(), + SpecificationLimit.none(), + SpecificationLimit.none(), + SpecificationLimit.none(), + SpecificationLimit.none(), + List.of(), + new QueryLimits(100, 100, 100, 512), + SearchSetIdentifier.NONE.name(), + QueryStrategy.AUTO, + ResultRankingParameters.TemporalBias.NONE, + 1 + ); + } +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/SearchResultClusterer.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/SearchResultClusterer.java new file mode 100644 index 00000000..4e4cd086 --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/SearchResultClusterer.java @@ -0,0 +1,53 @@ +package nu.marginalia.search; + +import nu.marginalia.api.searchquery.model.query.QueryResponse; +import nu.marginalia.search.model.ClusteredUrlDetails; +import nu.marginalia.search.model.UrlDetails; + +import java.util.List; +import java.util.stream.Collectors; + +/** Functions for clustering search results */ +public class SearchResultClusterer { + private SearchResultClusterer() {} + + public interface SearchResultClusterStrategy { + List clusterResults(List results, int total); + } + + public static SearchResultClusterStrategy selectStrategy(QueryResponse response) { + if (response.domain() != null && !response.domain().isBlank()) + return SearchResultClusterer::noOp; + + return SearchResultClusterer::byDomain; + } + + /** No clustering, just return the results as is */ + private static List noOp(List results, int total) { + if (results.isEmpty()) + return List.of(); + + return results.stream() + .map(ClusteredUrlDetails::new) + .toList(); + } + + /** Cluster the results by domain, and return the top "total" clusters + * sorted by the relevance of the best result + */ + private static List byDomain(List results, int total) { + if (results.isEmpty()) + return 
List.of(); + + return results.stream() + .collect( + Collectors.groupingBy(details -> details.domainId) + ) + .values().stream() + .map(ClusteredUrlDetails::new) + .sorted() + .limit(total) + .toList(); + } + +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/SearchService.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/SearchService.java new file mode 100644 index 00000000..a6452cda --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/SearchService.java @@ -0,0 +1,128 @@ +package nu.marginalia.search; + +import com.google.inject.Inject; +import io.prometheus.client.Counter; +import io.prometheus.client.Histogram; +import nu.marginalia.WebsiteUrl; +import nu.marginalia.search.svc.*; +import nu.marginalia.service.server.BaseServiceParams; +import nu.marginalia.service.server.SparkService; +import nu.marginalia.service.server.StaticResources; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import spark.Request; +import spark.Response; +import spark.Route; +import spark.Spark; + +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; + +public class SearchService extends SparkService { + + private final WebsiteUrl websiteUrl; + private final StaticResources staticResources; + + private static final Logger logger = LoggerFactory.getLogger(SearchService.class); + private static final Histogram wmsa_search_service_request_time = Histogram.build() + .name("wmsa_search_service_request_time") + .linearBuckets(0.05, 0.05, 15) + .labelNames("matchedPath", "method") + .help("Search service request time (seconds)") + .register(); + private static final Counter wmsa_search_service_error_count = Counter.build() + .name("wmsa_search_service_error_count") + .labelNames("matchedPath", "method") + .help("Search service error count") + .register(); + + @Inject + public SearchService(BaseServiceParams params, + WebsiteUrl websiteUrl, + 
StaticResources staticResources, + SearchFrontPageService frontPageService, + SearchErrorPageService errorPageService, + SearchAddToCrawlQueueService addToCrawlQueueService, + SearchSiteInfoService siteInfoService, + SearchCrosstalkService crosstalkService, + SearchQueryService searchQueryService) + throws Exception + { + super(params); + + this.websiteUrl = websiteUrl; + this.staticResources = staticResources; + + Spark.staticFiles.expireTime(600); + + SearchServiceMetrics.get("/search", searchQueryService::pathSearch); + + SearchServiceMetrics.get("/", frontPageService::render); + SearchServiceMetrics.get("/news.xml", frontPageService::renderNewsFeed); + SearchServiceMetrics.get("/:resource", this::serveStatic); + + SearchServiceMetrics.post("/site/suggest/", addToCrawlQueueService::suggestCrawling); + + SearchServiceMetrics.get("/site-search/:site/*", this::siteSearchRedir); + + SearchServiceMetrics.get("/site/:site", siteInfoService::handle); + SearchServiceMetrics.post("/site/:site", siteInfoService::handlePost); + + SearchServiceMetrics.get("/crosstalk/", crosstalkService::handle); + + Spark.exception(Exception.class, (e,p,q) -> { + logger.error("Error during processing", e); + wmsa_search_service_error_count.labels(p.pathInfo(), p.requestMethod()).inc(); + errorPageService.serveError(p, q); + }); + + Spark.awaitInitialization(); + } + + + + /** Wraps a route with a timer and a counter */ + private static class SearchServiceMetrics implements Route { + private final Route delegatedRoute; + + static void get(String path, Route route) { + Spark.get(path, new SearchServiceMetrics(route)); + } + static void post(String path, Route route) { + Spark.post(path, new SearchServiceMetrics(route)); + } + + private SearchServiceMetrics(Route delegatedRoute) { + this.delegatedRoute = delegatedRoute; + } + + @Override + public Object handle(Request request, Response response) throws Exception { + return wmsa_search_service_request_time + .labels(request.matchedPath(), 
request.requestMethod()) + .time(() -> delegatedRoute.handle(request, response)); + } + } + + private Object serveStatic(Request request, Response response) { + String resource = request.params("resource"); + staticResources.serveStatic("search", resource, request, response); + return ""; + } + + private Object siteSearchRedir(Request request, Response response) { + final String site = request.params("site"); + final String searchTerms; + + if (request.splat().length == 0) searchTerms = ""; + else searchTerms = request.splat()[0]; + + final String query = URLEncoder.encode(String.format("%s site:%s", searchTerms, site), StandardCharsets.UTF_8).trim(); + final String profile = request.queryParamOrDefault("profile", "yolo"); + + response.redirect(websiteUrl.withPath("search?query="+query+"&profile="+profile)); + + return ""; + } + +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/command/CommandEvaluator.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/command/CommandEvaluator.java new file mode 100644 index 00000000..eb352a93 --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/command/CommandEvaluator.java @@ -0,0 +1,43 @@ +package nu.marginalia.search.command; + +import com.google.inject.Inject; +import nu.marginalia.search.command.commands.*; +import spark.Response; + +import java.util.ArrayList; +import java.util.List; + +public class CommandEvaluator { + + private final List specialCommands = new ArrayList<>(); + private final SearchCommand defaultCommand; + + @Inject + public CommandEvaluator( + BrowseCommand browse, + ConvertCommand convert, + DefinitionCommand define, + BangCommand bang, + SiteRedirectCommand siteRedirect, + SearchCommand search + ) { + specialCommands.add(browse); + specialCommands.add(convert); + specialCommands.add(define); + specialCommands.add(bang); + specialCommands.add(siteRedirect); + + defaultCommand = search; + } + + 
public Object eval(Response response, SearchParameters parameters) { + for (var cmd : specialCommands) { + var maybe = cmd.process(response, parameters); + if (maybe.isPresent()) + return maybe.get(); + } + + return defaultCommand.process(response, parameters).orElse(""); + } + +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/command/SearchAdtechParameter.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/command/SearchAdtechParameter.java new file mode 100644 index 00000000..ce3bf099 --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/command/SearchAdtechParameter.java @@ -0,0 +1,29 @@ +package nu.marginalia.search.command; + +import nu.marginalia.api.searchquery.model.query.SearchQuery; + +import javax.annotation.Nullable; +import java.util.Arrays; + +public enum SearchAdtechParameter { + DEFAULT("default"), + REDUCE("reduce", "special:ads", "special:affiliate"); + + public final String value; + public final String[] implictExcludeSearchTerms; + + SearchAdtechParameter(String value, String... 
implictExcludeSearchTerms) { + this.value = value; + this.implictExcludeSearchTerms = implictExcludeSearchTerms; + } + + public static SearchAdtechParameter parse(@Nullable String value) { + if (REDUCE.value.equals(value)) return REDUCE; + + return DEFAULT; + } + + public void addTacitTerms(SearchQuery subquery) { + subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms)); + } +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/command/SearchCommandInterface.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/command/SearchCommandInterface.java new file mode 100644 index 00000000..d69bacbd --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/command/SearchCommandInterface.java @@ -0,0 +1,10 @@ +package nu.marginalia.search.command; + + +import spark.Response; + +import java.util.Optional; + +public interface SearchCommandInterface { + Optional process(Response response, SearchParameters parameters); +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/command/SearchJsParameter.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/command/SearchJsParameter.java new file mode 100644 index 00000000..8cf6aada --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/command/SearchJsParameter.java @@ -0,0 +1,31 @@ +package nu.marginalia.search.command; + +import nu.marginalia.api.searchquery.model.query.SearchQuery; + +import javax.annotation.Nullable; +import java.util.Arrays; + +public enum SearchJsParameter { + DEFAULT("default"), + DENY_JS("no-js", "js:true"), + REQUIRE_JS("yes-js", "js:false"); + + public final String value; + public final String[] implictExcludeSearchTerms; + + SearchJsParameter(String value, String... 
implictExcludeSearchTerms) { + this.value = value; + this.implictExcludeSearchTerms = implictExcludeSearchTerms; + } + + public static SearchJsParameter parse(@Nullable String value) { + if (DENY_JS.value.equals(value)) return DENY_JS; + if (REQUIRE_JS.value.equals(value)) return REQUIRE_JS; + + return DEFAULT; + } + + public void addTacitTerms(SearchQuery subquery) { + subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms)); + } +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/command/SearchParameters.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/command/SearchParameters.java new file mode 100644 index 00000000..c10d0092 --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/command/SearchParameters.java @@ -0,0 +1,106 @@ +package nu.marginalia.search.command; + +import nu.marginalia.WebsiteUrl; +import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; +import nu.marginalia.index.query.limit.QueryStrategy; +import nu.marginalia.index.query.limit.SpecificationLimit; +import nu.marginalia.search.model.SearchProfile; +import spark.Request; + +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; +import java.util.Objects; + +import static nu.marginalia.search.command.SearchRecentParameter.RECENT; + +public record SearchParameters(String query, + SearchProfile profile, + SearchJsParameter js, + SearchRecentParameter recent, + SearchTitleParameter searchTitle, + SearchAdtechParameter adtech, + boolean newFilter, + int page + ) { + + public SearchParameters(String queryString, Request request) { + this( + queryString, + SearchProfile.getSearchProfile(request.queryParams("profile")), + SearchJsParameter.parse(request.queryParams("js")), + SearchRecentParameter.parse(request.queryParams("recent")), + SearchTitleParameter.parse(request.queryParams("searchTitle")), + 
SearchAdtechParameter.parse(request.queryParams("adtech")), + "true".equals(request.queryParams("newfilter")), + Integer.parseInt(Objects.requireNonNullElse(request.queryParams("page"), "1")) + ); + } + + public String profileStr() { + return profile.filterId; + } + + public SearchParameters withProfile(SearchProfile profile) { + return new SearchParameters(query, profile, js, recent, searchTitle, adtech, true, page); + } + + public SearchParameters withJs(SearchJsParameter js) { + return new SearchParameters(query, profile, js, recent, searchTitle, adtech, true, page); + } + public SearchParameters withAdtech(SearchAdtechParameter adtech) { + return new SearchParameters(query, profile, js, recent, searchTitle, adtech, true, page); + } + + public SearchParameters withRecent(SearchRecentParameter recent) { + return new SearchParameters(query, profile, js, recent, searchTitle, adtech, true, page); + } + + public SearchParameters withTitle(SearchTitleParameter title) { + return new SearchParameters(query, profile, js, recent, title, adtech, true, page); + } + + public SearchParameters withPage(int page) { + return new SearchParameters(query, profile, js, recent, searchTitle, adtech, false, page); + } + + public String renderUrl(WebsiteUrl baseUrl) { + String path = String.format("/search?query=%s&profile=%s&js=%s&adtech=%s&recent=%s&searchTitle=%s&newfilter=%s&page=%d", + URLEncoder.encode(query, StandardCharsets.UTF_8), + URLEncoder.encode(profile.filterId, StandardCharsets.UTF_8), + URLEncoder.encode(js.value, StandardCharsets.UTF_8), + URLEncoder.encode(adtech.value, StandardCharsets.UTF_8), + URLEncoder.encode(recent.value, StandardCharsets.UTF_8), + URLEncoder.encode(searchTitle.value, StandardCharsets.UTF_8), + Boolean.valueOf(newFilter).toString(), + page + ); + + return baseUrl.withPath(path); + } + + public ResultRankingParameters.TemporalBias temporalBias() { + if (recent == RECENT) { + return ResultRankingParameters.TemporalBias.RECENT; + } + else if 
(profile == SearchProfile.VINTAGE) { + return ResultRankingParameters.TemporalBias.OLD; + } + + return ResultRankingParameters.TemporalBias.NONE; + } + + public QueryStrategy strategy() { + if (searchTitle == SearchTitleParameter.TITLE) { + return QueryStrategy.REQUIRE_FIELD_TITLE; + } + + return QueryStrategy.AUTO; + } + + public SpecificationLimit yearLimit() { + if (recent == RECENT) + return SpecificationLimit.greaterThan(2018); + + return profile.getYearLimit(); + } +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/command/SearchRecentParameter.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/command/SearchRecentParameter.java new file mode 100644 index 00000000..9b1223b5 --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/command/SearchRecentParameter.java @@ -0,0 +1,21 @@ +package nu.marginalia.search.command; + +import javax.annotation.Nullable; + +public enum SearchRecentParameter { + DEFAULT("default"), + RECENT("recent"); + + public final String value; + + SearchRecentParameter(String value) { + this.value = value; + } + + public static SearchRecentParameter parse(@Nullable String value) { + if (RECENT.value.equals(value)) return RECENT; + + return DEFAULT; + } + +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/command/SearchTitleParameter.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/command/SearchTitleParameter.java new file mode 100644 index 00000000..ca1f4ccb --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/command/SearchTitleParameter.java @@ -0,0 +1,21 @@ +package nu.marginalia.search.command; + +import javax.annotation.Nullable; + +public enum SearchTitleParameter { + DEFAULT("default"), + TITLE("title"); + + public final String value; + + SearchTitleParameter(String value) { + this.value = value; + } + + public 
static SearchTitleParameter parse(@Nullable String value) { + if (TITLE.value.equals(value)) return TITLE; + + return DEFAULT; + } + +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/command/commands/BangCommand.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/command/commands/BangCommand.java new file mode 100644 index 00000000..049456e7 --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/command/commands/BangCommand.java @@ -0,0 +1,104 @@ +package nu.marginalia.search.command.commands; + +import com.google.inject.Inject; +import nu.marginalia.search.command.SearchCommandInterface; +import nu.marginalia.search.command.SearchParameters; +import nu.marginalia.search.exceptions.RedirectException; +import spark.Response; + +import java.net.URLEncoder; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.Map; +import java.util.Optional; + +public class BangCommand implements SearchCommandInterface { + private final Map bangsToPattern = new HashMap<>(); + + @Inject + public BangCommand() + { + bangsToPattern.put("!g", "https://www.google.com/search?q=%s"); + bangsToPattern.put("!ddg", "https://duckduckgo.com/?q=%s"); + bangsToPattern.put("!w", "https://search.marginalia.nu/search?query=%s+site:en.wikipedia.org&profile=wiki"); + } + + @Override + public Optional process(Response response, SearchParameters parameters) { + + for (var entry : bangsToPattern.entrySet()) { + String bangPattern = entry.getKey(); + String redirectPattern = entry.getValue(); + + var match = matchBangPattern(parameters.query(), bangPattern); + + if (match.isPresent()) { + var url = String.format(redirectPattern, URLEncoder.encode(match.get(), StandardCharsets.UTF_8)); + throw new RedirectException(url); + } + } + + return Optional.empty(); + } + + /** If the query contains the bang pattern bangKey, return the query with the bang pattern removed. 
*/ + Optional matchBangPattern(String query, String bangKey) { + var bm = new BangMatcher(query); + + while (bm.findNext(bangKey)) { + + if (!bm.isRelativeSpaceOrInvalid(-1)) + continue; + if (!bm.isRelativeSpaceOrInvalid(bangKey.length())) + continue; + + String prefix = bm.prefix().trim(); + String suffix = bm.suffix(bangKey.length()).trim(); + + String ret = (prefix + " " + suffix).trim(); + + return Optional.of(ret) + .filter(s -> !s.isBlank()); + } + + return Optional.empty(); + } + + private static class BangMatcher { + private final String str; + private int pos; + + public String prefix() { + return str.substring(0, pos); + } + + public String suffix(int offset) { + if (pos+offset < str.length()) + return str.substring(pos + offset); + return ""; + } + + public BangMatcher(String str) { + this.str = str; + this.pos = -1; + } + + public boolean findNext(String pattern) { + if (pos + 1 >= str.length()) + return false; + + return (pos = str.indexOf(pattern, pos + 1)) >= 0; + } + + public boolean isRelativeSpaceOrInvalid(int offset) { + if (offset + pos < 0) + return true; + if (offset + pos >= str.length()) + return true; + + return Character.isSpaceChar(str.charAt(offset + pos)); + } + + } + +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/command/commands/BrowseCommand.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/command/commands/BrowseCommand.java new file mode 100644 index 00000000..a889ec3d --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/command/commands/BrowseCommand.java @@ -0,0 +1,78 @@ +package nu.marginalia.search.command.commands; + +import com.google.inject.Inject; +import nu.marginalia.browse.model.BrowseResultSet; +import nu.marginalia.renderer.MustacheRenderer; +import nu.marginalia.renderer.RendererFactory; +import nu.marginalia.search.command.SearchCommandInterface; +import nu.marginalia.search.command.SearchParameters; 
+import nu.marginalia.search.svc.SearchBrowseService; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import spark.Response; + +import java.io.IOException; +import java.util.Map; +import java.util.Optional; +import java.util.function.Predicate; +import java.util.regex.Pattern; + +public class BrowseCommand implements SearchCommandInterface { + private final SearchBrowseService browseService; + private final MustacheRenderer browseResultsRenderer; + private final Logger logger = LoggerFactory.getLogger(getClass()); + private final Predicate queryPatternPredicate = Pattern.compile("^browse:[.A-Za-z\\-0-9:]+$").asPredicate(); + + @Inject + public BrowseCommand(SearchBrowseService browseService, + RendererFactory rendererFactory) + throws IOException + { + this.browseService = browseService; + + browseResultsRenderer = rendererFactory.renderer("search/browse-results"); + } + + @Override + public Optional process(Response response, SearchParameters parameters) { + if (!queryPatternPredicate.test(parameters.query())) { + return Optional.empty(); + } + + var model = browseSite(parameters.query()); + + if (null == model) + return Optional.empty(); + + return Optional.of(browseResultsRenderer.render(model, + Map.of("query", parameters.query(), + "profile", parameters.profileStr(), + "focusDomain", model.focusDomain()) + )); + } + + + private BrowseResultSet browseSite(String humanQuery) { + String definePrefix = "browse:"; + String word = humanQuery.substring(definePrefix.length()).toLowerCase(); + + try { + if ("random".equals(word)) { + return browseService.getRandomEntries(0); + } + if (word.startsWith("random:")) { + int set = Integer.parseInt(word.split(":")[1]); + return browseService.getRandomEntries(set); + } + else { + return browseService.getRelatedEntries(word); + } + } + catch (Exception ex) { + logger.info("No Results"); + return null; + } + } + + +} diff --git 
a/code/services-application/search-service-legacy/java/nu/marginalia/search/command/commands/ConvertCommand.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/command/commands/ConvertCommand.java new file mode 100644 index 00000000..17780f48 --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/command/commands/ConvertCommand.java @@ -0,0 +1,36 @@ +package nu.marginalia.search.command.commands; + +import com.google.inject.Inject; +import nu.marginalia.renderer.MustacheRenderer; +import nu.marginalia.renderer.RendererFactory; +import nu.marginalia.search.command.SearchCommandInterface; +import nu.marginalia.search.command.SearchParameters; +import nu.marginalia.search.svc.SearchUnitConversionService; +import spark.Response; + +import java.io.IOException; +import java.util.Map; +import java.util.Optional; + +public class ConvertCommand implements SearchCommandInterface { + private final SearchUnitConversionService searchUnitConversionService; + private final MustacheRenderer> conversionRenderer; + + @Inject + public ConvertCommand(SearchUnitConversionService searchUnitConversionService, RendererFactory rendererFactory) throws IOException { + this.searchUnitConversionService = searchUnitConversionService; + + conversionRenderer = rendererFactory.renderer("search/conversion-results"); + } + + @Override + public Optional process(Response response, SearchParameters parameters) { + var conversion = searchUnitConversionService.tryConversion(parameters.query()); + return conversion.map(s -> conversionRenderer.render(Map.of( + "query", parameters.query(), + "result", s, + "profile", parameters.profileStr()) + )); + + } +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/command/commands/DefinitionCommand.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/command/commands/DefinitionCommand.java new file mode 100644 index 00000000..3025497f 
--- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/command/commands/DefinitionCommand.java @@ -0,0 +1,70 @@ + +package nu.marginalia.search.command.commands; + +import com.google.inject.Inject; +import nu.marginalia.api.math.MathClient; +import nu.marginalia.api.math.model.DictionaryResponse; +import nu.marginalia.renderer.MustacheRenderer; +import nu.marginalia.search.command.SearchCommandInterface; +import nu.marginalia.search.command.SearchParameters; +import nu.marginalia.renderer.RendererFactory; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import spark.Response; + +import java.io.IOException; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.TimeUnit; +import java.util.function.Predicate; +import java.util.regex.Pattern; + +public class DefinitionCommand implements SearchCommandInterface { + private final Logger logger = LoggerFactory.getLogger(getClass()); + + private final MustacheRenderer dictionaryRenderer; + private final MathClient mathClient; + + + private final Predicate queryPatternPredicate = Pattern.compile("^define:[A-Za-z\\s-0-9]+$").asPredicate(); + + @Inject + public DefinitionCommand(RendererFactory rendererFactory, MathClient mathClient) + throws IOException + { + + dictionaryRenderer = rendererFactory.renderer("search/dictionary-results"); + this.mathClient = mathClient; + } + + @Override + public Optional process(Response response, SearchParameters parameters) { + if (!queryPatternPredicate.test(parameters.query())) { + return Optional.empty(); + } + + var results = lookupDefinition(parameters.query()); + + return Optional.of(dictionaryRenderer.render(results, + Map.of("query", parameters.query(), + "profile", parameters.profileStr()) + )); + } + + + private DictionaryResponse lookupDefinition(String humanQuery) { + String definePrefix = "define:"; + String word = humanQuery.substring(definePrefix.length()).toLowerCase(); + + try { + return mathClient + 
.dictionaryLookup(word) + .get(250, TimeUnit.MILLISECONDS); + } + catch (Exception e) { + logger.error("Failed to lookup definition for word: " + word, e); + + throw new RuntimeException(e); + } + } +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/command/commands/SearchCommand.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/command/commands/SearchCommand.java new file mode 100644 index 00000000..6684bf74 --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/command/commands/SearchCommand.java @@ -0,0 +1,39 @@ +package nu.marginalia.search.command.commands; + +import com.google.inject.Inject; +import nu.marginalia.renderer.MustacheRenderer; +import nu.marginalia.renderer.RendererFactory; +import nu.marginalia.search.SearchOperator; +import nu.marginalia.search.command.SearchCommandInterface; +import nu.marginalia.search.command.SearchParameters; +import nu.marginalia.search.model.DecoratedSearchResults; +import spark.Response; + +import java.io.IOException; +import java.util.Optional; + +public class SearchCommand implements SearchCommandInterface { + private final SearchOperator searchOperator; + private final MustacheRenderer searchResultsRenderer; + + + @Inject + public SearchCommand(SearchOperator searchOperator, + RendererFactory rendererFactory) throws IOException { + this.searchOperator = searchOperator; + + searchResultsRenderer = rendererFactory.renderer("search/search-results"); + } + + @Override + public Optional process(Response response, SearchParameters parameters) { + try { + DecoratedSearchResults results = searchOperator.doSearch(parameters); + return Optional.of(searchResultsRenderer.render(results)); + } + catch (InterruptedException ex) { + Thread.currentThread().interrupt(); + return Optional.empty(); + } + } +} diff --git 
a/code/services-application/search-service-legacy/java/nu/marginalia/search/command/commands/SiteRedirectCommand.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/command/commands/SiteRedirectCommand.java new file mode 100644 index 00000000..902ed025 --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/command/commands/SiteRedirectCommand.java @@ -0,0 +1,50 @@ +package nu.marginalia.search.command.commands; + +import com.google.inject.Inject; +import nu.marginalia.search.command.SearchCommandInterface; +import nu.marginalia.search.command.SearchParameters; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import spark.Response; + +import java.util.Optional; +import java.util.function.Predicate; +import java.util.regex.Pattern; + +public class SiteRedirectCommand implements SearchCommandInterface { + + private final Logger logger = LoggerFactory.getLogger(getClass()); + + private final Predicate queryPatternPredicate = Pattern.compile("^(site|links):[.A-Za-z\\-0-9]+$").asPredicate(); + + @Inject + public SiteRedirectCommand() { + } + + @Override + public Optional process(Response response, SearchParameters parameters) { + if (!queryPatternPredicate.test(parameters.query())) { + return Optional.empty(); + } + + int idx = parameters.query().indexOf(':'); + String prefix = parameters.query().substring(0, idx); + String domain = parameters.query().substring(idx + 1).toLowerCase(); + + // Use an HTML redirect here, so we can use relative URLs + String view = switch (prefix) { + case "links" -> "links"; + default -> "info"; + }; + + return Optional.of(""" + + + + Redirecting... 
+ + """.formatted(domain, view) + ); + } + +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/db/DbNearDomainsQuery.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/db/DbNearDomainsQuery.java new file mode 100644 index 00000000..7e630823 --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/db/DbNearDomainsQuery.java @@ -0,0 +1,66 @@ +package nu.marginalia.search.db; + +import com.google.inject.Inject; +import com.zaxxer.hikari.HikariDataSource; + +import java.sql.ResultSet; +import java.util.ArrayList; +import java.util.List; +import java.util.function.Consumer; + +public class DbNearDomainsQuery { + + private final HikariDataSource dataSource; + + @Inject + public DbNearDomainsQuery(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + public List getRelatedDomains(String term, Consumer onProblem) { + List ret = new ArrayList<>(); + try (var conn = dataSource.getConnection(); + + var selfStmt = conn.prepareStatement(""" + SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=? + """); + var stmt = conn.prepareStatement(""" + SELECT NEIGHBOR_ID, ND.INDEXED, ND.STATE FROM EC_DOMAIN_NEIGHBORS_2 + INNER JOIN EC_DOMAIN ND ON ND.ID=NEIGHBOR_ID + WHERE DOMAIN_ID=? 
+ """)) { + ResultSet rsp; + selfStmt.setString(1, term); + rsp = selfStmt.executeQuery(); + int domainId = -1; + if (rsp.next()) { + domainId = rsp.getInt(1); + ret.add(domainId); + } + + stmt.setInt(1, domainId); + rsp = stmt.executeQuery(); + + while (rsp.next()) { + int id = rsp.getInt(1); + int indexed = rsp.getInt(2); + String state = rsp.getString(3); + + if (indexed > 0 && ("ACTIVE".equalsIgnoreCase(state) || "SOCIAL_MEDIA".equalsIgnoreCase(state) || "SPECIAL".equalsIgnoreCase(state))) { + ret.add(id); + } + } + + } + catch (Exception ex) { + throw new RuntimeException(ex); + } + + if (ret.isEmpty()) { + onProblem.accept("Could not find domains adjacent " + term); + } + + return ret; + } + +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/exceptions/RedirectException.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/exceptions/RedirectException.java new file mode 100644 index 00000000..eb04a4cb --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/exceptions/RedirectException.java @@ -0,0 +1,14 @@ +package nu.marginalia.search.exceptions; + +public class RedirectException extends RuntimeException { + public final String newUrl; + + public RedirectException(String newUrl) { + this.newUrl = newUrl; + } + + @Override + public StackTraceElement[] getStackTrace() { + return new StackTraceElement[0]; + } +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/model/ClusteredUrlDetails.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/model/ClusteredUrlDetails.java new file mode 100644 index 00000000..701a2c51 --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/model/ClusteredUrlDetails.java @@ -0,0 +1,102 @@ +package nu.marginalia.search.model; + +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.idx.WordFlags; +import 
org.jetbrains.annotations.NotNull; + +import java.util.*; + +/** A class to hold a list of UrlDetails, grouped by domain, where the first one is the main result + * and the rest are additional results, for summary display. */ +public class ClusteredUrlDetails implements Comparable { + + @NotNull + public final UrlDetails first; + + @NotNull + public final List rest; + + /** Create a new ClusteredUrlDetails from a collection of UrlDetails, + * with the best result as "first", and the others, in descending order + * of quality as the "rest"... + * + * @param details A collection of UrlDetails, which must not be empty. + */ + public ClusteredUrlDetails(Collection details) { + var items = new ArrayList<>(details); + + items.sort(Comparator.naturalOrder()); + + if (items.isEmpty()) + throw new IllegalArgumentException("Empty list of details"); + + this.first = items.removeFirst(); + this.rest = items; + + double bestScore = first.termScore; + double scoreLimit = Math.min(4.0, bestScore * 1.25); + + this.rest.removeIf(urlDetail -> { + if (urlDetail.termScore > scoreLimit) + return false; + + for (var keywordScore : urlDetail.resultItem.keywordScores) { + if (keywordScore.isKeywordSpecial()) + continue; + if (keywordScore.hasTermFlag(WordFlags.Title)) + return false; + if (keywordScore.hasTermFlag(WordFlags.ExternalLink)) + return false; + if (keywordScore.hasTermFlag(WordFlags.UrlDomain)) + return false; + if (keywordScore.hasTermFlag(WordFlags.UrlPath)) + return false; + if (keywordScore.hasTermFlag(WordFlags.Subjects)) + return false; + } + + return true; + }); + + } + + + public ClusteredUrlDetails(@NotNull UrlDetails onlyFirst) { + this.first = onlyFirst; + this.rest = Collections.emptyList(); + } + + // For renderer use, do not remove + public @NotNull UrlDetails getFirst() { + return first; + } + + // For renderer use, do not remove + public @NotNull List getRest() { + return rest; + } + + + public EdgeDomain getDomain() { + return first.url.getDomain(); + } + + 
public boolean hasMultiple() { + return !rest.isEmpty(); + } + + /** Returns the total number of results from the same domain, + * including such results that are not included here. */ + public int totalCount() { + return first.resultsFromSameDomain; + } + + public int remainingCount() { + return totalCount() - 1 - rest.size(); + } + + @Override + public int compareTo(@NotNull ClusteredUrlDetails o) { + return Objects.compare(first, o.first, UrlDetails::compareTo); + } +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/model/DecoratedSearchResults.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/model/DecoratedSearchResults.java new file mode 100644 index 00000000..87fc336a --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/model/DecoratedSearchResults.java @@ -0,0 +1,186 @@ +package nu.marginalia.search.model; + +import nu.marginalia.search.command.SearchParameters; + +import java.util.List; + +/** + * A class to hold details about the search results, + * as used by the handlebars templating engine to render + * the search results page. 
+ */ +public class DecoratedSearchResults { + private final SearchParameters params; + private final List problems; + private final String evalResult; + + public DecoratedSearchResults(SearchParameters params, + List problems, + String evalResult, + List results, + String focusDomain, + int focusDomainId, + SearchFilters filters, + List resultPages) { + this.params = params; + this.problems = problems; + this.evalResult = evalResult; + this.results = results; + this.focusDomain = focusDomain; + this.focusDomainId = focusDomainId; + this.filters = filters; + this.resultPages = resultPages; + } + + public final List results; + + public static DecoratedSearchResultsBuilder builder() { + return new DecoratedSearchResultsBuilder(); + } + + public SearchParameters getParams() { + return params; + } + + public List getProblems() { + return problems; + } + + public String getEvalResult() { + return evalResult; + } + + public List getResults() { + return results; + } + + public String getFocusDomain() { + return focusDomain; + } + + public int getFocusDomainId() { + return focusDomainId; + } + + public SearchFilters getFilters() { + return filters; + } + + public List getResultPages() { + return resultPages; + } + + private final String focusDomain; + private final int focusDomainId; + private final SearchFilters filters; + + private final List resultPages; + + public boolean isMultipage() { + return resultPages.size() > 1; + } + + public record Page(int number, boolean current, String href) { + } + + // These are used by the search form, they look unused in the IDE but are used by the mustache template, + // DO NOT REMOVE THEM + public int getResultCount() { + return results.size(); + } + + public String getQuery() { + return params.query(); + } + + public String getProfile() { + return params.profile().filterId; + } + + public String getJs() { + return params.js().value; + } + + public String getAdtech() { + return params.adtech().value; + } + + public String getRecent() 
{ + return params.recent().value; + } + + public String getSearchTitle() { + return params.searchTitle().value; + } + + public int page() { + return params.page(); + } + + public Boolean isNewFilter() { + return params.newFilter(); + } + + + public static class DecoratedSearchResultsBuilder { + private SearchParameters params; + private List problems; + private String evalResult; + private List results; + private String focusDomain; + private int focusDomainId; + private SearchFilters filters; + private List resultPages; + + DecoratedSearchResultsBuilder() { + } + + public DecoratedSearchResultsBuilder params(SearchParameters params) { + this.params = params; + return this; + } + + public DecoratedSearchResultsBuilder problems(List problems) { + this.problems = problems; + return this; + } + + public DecoratedSearchResultsBuilder evalResult(String evalResult) { + this.evalResult = evalResult; + return this; + } + + public DecoratedSearchResultsBuilder results(List results) { + this.results = results; + return this; + } + + public DecoratedSearchResultsBuilder focusDomain(String focusDomain) { + this.focusDomain = focusDomain; + return this; + } + + public DecoratedSearchResultsBuilder focusDomainId(int focusDomainId) { + this.focusDomainId = focusDomainId; + return this; + } + + public DecoratedSearchResultsBuilder filters(SearchFilters filters) { + this.filters = filters; + return this; + } + + public DecoratedSearchResultsBuilder resultPages(List resultPages) { + this.resultPages = resultPages; + return this; + } + + public DecoratedSearchResults build() { + return new DecoratedSearchResults(this.params, this.problems, this.evalResult, this.results, this.focusDomain, this.focusDomainId, this.filters, this.resultPages); + } + + public String toString() { + return "DecoratedSearchResults.DecoratedSearchResultsBuilder(params=" + this.params + ", problems=" + this.problems + ", evalResult=" + this.evalResult + ", results=" + this.results + ", focusDomain=" + 
this.focusDomain + ", focusDomainId=" + this.focusDomainId + ", filters=" + this.filters + ", resultPages=" + this.resultPages + ")"; + } + } +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/model/SearchFilters.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/model/SearchFilters.java new file mode 100644 index 00000000..bd46b7fa --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/model/SearchFilters.java @@ -0,0 +1,223 @@ +package nu.marginalia.search.model; + +import nu.marginalia.WebsiteUrl; +import nu.marginalia.search.command.*; + +import java.util.List; + +/** Models the search filters displayed next to the search results */ +public class SearchFilters { + private final WebsiteUrl url; + + public final String currentFilter; + + // These are necessary for the renderer to access the data + public final RemoveJsOption removeJsOption; + public final ReduceAdtechOption reduceAdtechOption; + public final ShowRecentOption showRecentOption; + public final SearchTitleOption searchTitleOption; + + public final List> filterGroups; + + // Getters are for the renderer to access the data + + + public String getCurrentFilter() { + return currentFilter; + } + + public RemoveJsOption getRemoveJsOption() { + return removeJsOption; + } + + public ReduceAdtechOption getReduceAdtechOption() { + return reduceAdtechOption; + } + + public ShowRecentOption getShowRecentOption() { + return showRecentOption; + } + + public SearchTitleOption getSearchTitleOption() { + return searchTitleOption; + } + + public List> getFilterGroups() { + return filterGroups; + } + + public SearchFilters(WebsiteUrl url, SearchParameters parameters) { + this.url = url; + + removeJsOption = new RemoveJsOption(parameters); + reduceAdtechOption = new ReduceAdtechOption(parameters); + showRecentOption = new ShowRecentOption(parameters); + searchTitleOption = new SearchTitleOption(parameters); + + + 
currentFilter = parameters.profile().filterId; + + filterGroups = List.of( + List.of( + new Filter("No Filter", SearchProfile.NO_FILTER, parameters), +// new Filter("Popular", SearchProfile.POPULAR, parameters), + new Filter("Small Web", SearchProfile.SMALLWEB, parameters), + new Filter("Blogosphere", SearchProfile.BLOGOSPHERE, parameters), + new Filter("Academia", SearchProfile.ACADEMIA, parameters) + ), + List.of( + new Filter("Vintage", SearchProfile.VINTAGE, parameters), + new Filter("Plain Text", SearchProfile.PLAIN_TEXT, parameters), + new Filter("~tilde", SearchProfile.TILDE, parameters) + ), + List.of( + new Filter("Wiki", SearchProfile.WIKI, parameters), + new Filter("Forum", SearchProfile.FORUM, parameters), + new Filter("Docs", SearchProfile.DOCS, parameters), + new Filter("Recipes", SearchProfile.FOOD, parameters) + ) + ); + + + } + + public class RemoveJsOption { + private final SearchJsParameter value; + + public final String url; + public String getUrl() { + return url; + } + + public boolean isSet() { + return value.equals(SearchJsParameter.DENY_JS); + } + + public String name() { + return "Remove Javascript"; + } + + public RemoveJsOption(SearchParameters parameters) { + this.value = parameters.js(); + + var toggledValue = switch (parameters.js()) { + case DENY_JS -> SearchJsParameter.DEFAULT; + default -> SearchJsParameter.DENY_JS; + }; + + this.url = parameters.withJs(toggledValue).renderUrl(SearchFilters.this.url); + } + } + + public class ReduceAdtechOption { + private final SearchAdtechParameter value; + + public final String url; + public String getUrl() { + return url; + } + + public boolean isSet() { + return value.equals(SearchAdtechParameter.REDUCE); + } + + public String name() { + return "Reduce Adtech"; + } + + public ReduceAdtechOption(SearchParameters parameters) { + this.value = parameters.adtech(); + + var toggledValue = switch (parameters.adtech()) { + case REDUCE -> SearchAdtechParameter.DEFAULT; + default -> 
SearchAdtechParameter.REDUCE; + }; + + this.url = parameters.withAdtech(toggledValue).renderUrl(SearchFilters.this.url); + } + } + + public class ShowRecentOption { + private final SearchRecentParameter value; + + public final String url; + public String getUrl() { + return url; + } + + public boolean isSet() { + return value.equals(SearchRecentParameter.RECENT); + } + + public String name() { + return "Recent Results"; + } + + public ShowRecentOption(SearchParameters parameters) { + this.value = parameters.recent(); + + var toggledValue = switch (parameters.recent()) { + case RECENT -> SearchRecentParameter.DEFAULT; + default -> SearchRecentParameter.RECENT; + }; + + this.url = parameters.withRecent(toggledValue).renderUrl(SearchFilters.this.url); + } + } + + public class SearchTitleOption { + private final SearchTitleParameter value; + + public final String url; + public String getUrl() { + return url; + } + + public boolean isSet() { + return value.equals(SearchTitleParameter.TITLE); + } + + public String name() { + return "Search In Title"; + } + + public SearchTitleOption(SearchParameters parameters) { + this.value = parameters.searchTitle(); + + var toggledValue = switch (parameters.searchTitle()) { + case TITLE -> SearchTitleParameter.DEFAULT; + default -> SearchTitleParameter.TITLE; + }; + + this.url = parameters.withTitle(toggledValue).renderUrl(SearchFilters.this.url); + } + } + + public class Filter { + public final SearchProfile profile; + + public final String displayName; + public final boolean current; + public final String url; + + public Filter(String displayName, SearchProfile profile, SearchParameters parameters) { + this.displayName = displayName; + this.profile = profile; + this.current = profile.equals(parameters.profile()); + + this.url = parameters.withProfile(profile).renderUrl(SearchFilters.this.url); + } + + public String getDisplayName() { + return displayName; + } + + public boolean isCurrent() { + return current; + } + + public String 
getUrl() { + return url; + } + } +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/model/SearchProfile.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/model/SearchProfile.java new file mode 100644 index 00000000..955c3fcb --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/model/SearchProfile.java @@ -0,0 +1,105 @@ +package nu.marginalia.search.model; + +import nu.marginalia.index.query.limit.SpecificationLimit; +import nu.marginalia.model.crawl.HtmlFeature; +import nu.marginalia.api.searchquery.model.query.SearchQuery; +import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier; + +import java.util.Objects; + +public enum SearchProfile { + POPULAR("default", SearchSetIdentifier.POPULAR), + SMALLWEB("modern", SearchSetIdentifier.SMALLWEB), + BLOGOSPHERE("blogosphere", SearchSetIdentifier.BLOGS), + NO_FILTER("corpo", SearchSetIdentifier.NONE), + VINTAGE("vintage", SearchSetIdentifier.NONE), + TILDE("tilde", SearchSetIdentifier.NONE), + CORPO_CLEAN("corpo-clean", SearchSetIdentifier.NONE), + ACADEMIA("academia", SearchSetIdentifier.NONE), + PLAIN_TEXT("plain-text", SearchSetIdentifier.NONE), + FOOD("food", SearchSetIdentifier.POPULAR), + FORUM("forum", SearchSetIdentifier.NONE), + WIKI("wiki", SearchSetIdentifier.NONE), + DOCS("docs", SearchSetIdentifier.NONE), + ; + + + public final String filterId; + public final SearchSetIdentifier searchSetIdentifier; + + SearchProfile(String filterId, SearchSetIdentifier searchSetIdentifier) { + this.filterId = filterId; + this.searchSetIdentifier = searchSetIdentifier; + } + + private final static SearchProfile[] values = values(); + public static SearchProfile getSearchProfile(String param) { + if (null == param) { + return NO_FILTER; + } + + for (var profile : values) { + if (Objects.equals(profile.filterId, param)) { + return profile; + } + } + + return NO_FILTER; + } + + public void 
addTacitTerms(SearchQuery subquery) { + if (this == ACADEMIA) { + subquery.searchTermsAdvice.add("special:academia"); + } + if (this == VINTAGE) { + subquery.searchTermsPriority.add("format:html123"); + subquery.searchTermsPriority.add("js:false"); + } + if (this == TILDE) { + subquery.searchTermsAdvice.add("special:tilde"); + } + if (this == PLAIN_TEXT) { + subquery.searchTermsAdvice.add("format:plain"); + } + if (this == WIKI) { + subquery.searchTermsAdvice.add("generator:wiki"); + } + if (this == FORUM) { + subquery.searchTermsAdvice.add("generator:forum"); + } + if (this == DOCS) { + subquery.searchTermsAdvice.add("generator:docs"); + } + if (this == FOOD) { + subquery.searchTermsAdvice.add(HtmlFeature.CATEGORY_FOOD.getKeyword()); + subquery.searchTermsExclude.add("special:ads"); + } + } + + public SpecificationLimit getYearLimit() { + if (this == SMALLWEB) { + return SpecificationLimit.greaterThan(2015); + } + if (this == VINTAGE) { + return SpecificationLimit.lessThan(2003); + } + else return SpecificationLimit.none(); + } + + public SpecificationLimit getSizeLimit() { + if (this == SMALLWEB) { + return SpecificationLimit.lessThan(500); + } + else return SpecificationLimit.none(); + } + + + public SpecificationLimit getQualityLimit() { + if (this == SMALLWEB) { + return SpecificationLimit.lessThan(5); + } + else return SpecificationLimit.none(); + } + +} + diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/model/UrlDetails.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/model/UrlDetails.java new file mode 100644 index 00000000..1426f9dc --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/model/UrlDetails.java @@ -0,0 +1,293 @@ +package nu.marginalia.search.model; + +import nu.marginalia.api.searchquery.model.results.SearchResultItem; +import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; +import nu.marginalia.model.EdgeUrl; 
+import nu.marginalia.model.crawl.DomainIndexingState; +import nu.marginalia.model.crawl.HtmlFeature; + +import java.util.ArrayList; +import java.util.List; + +/** + * A class to hold details about a single search result. + */ +public class UrlDetails implements Comparable { + public long id; + public int domainId; + + public EdgeUrl url; + public String title; + public String description; + + public String format; + public int features; + + public DomainIndexingState domainState; + + public double termScore; + + public int resultsFromSameDomain; + + public String positions; + public int positionsCount; + public SearchResultItem resultItem; + public List keywordScores; + + public UrlDetails(long id, int domainId, EdgeUrl url, String title, String description, String format, int features, DomainIndexingState domainState, double termScore, int resultsFromSameDomain, String positions, int positionsCount, SearchResultItem resultItem, List keywordScores) { + this.id = id; + this.domainId = domainId; + this.url = url; + this.title = title; + this.description = description; + this.format = format; + this.features = features; + this.domainState = domainState; + this.termScore = termScore; + this.resultsFromSameDomain = resultsFromSameDomain; + this.positions = positions; + this.positionsCount = positionsCount; + this.resultItem = resultItem; + this.keywordScores = keywordScores; + } + + public UrlDetails() { + } + + public boolean hasMoreResults() { + return resultsFromSameDomain > 1; + } + + public String getFormat() { + if (null == format) { + return "?"; + } + switch (format) { + case "HTML123": + return "HTML 1-3"; + case "HTML4": + return "HTML 4"; + case "XHTML": + return "XHTML"; + case "HTML5": + return "HTML 5"; + case "PLAIN": + return "Plain Text"; + default: + return "?"; + } + } + + public int hashCode() { + return Long.hashCode(id); + } + + @Override + public int compareTo(UrlDetails other) { + int result = Double.compare(getTermScore(), 
other.getTermScore()); + if (result == 0) result = Long.compare(getId(), other.getId()); + return result; + } + + public boolean equals(Object other) { + if (other == null) { + return false; + } + if (other == this) { + return true; + } + if (other instanceof UrlDetails) { + return ((UrlDetails) other).id == id; + } + return false; + } + + public String getTitle() { + if (title == null || title.isBlank()) { + return url.toString(); + } + return title; + } + + public boolean isPlainText() { + return "PLAIN".equals(format); + } + + public int getProblemCount() { + int mask = HtmlFeature.JS.getFeatureBit() + | HtmlFeature.COOKIES.getFeatureBit() + | HtmlFeature.TRACKING.getFeatureBit() + | HtmlFeature.AFFILIATE_LINK.getFeatureBit() + | HtmlFeature.TRACKING_ADTECH.getFeatureBit() + | HtmlFeature.ADVERTISEMENT.getFeatureBit(); + + return Integer.bitCount(features & mask); + } + + public List getProblems() { + List problems = new ArrayList<>(); + + if (isScripts()) { + problems.add(new UrlProblem("Js", "The page uses Javascript")); + } + if (isCookies()) { + problems.add(new UrlProblem("Co", "The page uses Cookies")); + } + if (isTracking()) { + problems.add(new UrlProblem("Tr", "The page uses Tracking/Analytics")); + } + if (isAffiliate()) { + problems.add(new UrlProblem("Af", "The page may use Affiliate Linking")); + } + if (isAds()) { + problems.add(new UrlProblem("Ad", "The page uses Ads/Adtech Tracking")); + } + return problems; + + } + + public boolean isScripts() { + return HtmlFeature.hasFeature(features, HtmlFeature.JS); + } + + public boolean isTracking() { + return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING); + } + + public boolean isAffiliate() { + return HtmlFeature.hasFeature(features, HtmlFeature.AFFILIATE_LINK); + } + + public boolean isMedia() { + return HtmlFeature.hasFeature(features, HtmlFeature.MEDIA); + } + + public boolean isCookies() { + return HtmlFeature.hasFeature(features, HtmlFeature.COOKIES); + } + + public boolean isAds() { + 
return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING_ADTECH); + } + + public int getMatchRank() { + if (termScore <= 1) return 1; + if (termScore <= 2) return 2; + if (termScore <= 3) return 3; + if (termScore <= 5) return 5; + + return 10; + } + + public long getId() { + return this.id; + } + + public int getDomainId() { + return this.domainId; + } + + public EdgeUrl getUrl() { + return this.url; + } + + public String getDescription() { + return this.description; + } + + public int getFeatures() { + return this.features; + } + + public DomainIndexingState getDomainState() { + return this.domainState; + } + + public double getTermScore() { + return this.termScore; + } + + public int getResultsFromSameDomain() { + return this.resultsFromSameDomain; + } + + public String getPositions() { + return this.positions; + } + + public int getPositionsCount() { + return this.positionsCount; + } + + public SearchResultItem getResultItem() { + return this.resultItem; + } + + public List getKeywordScores() { + return this.keywordScores; + } + + public UrlDetails withId(long id) { + return this.id == id ? this : new UrlDetails(id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores); + } + + public UrlDetails withDomainId(int domainId) { + return this.domainId == domainId ? this : new UrlDetails(this.id, domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores); + } + + public UrlDetails withUrl(EdgeUrl url) { + return this.url == url ? 
this : new UrlDetails(this.id, this.domainId, url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores); + } + + public UrlDetails withTitle(String title) { + return this.title == title ? this : new UrlDetails(this.id, this.domainId, this.url, title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores); + } + + public UrlDetails withDescription(String description) { + return this.description == description ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores); + } + + public UrlDetails withFormat(String format) { + return this.format == format ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores); + } + + public UrlDetails withFeatures(int features) { + return this.features == features ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores); + } + + public UrlDetails withDomainState(DomainIndexingState domainState) { + return this.domainState == domainState ? 
this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores); + } + + public UrlDetails withTermScore(double termScore) { + return this.termScore == termScore ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores); + } + + public UrlDetails withResultsFromSameDomain(int resultsFromSameDomain) { + return this.resultsFromSameDomain == resultsFromSameDomain ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores); + } + + public UrlDetails withPositions(String positions) { + return this.positions == positions ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, positions, this.positionsCount, this.resultItem, this.keywordScores); + } + + public UrlDetails withPositionsCount(int positionsCount) { + return this.positionsCount == positionsCount ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, positionsCount, this.resultItem, this.keywordScores); + } + + public UrlDetails withResultItem(SearchResultItem resultItem) { + return this.resultItem == resultItem ? 
this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, resultItem, this.keywordScores); + } + + public UrlDetails withKeywordScores(List keywordScores) { + return this.keywordScores == keywordScores ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, keywordScores); + } + + public String toString() { + return "UrlDetails(id=" + this.getId() + ", domainId=" + this.getDomainId() + ", url=" + this.getUrl() + ", title=" + this.getTitle() + ", description=" + this.getDescription() + ", format=" + this.getFormat() + ", features=" + this.getFeatures() + ", domainState=" + this.getDomainState() + ", termScore=" + this.getTermScore() + ", resultsFromSameDomain=" + this.getResultsFromSameDomain() + ", positions=" + this.getPositions() + ", positionsCount=" + this.getPositionsCount() + ", resultItem=" + this.getResultItem() + ", keywordScores=" + this.getKeywordScores() + ")"; + } + + public static record UrlProblem(String name, String description) { + + } +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/results/BrowseResultCleaner.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/results/BrowseResultCleaner.java new file mode 100644 index 00000000..8f4e5644 --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/results/BrowseResultCleaner.java @@ -0,0 +1,27 @@ +package nu.marginalia.search.results; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.browse.model.BrowseResult; +import nu.marginalia.screenshot.ScreenshotService; + +import java.util.HashSet; +import java.util.Set; +import 
java.util.function.Predicate; + +@Singleton +public class BrowseResultCleaner { + private final ScreenshotService screenshotService; + + @Inject + public BrowseResultCleaner(ScreenshotService screenshotService) { + this.screenshotService = screenshotService; + } + + public Predicate shouldRemoveResultPredicateBr() { + Set domainHashes = new HashSet<>(100); + + return (res) -> !screenshotService.hasScreenshot(res.domainId()) + || !domainHashes.add(res.domainHash()); + } +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/results/UrlDeduplicator.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/results/UrlDeduplicator.java new file mode 100644 index 00000000..046b779e --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/results/UrlDeduplicator.java @@ -0,0 +1,69 @@ +package nu.marginalia.search.results; + +import gnu.trove.list.TLongList; +import gnu.trove.list.array.TLongArrayList; +import gnu.trove.map.hash.TObjectIntHashMap; +import gnu.trove.set.hash.TIntHashSet; +import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; +import nu.marginalia.lsh.EasyLSH; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Objects; + +public class UrlDeduplicator { + private final int LSH_SIMILARITY_THRESHOLD = 2; + private static final Logger logger = LoggerFactory.getLogger(UrlDeduplicator.class); + + private final TIntHashSet seenSuperficialhashes = new TIntHashSet(200); + private final TLongList seehLSHList = new TLongArrayList(200); + private final TObjectIntHashMap keyCount = new TObjectIntHashMap<>(200, 0.75f, 0); + + private final int resultsPerKey; + public UrlDeduplicator(int resultsPerKey) { + this.resultsPerKey = resultsPerKey; + } + + public boolean shouldRemove(DecoratedSearchResultItem details) { + if (!deduplicateOnSuperficialHash(details)) + return true; + if (!deduplicateOnLSH(details)) + return true; + if 
(!limitResultsPerDomain(details)) + return true; + + return false; + } + + public boolean shouldRetain(DecoratedSearchResultItem details) { + return !shouldRemove(details); + } + + private boolean deduplicateOnSuperficialHash(DecoratedSearchResultItem details) { + return seenSuperficialhashes.add(Objects.hash(details.url.path, details.title)); + } + + private boolean deduplicateOnLSH(DecoratedSearchResultItem details) { + long thisHash = details.dataHash; + + if (0 == thisHash) + return true; + + if (seehLSHList.forEach(otherHash -> EasyLSH.hammingDistance(thisHash, otherHash) >= LSH_SIMILARITY_THRESHOLD)) + { + seehLSHList.add(thisHash); + return true; + } + + return false; + + } + + private boolean limitResultsPerDomain(DecoratedSearchResultItem details) { + final var domain = details.getUrl().getDomain(); + final String key = domain.getDomainKey(); + + return keyCount.adjustOrPutValue(key, 1, 1) <= resultsPerKey; + } + +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchAddToCrawlQueueService.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchAddToCrawlQueueService.java new file mode 100644 index 00000000..a5f080bf --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchAddToCrawlQueueService.java @@ -0,0 +1,69 @@ +package nu.marginalia.search.svc; + +import com.google.inject.Inject; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.WebsiteUrl; +import nu.marginalia.db.DbDomainQueries; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import spark.Request; +import spark.Response; +import spark.Spark; + +import java.sql.SQLException; + +public class SearchAddToCrawlQueueService { + + private final DbDomainQueries domainQueries; + private final WebsiteUrl websiteUrl; + private final HikariDataSource dataSource; + private final Logger logger = LoggerFactory.getLogger(SearchAddToCrawlQueueService.class); + 
+    @Inject
+    public SearchAddToCrawlQueueService(DbDomainQueries domainQueries,
+                                        WebsiteUrl websiteUrl,
+                                        HikariDataSource dataSource) {
+        this.domainQueries = domainQueries;
+        this.websiteUrl = websiteUrl;
+        this.dataSource = dataSource;
+    }
+
+    public Object suggestCrawling(Request request, Response response) throws SQLException {
+        logger.info("{}", request.queryParams());
+        int id = Integer.parseInt(request.queryParams("id"));
+        boolean nomisclick = "on".equals(request.queryParams("nomisclick"));
+
+        String domainName = getDomainName(id);
+
+        if (nomisclick) {
+            logger.info("Adding {} to crawl queue", domainName);
+            addToCrawlQueue(id);
+        }
+        else {
+            logger.info("Nomisclick not set, not adding {} to crawl queue", domainName);
+        }
+
+        response.redirect(websiteUrl.withPath("/site/" + domainName));
+
+        return "";
+    }
+
+    private void addToCrawlQueue(int id) throws SQLException {
+        try (var conn = dataSource.getConnection();
+             var stmt = conn.prepareStatement("""
+                     INSERT IGNORE INTO CRAWL_QUEUE(DOMAIN_NAME, SOURCE)
+                     SELECT DOMAIN_NAME, "user" FROM EC_DOMAIN WHERE ID=?
+                     """)) {
+            stmt.setInt(1, id);
+            stmt.executeUpdate();
+        }
+    }
+
+    private String getDomainName(int id) {
+        var domain = domainQueries.getDomain(id);
+        if (domain.isEmpty())
+            Spark.halt(404);
+        return domain.get().toString();
+    }
+}
+
diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchBrowseService.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchBrowseService.java
new file mode 100644
index 00000000..11c2e0e8
--- /dev/null
+++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchBrowseService.java
@@ -0,0 +1,101 @@
+package nu.marginalia.search.svc;
+
+import com.google.inject.Inject;
+import nu.marginalia.api.domains.DomainInfoClient;
+import nu.marginalia.api.domains.model.SimilarDomain;
+import nu.marginalia.browse.DbBrowseDomainsRandom;
+import nu.marginalia.browse.model.BrowseResult;
+import nu.marginalia.browse.model.BrowseResultSet;
+import nu.marginalia.db.DbDomainQueries;
+import nu.marginalia.db.DomainBlacklist;
+import nu.marginalia.model.EdgeDomain;
+import nu.marginalia.search.results.BrowseResultCleaner;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.TimeoutException;
+
+import static java.util.Collections.shuffle;
+
+/** Builds the result sets for the browse views: a random sample of domains, or domains related to a given one. */
+public class SearchBrowseService {
+    private final DbBrowseDomainsRandom randomDomains;
+    private final DbDomainQueries domainQueries;
+    private final DomainBlacklist blacklist;
+    private final DomainInfoClient domainInfoClient;
+    private final BrowseResultCleaner browseResultCleaner;
+
+    @Inject
+    public SearchBrowseService(DbBrowseDomainsRandom randomDomains,
+                               DbDomainQueries domainQueries,
+                               DomainBlacklist blacklist,
+                               DomainInfoClient domainInfoClient,
+                               BrowseResultCleaner browseResultCleaner)
+    {
+        this.randomDomains = randomDomains;
+        this.domainQueries = domainQueries;
+        this.blacklist = blacklist;
+        this.domainInfoClient = domainInfoClient;
+        this.browseResultCleaner = browseResultCleaner;
+    }
+
+    /** Requests 25 random domains from the given set and drops the entries the result cleaner rejects. */
+    public BrowseResultSet getRandomEntries(int set) {
+        var results = randomDomains.getRandomDomains(25, blacklist, set);
+
+        results.removeIf(browseResultCleaner.shouldRemoveResultPredicateBr());
+
+        return new BrowseResultSet(results);
+    }
+
+    /**
+     * Lists domains related to {@code domainName} that have a screenshot, in shuffled order.
+     * <p>
+     * Each remote lookup is given 100 ms to answer.
+     *
+     * @param domainName name of the domain to find neighbors for
+     * @return result set built from the neighbors whose domain id could be resolved
+     * @throws TimeoutException if a lookup does not answer in time
+     * @throws ExecutionException if a lookup fails
+     * @throws InterruptedException if interrupted while waiting for a lookup
+     */
+    public BrowseResultSet getRelatedEntries(String domainName) throws ExecutionException, InterruptedException, TimeoutException {
+        var domain = domainQueries.getDomainId(new EdgeDomain(domainName));
+
+        var neighbors = domainInfoClient.similarDomains(domain, 50)
+                .get(100, TimeUnit.MILLISECONDS);
+
+        neighbors.removeIf(sd -> !sd.screenshot());
+
+        // If the results are very few, supplement with the cruder linked-domains lookup
+        if (neighbors.size() < 25) {
+            Set<SimilarDomain> allNeighbors = new HashSet<>(neighbors);
+            allNeighbors.addAll(domainInfoClient
+                    .linkedDomains(domain, 50)
+                    .get(100, TimeUnit.MILLISECONDS)
+            );
+
+            neighbors.clear();
+            neighbors.addAll(allNeighbors);
+            neighbors.removeIf(sd -> !sd.screenshot());
+        }
+
+        List<BrowseResult> results = new ArrayList<>(neighbors.size());
+        for (SimilarDomain sd : neighbors) {
+            var resultDomain = domainQueries.getDomain(sd.domainId());
+            if (resultDomain.isEmpty())
+                continue; // skip ids with no known domain
+
+            results.add(new BrowseResult(resultDomain.get().toRootUrlHttp(), sd.domainId(), 0, sd.screenshot()));
+        }
+
+        // Shuffle the list that is actually returned, for a less repetitive experience
+        shuffle(results);
+
+        return new BrowseResultSet(results, domainName);
+    }
+}
diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchCrosstalkService.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchCrosstalkService.java
new file mode 100644
index 00000000..ddce56ac
--- /dev/null
+++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchCrosstalkService.java
@@ -0,0 +1,69 @@
+package nu.marginalia.search.svc;
+
+import com.google.inject.Inject;
+import nu.marginalia.renderer.MustacheRenderer;
+import nu.marginalia.renderer.RendererFactory;
+import nu.marginalia.search.SearchOperator;
+import nu.marginalia.search.model.UrlDetails;
+import org.apache.commons.lang3.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import spark.Request;
+import spark.Response;
+
+import java.io.IOException;
+import java.sql.SQLException;
+import java.util.List;
+
+/** Renders the site-crosstalk template with the link-search results between two domains, in both directions. */
+public class SearchCrosstalkService {
+    private static final Logger logger = LoggerFactory.getLogger(SearchCrosstalkService.class);
+    private final SearchOperator searchOperator;
+    private final MustacheRenderer renderer;
+
+    @Inject
+    public SearchCrosstalkService(SearchOperator searchOperator,
+                                  RendererFactory rendererFactory) throws IOException
+    {
+        this.searchOperator = searchOperator;
+        this.renderer = rendererFactory.renderer("search/site-info/site-crosstalk");
+    }
+
+    /** Expects a "domains" query parameter holding two comma-separated domains; otherwise throws IllegalArgumentException. */
+    public Object handle(Request request, Response response) throws SQLException {
+        String domains = request.queryParams("domains");
+        String[] parts = StringUtils.split(domains, ',');
+
+        // StringUtils.split returns null for a null input, i.e. when the parameter is absent
+        if (parts == null || parts.length != 2) {
+            throw new IllegalArgumentException("Expected exactly two domains");
+        }
+
+        response.type("text/html");
+
+        for (int i = 0; i < parts.length; i++) {
+            parts[i] = parts[i].trim();
+        }
+
+        var resAtoB = searchOperator.doLinkSearch(parts[0], parts[1]);
+        var resBtoA = searchOperator.doLinkSearch(parts[1], parts[0]);
+
+        var model = new CrosstalkResult(parts[0], parts[1], resAtoB, resBtoA);
+
+        return renderer.render(model);
+    }
+
+    /** Template model; forward is the doLinkSearch(domainA, domainB) result, backward the reverse call. */
+    private record CrosstalkResult(String domainA,
+                                   String domainB,
+                                   List forward,
+                                   List backward)
+    {
+        public boolean isFocusDomain() {
+            return true; // Hack to make the search result templates behave well
+        }
+        public boolean hasBoth() {
+            return !forward.isEmpty() && !backward.isEmpty();
+        }
+    }
+}
diff --git 
a/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchErrorPageService.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchErrorPageService.java new file mode 100644 index 00000000..346506e7 --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchErrorPageService.java @@ -0,0 +1,47 @@ +package nu.marginalia.search.svc; + +import com.google.inject.Inject; +import nu.marginalia.index.api.IndexMqClient; +import nu.marginalia.renderer.MustacheRenderer; +import nu.marginalia.renderer.RendererFactory; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import spark.Request; +import spark.Response; + +import java.io.IOException; +import java.util.Map; + +public class SearchErrorPageService { + private final IndexMqClient indexMqClient; + private final Logger logger = LoggerFactory.getLogger(getClass()); + private final MustacheRenderer renderer; + + @Inject + public SearchErrorPageService(IndexMqClient indexMqClient, + RendererFactory rendererFactory) throws IOException { + + renderer = rendererFactory.renderer("search/error-page-search"); + + this.indexMqClient = indexMqClient; + } + + public void serveError(Request request, Response rsp) { + rsp.body(renderError(request, "Internal error", + """ + An error occurred when communicating with the search engine index. +

+ This is hopefully a temporary state of affairs. It may be due to + an upgrade. The index typically takes a about two or three minutes + to reload from a cold restart. Thanks for your patience. + """)); + } + + private String renderError(Request request, String title, String message) { + return renderer.render(Map.of("title", title, "message", message, + "profile", request.queryParamOrDefault("profile", ""), + "js", request.queryParamOrDefault("js", ""), + "query", request.queryParamOrDefault("query", "") + )); + } +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchFlagSiteService.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchFlagSiteService.java new file mode 100644 index 00000000..c7ccfa34 --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchFlagSiteService.java @@ -0,0 +1,85 @@ +package nu.marginalia.search.svc; + +import com.google.inject.Inject; +import com.zaxxer.hikari.HikariDataSource; + +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.function.Function; +import java.util.stream.Collectors; + +/** Service for handling flagging sites. 
This code has an admin-facing correspondent in + * DomainComplaintService in control-service + */ +public class SearchFlagSiteService { + private final HikariDataSource dataSource; + + private final CategoryItem unknownCategory = new CategoryItem("unknown", "Unknown"); + + private final List categories = + List.of( + new CategoryItem("spam", "Spam"), + new CategoryItem("freebooting", "Reposting Stolen Content"), + new CategoryItem("broken", "Broken Website"), + new CategoryItem("shock", "Shocking/Offensive"), + new CategoryItem("blacklist", "Review Blacklisting"), + new CategoryItem("no-random", "Remove from Random Exploration") + ); + + private final Map categoryItemMap = + categories.stream().collect(Collectors.toMap(CategoryItem::categoryName, Function.identity())); + @Inject + public SearchFlagSiteService(HikariDataSource dataSource) { + this.dataSource = dataSource; + } + + public List getCategories() { + return categories; + } + + public List getExistingComplaints(int id) throws SQLException { + try (var conn = dataSource.getConnection(); + var complaintsStmt = conn.prepareStatement(""" + SELECT CATEGORY, FILE_DATE, REVIEWED, DECISION + FROM DOMAIN_COMPLAINT + WHERE DOMAIN_ID=? + """)) + { + List complaints = new ArrayList<>(); + + complaintsStmt.setInt(1, id); + ResultSet rs = complaintsStmt.executeQuery(); + + while (rs.next()) { + complaints.add(new FlagSiteComplaintModel( + categoryItemMap.getOrDefault(rs.getString(1), unknownCategory).categoryDesc, + rs.getString(2), + rs.getBoolean(3), + rs.getString(4))); + } + + return complaints; + } + } + + public void insertComplaint(FlagSiteFormData formData) throws SQLException { + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement( + """ + INSERT INTO DOMAIN_COMPLAINT(DOMAIN_ID, CATEGORY, DESCRIPTION, SAMPLE) VALUES (?, ?, ?, ?) 
+ """)) { + stmt.setInt(1, formData.domainId); + stmt.setString(2, formData.category); + stmt.setString(3, formData.description); + stmt.setString(4, formData.sampleQuery); + stmt.executeUpdate(); + } + } + + public record CategoryItem(String categoryName, String categoryDesc) {} + public record FlagSiteComplaintModel(String category, String submitTime, boolean isReviewed, String decision) {} + public record FlagSiteFormData(int domainId, String category, String description, String sampleQuery) {} +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchFrontPageService.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchFrontPageService.java new file mode 100644 index 00000000..8ebd9f8f --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchFrontPageService.java @@ -0,0 +1,117 @@ +package nu.marginalia.search.svc; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.renderer.MustacheRenderer; +import nu.marginalia.renderer.RendererFactory; +import nu.marginalia.search.svc.SearchQueryCountService; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import spark.Request; +import spark.Response; + +import java.io.IOException; +import java.sql.SQLException; +import java.time.LocalDate; +import java.time.ZoneId; +import java.time.ZonedDateTime; +import java.time.format.DateTimeFormatter; +import java.util.ArrayList; +import java.util.List; + +/** Renders the front page (index) */ +@Singleton +public class SearchFrontPageService { + + private final MustacheRenderer template; + private final HikariDataSource dataSource; + private final SearchQueryCountService searchVisitorCount; + + private final Logger logger = LoggerFactory.getLogger(getClass()); + + @Inject + public SearchFrontPageService(RendererFactory rendererFactory, + HikariDataSource 
dataSource, + SearchQueryCountService searchVisitorCount + ) throws IOException { + this.template = rendererFactory.renderer("search/index/index"); + this.dataSource = dataSource; + this.searchVisitorCount = searchVisitorCount; + } + + public String render(Request request, Response response) { + response.header("Cache-control", "public,max-age=3600"); + + return template.render(new IndexModel( + getNewsItems(), + searchVisitorCount.getQueriesPerMinute() + )); + } + + + private List getNewsItems() { + List items = new ArrayList<>(); + + try (var conn = dataSource.getConnection(); + var stmt = conn.prepareStatement(""" + SELECT TITLE, LINK, SOURCE, LIST_DATE FROM SEARCH_NEWS_FEED ORDER BY LIST_DATE DESC + """)) { + + var rep = stmt.executeQuery(); + + while (rep.next()) { + items.add(new NewsItem( + rep.getString(1), + rep.getString(2), + rep.getString(3), + rep.getDate(4).toLocalDate())); + } + } + catch (SQLException ex) { + logger.warn("Failed to fetch news items", ex); + } + + return items; + } + + public Object renderNewsFeed(Request request, Response response) { + List newsItems = getNewsItems(); + + StringBuilder sb = new StringBuilder(); + + sb.append(""" + + + + Marginalia Search News and Mentions + https://search.marginalia.nu/ + News and Mentions of Marginalia Search + en-us + 60 + """); + + sb.append("").append(ZonedDateTime.now().format(DateTimeFormatter.RFC_1123_DATE_TIME)).append("\n"); + sb.append("").append(ZonedDateTime.now().format(DateTimeFormatter.RFC_1123_DATE_TIME)).append("\n"); + sb.append("60\n"); + for (var item : newsItems) { + sb.append("\n"); + sb.append("").append(item.title()).append("\n"); + sb.append("").append(item.url()).append("\n"); + if (item.source != null) { + sb.append("").append(item.source()).append("\n"); + } + sb.append("").append(item.date().atStartOfDay().atZone(ZoneId.systemDefault()).format(DateTimeFormatter.RFC_1123_DATE_TIME)).append("\n"); + sb.append("\n"); + } + sb.append("\n"); + sb.append("\n"); + + 
response.type("application/rss+xml");
+
+        return sb.toString();
+    }
+
+    private record IndexModel(List news, int searchPerMinute) { }
+    private record NewsItem(String title, String url, String source, LocalDate date) {}
+}
diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchQueryCountService.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchQueryCountService.java
new file mode 100644
index 00000000..77afba8a
--- /dev/null
+++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchQueryCountService.java
@@ -0,0 +1,55 @@
+package nu.marginalia.search.svc;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.inject.Singleton;
+import java.time.temporal.ChronoUnit;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ * Keeps per-minute statistics of queries.
+ * <p>
+ * Queries are tallied in a running counter; a daemon thread started by the constructor
+ * publishes the tally and resets it once a minute.
+ */
+@Singleton
+public class SearchQueryCountService {
+    private final Logger logger = LoggerFactory.getLogger(getClass());
+    private final AtomicInteger lastMinuteQueries = new AtomicInteger();
+
+    private final TimeUnit minute = TimeUnit.of(ChronoUnit.MINUTES);
+    private volatile int queriesPerMinute;
+
+    public SearchQueryCountService() {
+        Thread updateThread = new Thread(this::updateQueriesPerMinute,
+                "SearchVisitorCountService::updateQueriesPerMinute");
+        updateThread.setDaemon(true);
+        updateThread.start();
+    }
+
+    /** Retrieve the number of queries performed the minute before this one */
+    public int getQueriesPerMinute() {
+        return queriesPerMinute;
+    }
+
+    /** Count one query towards the current minute's tally */
+    public void registerQuery() {
+        lastMinuteQueries.incrementAndGet();
+    }
+
+    /** Body of the updater thread: publish the tally, reset it, sleep a minute; ends when interrupted. */
+    private void updateQueriesPerMinute() {
+        try {
+            for (;;) {
+                queriesPerMinute = lastMinuteQueries.getAndSet(0);
+                minute.sleep(1);
+            }
+        } catch (InterruptedException e) {
+            // Restore the flag so the interruption stays observable on this thread
+            Thread.currentThread().interrupt();
+            logger.warn("Query counter thread was interrupted");
+        }
+    }
+}
diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchQueryIndexService.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchQueryIndexService.java new file mode 100644 index 00000000..e69de29b diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchQueryService.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchQueryService.java new file mode 100644 index 00000000..0f4648da --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchQueryService.java @@ -0,0 +1,62 @@ +package nu.marginalia.search.svc; + +import com.google.inject.Inject; +import nu.marginalia.WebsiteUrl; +import nu.marginalia.search.command.CommandEvaluator; +import nu.marginalia.search.command.SearchParameters; +import nu.marginalia.search.exceptions.RedirectException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import spark.Request; +import spark.Response; + +public class SearchQueryService { + + private final WebsiteUrl websiteUrl; + private final SearchErrorPageService errorPageService; + private final CommandEvaluator searchCommandEvaulator; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + @Inject + public SearchQueryService( + WebsiteUrl websiteUrl, + SearchErrorPageService errorPageService, + CommandEvaluator searchCommandEvaulator) { + this.websiteUrl = websiteUrl; + this.errorPageService = errorPageService; + this.searchCommandEvaulator = searchCommandEvaulator; + } + + public Object pathSearch(Request request, Response response) { + try { + return searchCommandEvaulator.eval(response, parseParameters(request)); + } + catch (RedirectException ex) { + response.redirect(ex.newUrl); + } + catch (Exception ex) { + logger.error("Error", ex); + errorPageService.serveError(request, response); + } + + return ""; + } + + private SearchParameters 
parseParameters(Request request) { + try { + final String queryParam = request.queryParams("query"); + + if (null == queryParam || queryParam.isBlank()) { + throw new RedirectException(websiteUrl.url()); + } + + return new SearchParameters(queryParam.trim(), request); + } + catch (Exception ex) { + // Bots keep sending bad requests, suppress the error otherwise it will + // fill up the logs. + + throw new RedirectException(websiteUrl.url()); + } + } +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchSiteInfoService.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchSiteInfoService.java new file mode 100644 index 00000000..f56c3b79 --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchSiteInfoService.java @@ -0,0 +1,416 @@ +package nu.marginalia.search.svc; + +import com.google.inject.Inject; +import nu.marginalia.api.domains.DomainInfoClient; +import nu.marginalia.api.domains.model.DomainInformation; +import nu.marginalia.api.domains.model.SimilarDomain; +import nu.marginalia.api.feeds.FeedsClient; +import nu.marginalia.api.feeds.RpcFeed; +import nu.marginalia.api.feeds.RpcFeedItem; +import nu.marginalia.api.livecapture.LiveCaptureClient; +import nu.marginalia.db.DbDomainQueries; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.renderer.MustacheRenderer; +import nu.marginalia.renderer.RendererFactory; +import nu.marginalia.screenshot.ScreenshotService; +import nu.marginalia.search.SearchOperator; +import nu.marginalia.search.model.UrlDetails; +import nu.marginalia.search.svc.SearchFlagSiteService.FlagSiteFormData; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import spark.Request; +import spark.Response; + +import java.io.IOException; +import java.sql.SQLException; +import java.util.List; +import java.util.Map; +import java.util.concurrent.CompletableFuture; +import java.util.concurrent.Future; +import 
java.util.concurrent.TimeUnit; +import java.util.function.Supplier; + +public class SearchSiteInfoService { + private static final Logger logger = LoggerFactory.getLogger(SearchSiteInfoService.class); + + private final SearchOperator searchOperator; + private final DomainInfoClient domainInfoClient; + private final SearchFlagSiteService flagSiteService; + private final DbDomainQueries domainQueries; + private final MustacheRenderer renderer; + private final FeedsClient feedsClient; + private final LiveCaptureClient liveCaptureClient; + private final ScreenshotService screenshotService; + + @Inject + public SearchSiteInfoService(SearchOperator searchOperator, + DomainInfoClient domainInfoClient, + RendererFactory rendererFactory, + SearchFlagSiteService flagSiteService, + DbDomainQueries domainQueries, + FeedsClient feedsClient, + LiveCaptureClient liveCaptureClient, + ScreenshotService screenshotService) throws IOException + { + this.searchOperator = searchOperator; + this.domainInfoClient = domainInfoClient; + this.flagSiteService = flagSiteService; + this.domainQueries = domainQueries; + + this.renderer = rendererFactory.renderer("search/site-info/site-info"); + + this.feedsClient = feedsClient; + this.liveCaptureClient = liveCaptureClient; + this.screenshotService = screenshotService; + } + + public Object handle(Request request, Response response) throws SQLException { + String domainName = request.params("site"); + String view = request.queryParamOrDefault("view", "info"); + + if (null == domainName || domainName.isBlank()) { + return null; + } + + var model = switch (view) { + case "links" -> listLinks(domainName); + case "docs" -> listDocs(domainName); + case "info" -> listInfo(domainName); + case "report" -> reportSite(domainName); + default -> listInfo(domainName); + }; + + return renderer.render(model); + } + + public Object handlePost(Request request, Response response) throws SQLException { + String domainName = request.params("site"); + String view = 
request.queryParamOrDefault("view", "info"); + + if (null == domainName || domainName.isBlank()) { + return null; + } + + if (!view.equals("report")) + return null; + + final int domainId = domainQueries.getDomainId(new EdgeDomain(domainName)); + + FlagSiteFormData formData = new FlagSiteFormData( + domainId, + request.queryParams("category"), + request.queryParams("description"), + request.queryParams("sampleQuery") + ); + flagSiteService.insertComplaint(formData); + + var complaints = flagSiteService.getExistingComplaints(domainId); + + var model = new ReportDomain(domainName, domainId, complaints, List.of(), true); + + return renderer.render(model); + } + + private Object reportSite(String domainName) throws SQLException { + int domainId = domainQueries.getDomainId(new EdgeDomain(domainName)); + var existingComplaints = flagSiteService.getExistingComplaints(domainId); + + return new ReportDomain(domainName, + domainId, + existingComplaints, + flagSiteService.getCategories(), + false); + } + + + private Backlinks listLinks(String domainName) { + return new Backlinks(domainName, + domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1), + searchOperator.doBacklinkSearch(domainName)); + } + + private SiteInfoWithContext listInfo(String domainName) { + + final int domainId = domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1); + + final Future domainInfoFuture; + final Future> similarSetFuture; + final Future> linkingDomainsFuture; + final CompletableFuture feedItemsFuture; + String url = "https://" + domainName + "/"; + + boolean hasScreenshot = screenshotService.hasScreenshot(domainId); + + + if (domainId < 0) { + domainInfoFuture = CompletableFuture.failedFuture(new Exception("Unknown Domain ID")); + similarSetFuture = CompletableFuture.failedFuture(new Exception("Unknown Domain ID")); + linkingDomainsFuture = CompletableFuture.failedFuture(new Exception("Unknown Domain ID")); + feedItemsFuture = CompletableFuture.failedFuture(new 
Exception("Unknown Domain ID")); + } + else if (!domainInfoClient.isAccepting()) { + domainInfoFuture = CompletableFuture.failedFuture(new Exception("Assistant Service Unavailable")); + similarSetFuture = CompletableFuture.failedFuture(new Exception("Assistant Service Unavailable")); + linkingDomainsFuture = CompletableFuture.failedFuture(new Exception("Assistant Service Unavailable")); + feedItemsFuture = CompletableFuture.failedFuture(new Exception("Assistant Service Unavailable")); + } + else { + domainInfoFuture = domainInfoClient.domainInformation(domainId); + similarSetFuture = domainInfoClient.similarDomains(domainId, 25); + linkingDomainsFuture = domainInfoClient.linkedDomains(domainId, 25); + feedItemsFuture = feedsClient.getFeed(domainId); + } + + List sampleResults = searchOperator.doSiteSearch(domainName, domainId,5); + if (!sampleResults.isEmpty()) { + url = sampleResults.getFirst().url.withPathAndParam("/", null).toString(); + } + + var result = new SiteInfoWithContext(domainName, + domainId, + url, + hasScreenshot, + waitForFuture(domainInfoFuture, () -> createDummySiteInfo(domainName)), + waitForFuture(similarSetFuture, List::of), + waitForFuture(linkingDomainsFuture, List::of), + waitForFuture(feedItemsFuture.thenApply(FeedItems::new), () -> FeedItems.dummyValue(domainName)), + sampleResults + ); + + requestMissingScreenshots(result); + + return result; + } + + /** Request missing screenshots for the given site info */ + private void requestMissingScreenshots(SiteInfoWithContext result) { + + // Always request the main site screenshot, even if we already have it + // as this will make the live-capture do a staleness check and update + // as needed. + liveCaptureClient.requestScreengrab(result.domainId()); + + int requests = 1; + + // Request screenshots for similar and linking domains only if they are absent + // also throttle the requests to at most 5 per view. 
+ + if (result.similar() != null) { + for (var similar : result.similar()) { + if (similar.screenshot()) { + continue; + } + if (++requests > 5) { + break; + } + + liveCaptureClient.requestScreengrab(similar.domainId()); + } + } + + if (result.linking() != null) { + for (var linking : result.linking()) { + if (linking.screenshot()) { + continue; + } + if (++requests > 5) { + break; + } + + liveCaptureClient.requestScreengrab(linking.domainId()); + } + } + + } + + private T waitForFuture(Future future, Supplier fallback) { + try { + return future.get(250, TimeUnit.MILLISECONDS); + } catch (Exception e) { + logger.info("Failed to get domain data: {}", e.getMessage()); + return fallback.get(); + } + } + + private DomainInformation createDummySiteInfo(String domainName) { + return DomainInformation.builder() + .domain(new EdgeDomain(domainName)) + .suggestForCrawling(true) + .unknownDomain(true) + .build(); + } + + private Docs listDocs(String domainName) { + int domainId = domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1); + return new Docs(domainName, + domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1), + searchOperator.doSiteSearch(domainName, domainId, 100)); + } + + public record Docs(Map view, + String domain, + long domainId, + List results) { + public Docs(String domain, long domainId, List results) { + this(Map.of("docs", true), domain, domainId, results); + } + + public String focusDomain() { return domain; } + + public String query() { return "site:" + domain; } + + public boolean isKnown() { + return domainId > 0; + } + } + + public record Backlinks(Map view, String domain, long domainId, List results) { + public Backlinks(String domain, long domainId, List results) { + this(Map.of("links", true), domain, domainId, results); + } + + public String query() { return "links:" + domain; } + + public boolean isKnown() { + return domainId > 0; + } + } + + public record SiteInfoWithContext(Map view, + Map domainState, + String 
domain, + int domainId, + String siteUrl, + boolean hasScreenshot, + DomainInformation domainInformation, + List similar, + List linking, + FeedItems feed, + List samples + ) { + public SiteInfoWithContext(String domain, + int domainId, + String siteUrl, + boolean hasScreenshot, + DomainInformation domainInformation, + List similar, + List linking, + FeedItems feedInfo, + List samples + ) + { + this(Map.of("info", true), + Map.of(domainInfoState(domainInformation), true), + domain, + domainId, + siteUrl, + hasScreenshot, + domainInformation, + similar, + linking, + feedInfo, + samples); + } + + public String getLayout() { + // My CSS is too weak to handle this in CSS alone, so I guess we're doing layout in Java... + if (similar != null && similar.size() < 25) { + return "lopsided"; + } + else if (feed != null && !feed.items().isEmpty()) { + return "lopsided"; + } + else if (samples != null && !samples.isEmpty()) { + return "lopsided"; + } + else { + return "balanced"; + } + } + + public String query() { return "site:" + domain; } + + private static String domainInfoState(DomainInformation info) { + if (info.isBlacklisted()) { + return "blacklisted"; + } + if (!info.isUnknownDomain() && info.isSuggestForCrawling()) { + return "suggestForCrawling"; + } + if (info.isInCrawlQueue()) { + return "inCrawlQueue"; + } + if (info.isUnknownDomain()) { + return "unknownDomain"; + } + else { + return "indexed"; + } + } + + public boolean isKnown() { + return domainId > 0; + } + } + + public record FeedItem(String title, String date, String description, String url) { + + public FeedItem(RpcFeedItem rpcFeedItem) { + this(rpcFeedItem.getTitle(), + rpcFeedItem.getDate(), + rpcFeedItem.getDescription(), + rpcFeedItem.getUrl()); + } + + public String pubDay() { // Extract the date from an ISO style date string + if (date.length() > 10) { + return date.substring(0, 10); + } + return date; + } + + public String descriptionSafe() { + return description + .replace("<", "<") + 
.replace(">", ">"); + } + } + + public record FeedItems(String domain, String feedUrl, String updated, List items) { + + public static FeedItems dummyValue(String domain) { + return new FeedItems(domain, "", "", List.of()); + } + + public FeedItems(RpcFeed rpcFeedItems) { + this(rpcFeedItems.getDomain(), + rpcFeedItems.getFeedUrl(), + rpcFeedItems.getUpdated(), + rpcFeedItems.getItemsList().stream().map(FeedItem::new).toList()); + } + } + + public record ReportDomain( + Map view, + String domain, + int domainId, + List complaints, + List category, + boolean submitted) + { + public ReportDomain(String domain, + int domainId, + List complaints, + List category, + boolean submitted) { + this(Map.of("report", true), domain, domainId, complaints, category, submitted); + } + + public String query() { return "site:" + domain; } + + public boolean isKnown() { + return domainId > 0; + } + } + +} diff --git a/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchUnitConversionService.java b/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchUnitConversionService.java new file mode 100644 index 00000000..1727878e --- /dev/null +++ b/code/services-application/search-service-legacy/java/nu/marginalia/search/svc/SearchUnitConversionService.java @@ -0,0 +1,73 @@ +package nu.marginalia.search.svc; + +import nu.marginalia.api.math.MathClient; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.CheckForNull; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import java.util.Optional; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.function.Predicate; +import java.util.regex.Pattern; + +@Singleton +public class SearchUnitConversionService { + private final Logger logger = LoggerFactory.getLogger(getClass()); + private final 
Pattern conversionPattern = Pattern.compile("((\\d+|\\s+|[.()\\-^+%*/]|log[^a-z]|log2[^a-z]|sqrt[^a-z]|log10|cos[^a-z]|sin[^a-z]|tan[^a-z]|log2|pi[^a-z]|e[^a-z]|2pi[^a-z])+)\\s*([a-zA-Z][a-zA-Z^.0-9]*\\s?[a-zA-Z^.0-9]*)\\s+in\\s+([a-zA-Z^.0-9]+\\s?[a-zA-Z^.0-9]*)");
+    private final Predicate evalPredicate = Pattern.compile("(\\d+|\\s+|[.()\\-^+%*/]|log|log2|sqrt|log10|cos|sin|tan|pi|e|2pi)+").asMatchPredicate();
+
+    private final MathClient mathClient;
+
+    @Inject
+    public SearchUnitConversionService(MathClient mathClient) {
+        this.mathClient = mathClient;
+    }
+
+    /** Tries a unit conversion for queries shaped "[expression] [unit] in [unit]"; empty when the query has another shape, or the lookup fails, is interrupted or exceeds 250 ms. */
+    public Optional tryConversion(String query) {
+        var matcher = conversionPattern.matcher(query);
+        if (!matcher.matches())
+            return Optional.empty();
+
+        String value = matcher.group(1);
+        String from = matcher.group(3);
+        String to = matcher.group(4);
+        logger.info("{} -> '{}' '{}' '{}'", query, value, from, to);
+
+        try {
+            var resultFuture = mathClient.unitConversion(value, from, to);
+            return Optional.of(
+                    resultFuture.get(250, TimeUnit.MILLISECONDS)
+            );
+        } catch (ExecutionException e) {
+            logger.error("Error in unit conversion", e);
+        } catch (InterruptedException e) {
+            Thread.currentThread().interrupt(); // keep the interruption visible to the caller
+            logger.error("Interrupted while waiting for unit conversion", e);
+        } catch (TimeoutException e) {
+            // Ignore
+        }
+        return Optional.empty();
+    }
+
+    /** Hands the query to the math client when it consists only of arithmetic tokens; returns null otherwise, or when it is nothing but digits. */
+    public @CheckForNull Future tryEval(String query) {
+        if (!evalPredicate.test(query)) {
+            return null;
+        }
+
+        var expr = query.toLowerCase().trim();
+        if (expr.chars().allMatch(Character::isDigit)) {
+            return null;
+        }
+
+        logger.info("eval({})", expr);
+
+        return mathClient.evalMath(expr);
+    }
+}
diff --git a/code/services-application/search-service-legacy/readme.md b/code/services-application/search-service-legacy/readme.md
new file mode 100644
index 00000000..36923bf5
--- /dev/null
+++ b/code/services-application/search-service-legacy/readme.md
@@ -0,0 +1,3 @@
+# Search Service
+
+This is the old search service that serves search traffic with the old GUI.
\ No newline at end of file diff --git a/code/services-application/search-service-legacy/resources/static/search/crawler-ips.txt b/code/services-application/search-service-legacy/resources/static/search/crawler-ips.txt new file mode 100644 index 00000000..09952002 --- /dev/null +++ b/code/services-application/search-service-legacy/resources/static/search/crawler-ips.txt @@ -0,0 +1,14 @@ +81.170.128.52 +193.183.0.162 +193.183.0.163 +193.183.0.164 +193.183.0.165 +193.183.0.166 +193.183.0.167 +193.183.0.168 +193.183.0.169 +193.183.0.170 +193.183.0.171 +193.183.0.172 +193.183.0.173 +193.183.0.174 \ No newline at end of file diff --git a/code/services-application/search-service-legacy/resources/static/search/favicon.ico b/code/services-application/search-service-legacy/resources/static/search/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..a1136a7f6ef20e65517dcc7f46705e46e7fff3a2 GIT binary patch literal 1211 zcmV;s1VsCZP)EX>4Tx04R}tkv&MmKpe$iQ>7x6f>sc5$WWauh>ALD6^c+H)C#RSm|Xe=O&XFG z7e~Rh;NZt%)xpJCR|i)?5c~jfb8}L3krMxx6k5c1aNLh~_a1le0HI!Dn$f_we!cF3PjK&;2=i)U3q-pGZ8*46{PKK|Hlt zF*xrNhgm^ZiO-2gO}ZfQBi9v|-#F(T7IZL1yduQB#x+>PWeLG zWtH<5XRTCa&3p0}2DAFgGS_JiA&x~XL4pVcRTNP|1yNdcQY<8CKjz^dbo>&z6mk{8 z$gzMjG{}x0{11M2Yvm@!-K1a)=zOv5k6|FN3p8rB{e5iPjT6BC3|#3gf4L6Ke3D*k zX^|r!v<+Nbw=`uBxZD8-o($QP9m!8q$mM|dGy0|s(02=TuerT7_i_3Fq^PUJ4RCM> zjN~bM-Q(R|?Y;ebrrF;Qgrah;V>MW400006VoOIv00000008+zyMF)x010qNS#tmY zE+YT{E+YYWr9XB6000McNlirueSad^gZEa<4bO1wgWnpw> zWFU8GbZ8()Nlj2!fese{00NUqL_t(o!|j*NZqrZ@g}<2_+i_Z?l)ehE=%OM8u|h)p zJ_iWVP1U{ut<=|G(JHV41c=AzGf<(Xh5Y#X4vV->LR?gmLN`6qY@tJduujTlp z$7MpfWL!0H=?&DXF9!g;_tfKb?5a$xPr~p%Bg-<9B)u30mYPjV8&O#ci?yPXimKc4 z^0Mvh>|E>!jyqj?U0+mQl&`3aqoX6t%(k|+Xf%>}MxX`FhB!F<{*8l!11t=wAE@HY zF!4mWV#Rxi2I9z(#Mi9*vnw^{@VU(|!e+CHSzvF!&E9^ShItZl>LxffoDLgpweH}Y zV`F23-Q8XD-s7D6H!gUy_pU4ri|W7gN^x8vtlzHbFf=E&-P- z$PWM#um2{B#e^VL2{`imYU}Rk!Kz?>)*P6IGXWW@|970S^7Zq_QCrDeK~%*+gyE+~ 
za2C4V{>6^KB>&5#82UbnYV+m5C=UY0pGPnb?=uh~%LZec&yfQGMJhH^Q$q21UTxhi zbr8q##b}@;PO>xdVdRW)5ehL>x@JZVWUAEU_LU&p`Ya6UL^dGK z6|X{`=Ojsj_e0<3p+^W|9v6g?Naq}BnsU-P!Cc^aFCqy!Y7Vag1e_DRACzi$nqXuQ zbS#Q>HU`pq{rt8QoBEVC8jaaU(Chc*luA`cmibDyhKCNpDy*)m { + if (e.key === "Enter") { + const form = document.getElementById('search-form'); + form.submit(); + e.preventDefault(); + } +}); diff --git a/code/services-application/search-service-legacy/resources/static/search/menu.js b/code/services-application/search-service-legacy/resources/static/search/menu.js new file mode 100644 index 00000000..c257754a --- /dev/null +++ b/code/services-application/search-service-legacy/resources/static/search/menu.js @@ -0,0 +1,91 @@ +function hideMenu() { + document.getElementById('filters').style.display = 'none'; +} +function showMenu() { + document.getElementById('filters').style.display = 'block'; + + // Defer creation of the close button until the menu is opened. This is needed because the script for creating + // the filter button is run early to avoid layout shifts. 
+ + if (document.getElementById('menu-close') === null) { + registerCloseButton(); + } + + document.getElementById('filters').style.display = 'block'; + + // scroll to the top of the page so the user can see the filters + window.scrollTo({ + top: 0, + left: 0, + behavior: "instant", + }); +} + +const registerCloseButton = () => { + // Add a button to close the filters for mobile; we do this in js to not pollute the DOM for text-only browsers + const closeButton = document.createElement('button'); + closeButton.setAttribute('id', 'menu-close'); + closeButton.setAttribute('title', 'Close the menu'); + closeButton.setAttribute('aria-controls', '#filters'); + closeButton.innerHTML = 'X'; + closeButton.onclick = (event) => { + hideMenu(); + event.stopPropagation(); + return false; + } + document.getElementById('filters').getElementsByTagName('h2')[0].append(closeButton); +} + +// Add a button to open the filters for mobile; we do this in js to not pollute the DOM for text-only browsers +const filtersButton = document.createElement('button'); +filtersButton.setAttribute('id', 'mcfeast'); +filtersButton.setAttribute('aria-controls', '#filters'); +filtersButton.innerHTML = 'Ξ'; +filtersButton.setAttribute('title', 'Open the filters menu'); +filtersButton.onclick = (event) => { + showMenu(); + event.stopPropagation(); + return false; +} + +document.getElementById('search-box').getElementsByTagName('h1')[0].append(filtersButton); + +// swipe affordances for mobile +if (window.matchMedia('(pointer: coarse)').matches) { + // capture swipes to the left and right to open and close the filters + let touchStartX = 0; + let touchEndX = 0; + let touchStartY = 0; + let touchEndY = 0; + + const swipeThreshold = 100; + const maxVerticalDistance = 75; + document.addEventListener('touchstart', (event) => { + touchStartX = event.changedTouches[0].screenX; + touchStartY = event.changedTouches[0].screenY; + }); + document.addEventListener('touchend', (event) => { + touchEndX = 
event.changedTouches[0].screenX; + touchEndY = event.changedTouches[0].screenY; + let verticalDistance = Math.abs(touchStartY - touchEndY); + + if (verticalDistance > maxVerticalDistance) { + return; + } + + if (touchEndX - touchStartX > swipeThreshold) { + hideMenu(); + event.stopPropagation(); + } else if (touchStartX - touchEndX > swipeThreshold) { + showMenu(); + event.stopPropagation(); + } + }); + + + // Add a floating panel to the bottom of the page to show a message when the filters are hidden + const floatingPanel = document.createElement('div'); + floatingPanel.setAttribute('style', 'position: fixed; bottom: 0; left: 0; right: 0; backdrop-filter: blur(10px); padding: 0.25em; text-align: center; display: block; border-top: 1px solid #ccc; box-shadow: 0 0 -5px #eee;'); + floatingPanel.innerHTML = '← swipe left to open filters ←'; + document.body.appendChild(floatingPanel); +} \ No newline at end of file diff --git a/code/services-application/search-service-legacy/resources/static/search/opensearch.xml b/code/services-application/search-service-legacy/resources/static/search/opensearch.xml new file mode 100644 index 00000000..7b6d14a3 --- /dev/null +++ b/code/services-application/search-service-legacy/resources/static/search/opensearch.xml @@ -0,0 +1,15 @@ + + + + + Marginalia + Search Marginalia + UTF-8 + https://search.marginalia.nu/favicon.ico + + https://search.marginalia.nu/ + \ No newline at end of file diff --git a/code/services-application/search-service-legacy/resources/static/search/robots.txt b/code/services-application/search-service-legacy/resources/static/search/robots.txt new file mode 100644 index 00000000..0c0833e9 --- /dev/null +++ b/code/services-application/search-service-legacy/resources/static/search/robots.txt @@ -0,0 +1,8 @@ +User-agent: * +Disallow: /browse/ +Disallow: /search/ +Disallow: /search +Disallow: /wiki/ +Disallow: /explore/ +Disallow: /site/ +Disallow: /links/ \ No newline at end of file diff --git 
a/code/services-application/search-service-legacy/resources/static/search/rss.svg b/code/services-application/search-service-legacy/resources/static/search/rss.svg new file mode 100644 index 00000000..2c01c8b3 --- /dev/null +++ b/code/services-application/search-service-legacy/resources/static/search/rss.svg @@ -0,0 +1,17 @@ + + + + + + + + + + + + + \ No newline at end of file diff --git a/code/services-application/search-service-legacy/resources/static/search/serp.scss b/code/services-application/search-service-legacy/resources/static/search/serp.scss new file mode 100644 index 00000000..ea4adcd0 --- /dev/null +++ b/code/services-application/search-service-legacy/resources/static/search/serp.scss @@ -0,0 +1,831 @@ +:root { + color-scheme: light; + + --clr-bg-page: hsl(60, 42%, 95%); // $nicotine-light + + --clr-bg-ui: hsl(0, 0%, 100%); + --clr-text-ui: #000; // $fg-dark + + --clr-bg-theme: hsl(200, 28%, 34%); // $highlight-light + --clr-text-theme: #fff; // $fg-light + + --clr-bg-highlight: hsl(0, 0%, 93%); // $highlight-light2 + --clr-text-highlight: #111111; + + --clr-bg-accent: hsl(63, 19%, 61%); // $nicotine-dark + --clr-border-accent: hsl(63, 19%, 35%); + + --clr-border: #aaa; // $border-color2 + + --clr-shadow: var(--clr-border); + + --clr-link: #0066cc; + --clr-link-visited: #531a89; + --clr-heading-link-visited: #fcc; // $visited + + --font-family: sans-serif; + --font-size: 14px; + --font-family-heading: serif; // $heading-fonts +} + + +@mixin dark-theme-mixin { + color-scheme: dark; + + --clr-bg-page: hsl(0, 0%, 6%); + + --clr-bg-ui: hsl(0, 0%, 18%); + --clr-text-ui: #ddd; + + --clr-bg-theme: hsl(0, 0%, 2%); + --clr-text-theme: var(--clr-text-ui); + + --clr-bg-highlight: hsl(0, 0%, 11%); + --clr-text-highlight: #fff; + + --clr-bg-accent: hsl(200, 32%, 28%); + --clr-border-accent: hsl(200, 8%, 12%); + + --clr-border: hsl(0, 0%, 30%); + + --clr-shadow: #000; + + --clr-link: #8a8aff; + --clr-link-visited: #ffadff; + --clr-heading-link-visited: 
var(--clr-link-visited); +} + +:root[data-theme='dark'] { + @include dark-theme-mixin; +} + +// Makes theme match the user's OS preference when JS is disabled +@media (prefers-color-scheme: dark) { + :root:not([data-has-js="true"]) { + @include dark-theme-mixin; + } +} + +* { + box-sizing: border-box; +} + +a { + color: var(--clr-link); +} + +a:visited { + color: var(--clr-link-visited); +} + +input, textarea, select { + color: inherit; +} + +h1 a, h2 a { + color: var(--clr-text-theme); +} +h1 a:visited, h2 a:visited { + color: var(--clr-heading-link-visited); +} +progress { + width: 10ch; +} + +body { + background-color: var(--clr-bg-page); + color: var(--clr-text-ui); + font-family: var(--font-family); + font-size: var(--font-size); + line-height: 1.6; + margin-left: auto; + margin-right: auto; + max-width: 120ch; + padding: 0; +} + +#frontpage { + display: grid; + grid-template-columns: 1fr auto; + grid-template-rows: auto 1fr; + grid-gap: 1ch; + align-items: start; + justify-content: start; + margin-top: 1ch; + margin-bottom: 1ch; + // named grid areas + grid-template-areas: + "frontpage-about frontpage-news" + "frontpage-tips frontpage-news"; + + @media (max-device-width: 624px) { + grid-template-columns: 1fr; + grid-template-rows: auto auto auto; + grid-gap: 1ch; + align-items: start; + justify-content: start; + margin-top: 1ch; + margin-bottom: 1ch; + // named grid areas + grid-template-areas: + "frontpage-about" + "frontpage-tips" + "frontpage-news"; + + * { max-width: unset !important; min-width: unset !important; } + } + + #frontpage-news { + grid-area: frontpage-news; + max-width: 40ch; + @extend .dialog; + } + #frontpage-about { + grid-area: frontpage-about; + min-width: 40ch; + @extend .dialog; + } + #frontpage-tips { + grid-area: frontpage-tips; + min-width: 40ch; + @extend .dialog; + } +} + +#siteinfo-nav { + display: block; + width: 100%; + @extend .dialog; + padding: 0.25ch !important; + margin-top: 1.5ch; + + + ul { + list-style: none; + padding: 
0; + margin: 1ch; + + li { + display: inline; + padding: 1ch; + background-color: var(--clr-bg-highlight); + + a { + text-decoration: none; + display: inline-block; + color: var(--clr-text-highlight); + } + } + + li.current { + background-color: var(--clr-bg-theme); + a { + color: var(--clr-text-theme); + } + } + } +} + +.dialog { + border: 1px solid var(--clr-border); + box-shadow: 0 0 1ch var(--clr-shadow); + background-color: var(--clr-bg-ui); + padding: 1ch; + + h2 { + margin: 0; + font-family: sans-serif; + font-weight: normal; + padding: 0.5ch; + font-size: 12pt; + background-color: var(--clr-bg-theme); + color: var(--clr-text-theme); + } +} + +header { + background-color: var(--clr-bg-accent); + border: 1px solid var(--clr-border-accent); + color: var(--clr-text-ui); + box-shadow: 0 0 0.5ch var(--clr-shadow); + margin-bottom: 1ch; + display: flex; + align-items: center; + justify-content: space-between; + + nav { + a { + text-decoration: none; + color: var(--clr-text-ui); + padding: .5ch; + display: inline-block; + } + + a:visited { + color: var(--clr-text-ui); + } + + a.extra { + background: #ccc linear-gradient(45deg, + hsl(0, 100%, 70%) 0%, + hsl(120, 100%, 70%) 50%, + hsl(240, 100%, 70%) 100%); + color: black; + text-shadow: 0 0 0.5ch #fff; + } + + a:hover, a:focus { + background: var(--clr-bg-theme); + color: var(--clr-text-theme); + } + } +} + +#theme { + padding: .5ch; + display: none; + + [data-has-js='true'] & { + display: block; + } +} + +#complaint { + @extend .dialog; + max-width: 60ch; + margin-left: auto; + margin-right: auto; + margin-top: 2ch; + + textarea { + width: 100%; + height: 10ch; + } +} + +#siteinfo { + margin-top: 1ch; + display: flex; + gap: 1ch; + flex-grow: 0.5; + flex-shrink: 0.5; + flex-basis: 10ch 10ch; + flex-direction: row; + flex-wrap: wrap; + align-content: stretch; + align-items: stretch; + justify-content: stretch; + + #index-info, #link-info { + width: 32ch; + @extend .dialog; + } + #screenshot { + @extend .dialog; + } 
+ #screenshot img { + width: 30ch; + height: 22.5ch; + } +} + +.infobox { + h2 { + @extend .heading; + } + + background-color: var(--clr-bg-ui); + padding: 1ch; + margin: 1ch; + border: 1px solid var(--clr-border); + box-shadow: 0 0 1ch var(--clr-shadow); +} + +section.cards { + display: flex; + flex-direction: row; + flex-wrap: wrap; + padding-top: 1ch; + gap: 2ch; + justify-content: flex-start; + + .card { + background-color: var(--clr-bg-ui); + border-left: 1px solid #ecb; + border-top: 1px solid #ecb; + box-shadow: var(--clr-shadow) 0 0 5px; + + h2 { + @extend .heading; + word-break: break-word; + } + + h2 a { + display: block !important; + color: inherit; + text-decoration: none; + } + a:focus img { + filter: sepia(100%); + box-shadow: #444 0px 0px 20px; + } + a:focus:not(.nofocus) { + background-color: black; + color: white; + } + + .description { + padding-left: 1ch; + padding-right: 1ch; + overflow: auto; + -webkit-hyphens: auto; + -moz-hyphens: auto; + -ms-hyphens: auto; + hyphens: auto; + } + + img { + width: 28ch; + height: auto; + } + + .info { + padding-left: 1ch; + padding-right: 1ch; + line-height: 1.6; + } + + [data-theme='dark'] & { + border: 1px solid var(--clr-border); + } + } +} + +.positions { + box-shadow: 0 0 2px var(--clr-shadow); + backdrop-filter: brightness(90%); + color: var(--clr-text-highlight); + padding: 2px; + margin-right: -1ch; + margin-left: 1ch; +} + + +footer { + clear: both; + + padding: 2ch; + margin: 16ch 0 0 0; + + font-size: 12pt; + display: flex; + flex-direction: row; + flex-wrap: wrap; + justify-content: flex-start; + + h1 { + font-weight: normal; + border-bottom: 4px solid var(--clr-bg-theme); + } + + h2 { + font-size: 14pt; + font-weight: normal; + border-bottom: 2px solid var(--clr-bg-theme); + width: 80%; + } + + section { + line-height: 1.5; + flex-basis: 40ch; + flex-grow: 1.1; + + background-color: var(--clr-bg-ui); + border-left: 1px solid var(--clr-border); + box-shadow: -1px -1px 5px var(--clr-shadow); + + 
padding-left: 1ch; + padding-right: 1ch; + margin-left: 1ch; + padding-bottom: 1ch; + margin-bottom: 1ch; + } +} + +#mcfeast, #menu-close { + display: none; +} + +.shadowbox { + box-shadow: 0 0 1ch var(--clr-shadow); + border: 1px solid var(--clr-border); +} + +.heading { + margin: 0; + padding: 0.5ch; + background-color: var(--clr-bg-theme); + border-bottom: 1px solid var(--clr-border); + font-family: var(--font-family-heading); + font-weight: normal; + color: var(--clr-text-theme); + font-size: 12pt; + word-break: break-word; +} + + +.sidebar-narrow { + display: grid; + grid-template-columns: auto max-content; + grid-gap: 1ch; + align-items: start; +} + +#crosstalk-view { + display: grid; + grid-template-columns: 1fr 1fr; + grid-template-rows: auto 1fr; + grid-gap: 1ch; + align-content: start; + justify-content: start; + align-items: start; +} + +#similar-view { + display: grid; + grid-template-columns: 1fr 1fr; + grid-template-rows: auto 1fr; + grid-gap: 1ch; + align-content: start; + justify-content: start; + align-items: start; + table { + th { + text-align: left; + } + } + .screenshot { + width: 100%; + height: auto; + } +} + +#similar-view[data-layout="lopsided"] { + #similar-info { + @extend .dialog; + grid-column: 1; + grid-row: 1 / span 2; + } + #similar-domains { + @extend .dialog; + grid-column: 2; + grid-row: 1; + } + #similar-links { + @extend .dialog; + grid-row: 2; + grid-column: 2; + } + +} + +#similar-view[data-layout="balanced"] { + #similar-info { + @extend .dialog; + } + #similar-domains { + grid-row: span 2; + @extend .dialog; + } + #similar-links { + @extend .dialog; + } +} + +@media (max-device-width: 900px) { + #similar-view, #crosstalk-view { + display: block; + * { + margin-bottom: 1ch; + } + } +} + +@media (max-device-width: 840px) { + section.cards { + display: block; + .card { + margin-bottom: 2ch; + img { + width: 100% !important; + height: auto; + } + } + } +} + +#search-box { + @extend .shadowbox; + + padding: 0.5ch; + 
background-color: var(--clr-bg-ui); + display: grid; + grid-template-columns: max-content 0 auto max-content; + grid-gap: 0.5ch; + grid-auto-rows: minmax(1ch, auto); + width: 100%; + + h1 { + margin: 0; + padding: 0.5ch; + font-size: 14pt; + word-break: keep-all; + background-color: var(--clr-bg-theme); + color: var(--clr-text-theme); + font-family: var(--font-family-heading); + font-weight: normal; + text-align: center; + display: flex; + justify-content: space-between; + } + + #suggestions-anchor { + margin: -0.5ch; // We need this anchor for the typeahead suggestions, but we don't want it to affect the layout + padding: 0; + } + + input[type="text"] { + font-family: monospace; + font-size: 12pt; + padding: 0.5ch; + border: 1px solid var(--clr-border); + background-color: inherit; + } + + input[type="submit"] { + font-size: 12pt; + border: 1px solid var(--clr-border); + background-color: var(--clr-bg-ui); + cursor: pointer; + } + + // white suggestions look fine in dark mode + .suggestions { + background-color: #fff; + padding: .5ch; + margin-top: 5.5ch; + margin-left: 1ch; + position: absolute; + display: inline-block; + width: 300px; + border-left: 1px solid #ccc; + border-top: 1px solid #ccc; + box-shadow: 5px 5px 5px var(--clr-shadow); + z-index: 10; + + a { + display: block; + color: #000; + font-size: 12pt; + font-family: 'fixedsys', monospace, serif; + text-decoration: none; + outline: none; + } + + a:focus { + display: block; + background-color: #000; + color: #eee; + } + } + +} + +.filter-toggle-on { + a:before { + content: '✓'; + margin-right: 1.5ch; + } +} +.filter-toggle-off { + a:before { + content: '✗'; + margin-right: 1.5ch; + } +} + +#filters { + @extend .shadowbox; + margin-top: 1ch; + background-color: var(--clr-bg-ui); + + h2 { + @extend .heading; + background-color: var(--clr-bg-theme); + } + h3 { + @extend .heading; + background-color: var(--clr-bg-highlight); + color: var(--clr-text-highlight); + font-family: sans-serif; + border-bottom: 
1px solid #000; + } + + hr { + border-top: 0.5px solid var(--clr-border); + border-bottom: none; + } + ul { + list-style-type: none; + padding-left: 0; + + li { + padding: 1ch; + a { + color: inherit; + text-decoration: none; + } + a:hover, a:focus { + border-bottom: 1px solid var(--clr-bg-theme); + } + } + + li.current { + border-left: 4px solid var(--clr-bg-theme); + background-color: var(--clr-bg-highlight); + a { + margin-left: -4px; + } + } + } +} + +.search-result { + @extend .shadowbox; + margin: 1ch 0 2ch 0; + + .url { + background-color: var(--clr-bg-theme); + padding-left: 0.5ch; + + a { + word-break: break-all; + font-family: monospace; + font-size: 8pt; + color: var(--clr-text-theme); + text-shadow: 0 0 1ch #000; // guarantee decent contrast across background colors + } + a:visited { + color: var(--clr-heading-link-visited); + } + } + + h2 { + a { + word-break: break-all; + color: var(--clr-text-ui); + text-decoration: none; + } + font-size: 12pt; + @extend .heading; + background-color:var(--clr-bg-highlight); + } + + .description { + background-color: var(--clr-bg-ui); + word-break: break-word; + padding: 1ch; + margin: 0; + } + + ul.additional-results { + background-color: var(--clr-bg-ui); + padding: 1ch; + list-style: none; + margin: 0; + a { + color: inherit; + } + } +} + +.search-result[data-ms-rank="1"] { .url, h2 { filter: grayscale(0%); } } +.search-result[data-ms-rank="2"] { .url, h2 { filter: grayscale(5%); } } +.search-result[data-ms-rank="3"] { .url, h2 { filter: grayscale(15%); } } +.search-result[data-ms-rank="4"] { .url, h2 { filter: grayscale(20%); } } +.search-result[data-ms-rank="5"] { .url, h2 { filter: grayscale(30%); } } +.search-result[data-ms-rank="10"] { .url, h2 { filter: grayscale(60%); } } + +.utils { + display: flex; + font-size: 10pt; + padding: 1ch; + background-color: var(--clr-bg-highlight); + + > * { + margin-right: 1ch; + margin-left: 1ch; + } + .meta { + flex-grow: 2; + text-align: right; + } + .meta > * { + 
padding-left: 4px; + } + a { + color: var(--clr-text-highlight); + } +} + +@media (max-device-width: 624px) { + [data-has-js="true"] body { // This property is set via js so we can selectively enable these changes only if JS is enabled; + // This is desirable since mobile navigation is JS-driven. If JS is disabled, having a squished + // GUI is better than having no working UI. + margin: 0 !important; + padding: 0 0 0 0 !important; + max-width: 100%; + + #suggestions-anchor { display: none; } // suggestions are not useful on mobile + + .sidebar-narrow { + display: block; // fix for bizarre chrome rendering issue + } + + #mcfeast { + display: inline; + float: right; + width: 2rem; + font-size: 1rem; + } + + #menu-close { + float: right; + display: inline; + } + + #filters { + display: none; + position: absolute; + top: 0; + left: 0; + width: 100%; + margin: 0; + padding: 0; + z-index: 100; + } + + .sidebar-narrow { + grid-template-columns: auto; + } + + #search-box { + grid-template-columns: auto; + } + + #filters { + margin-top: 0; + } + + .search-result { + margin-left: 0; + margin-right: 0; + } + } +} + +.page-link { + padding-top: 0.25ch; + padding-bottom: 0.25ch; + padding-left: 0.5ch; + padding-right: 0.5ch; + margin-right: 0.5ch; + + font-size: 12pt; + border: 1px solid var(--clr-border); + background-color: var(--clr-bg-highlight); + color: var(--clr-text-ui) !important; + text-decoration: none; +} + +.page-link.active { + border: 1px solid var(--clr-text-ui); + background-color: var(--clr-bg-ui); +} + +// The search results page is very confusing on text-based browsers, so we add a hr to separate the search results. This is +// hidden on modern browsers via CSS. + +hr.w3m-helper { display: none; } + +// This is a screenreader-only class that hides content from visual browsers, but allows screenreaders and +// text-based browsers to access it. 
+ +.screenreader-only { + position:absolute; + left:-10000px; + top:auto; + width:1px; + height:1px; + overflow:hidden; +} diff --git a/code/services-application/search-service-legacy/resources/static/search/theme.js b/code/services-application/search-service-legacy/resources/static/search/theme.js new file mode 100644 index 00000000..73fdcd26 --- /dev/null +++ b/code/services-application/search-service-legacy/resources/static/search/theme.js @@ -0,0 +1,57 @@ +function getTheme() { + const theme = window.localStorage.getItem('theme'); + + // if a valid theme is set in localStorage, return it + if (theme === 'dark' || theme === 'light') { + return { value: theme, system: false }; + } + + // if matchMedia is supported and OS theme is dark + if (window.matchMedia('(prefers-color-scheme: dark)').matches) { + return { value: 'dark', system: true }; + } + + return { value: 'light', system: true }; +} + +function setTheme(value) { + if (value === 'dark' || value === 'light') { + window.localStorage.setItem('theme', value); + } else { + window.localStorage.removeItem('theme'); + } + + const theme = getTheme(); + + document.documentElement.setAttribute('data-theme', theme.value); +} + +function initializeTheme() { + const themeSelect = document.getElementById('theme-select'); + + const theme = getTheme(); + + document.documentElement.setAttribute('data-theme', theme.value); + + // system is selected by default in the themeSwitcher so ignore it here + if (!theme.system) { + themeSelect.value = theme.value; + } + + themeSelect.addEventListener('change', e => { + setTheme(e.target.value); + }); + + const mql = window.matchMedia('(prefers-color-scheme: dark)'); + + // if someone changes their theme at the OS level we need to update + // their theme immediately if they're using their OS theme + mql.addEventListener('change', e => { + if (themeSelect.value !== 'system') return; + + if (e.matches) setTheme('dark'); + else setTheme('light'); + }); +} + +initializeTheme(); \ No 
newline at end of file diff --git a/code/services-application/search-service-legacy/resources/static/search/tts.js b/code/services-application/search-service-legacy/resources/static/search/tts.js new file mode 100644 index 00000000..20ee9f37 --- /dev/null +++ b/code/services-application/search-service-legacy/resources/static/search/tts.js @@ -0,0 +1,112 @@ + +function setupTypeahead() { + const query = document.getElementById('query'); + query.setAttribute('autocomplete', 'off'); + const queryBox = document.getElementById('suggestions-anchor'); + let timer = null; + + function fetchSuggestions(e) { + if (timer != null) { + clearTimeout(timer); + } + timer = setTimeout(() => { + const req = new XMLHttpRequest(); + + req.onload = rsp => { + let items = JSON.parse(req.responseText); + + const old = document.getElementById('suggestions'); + if (old != null) old.remove(); + + + if (items.length === 0) return; + + const suggestions = document.createElement('div'); + suggestions.setAttribute('id', 'suggestions'); + suggestions.setAttribute('class', 'suggestions'); + + for (i=0;i { + if (e.key === "ArrowDown") { + if (e.target.nextElementSibling != null) { + e.target.nextElementSibling.focus(); + } + + e.preventDefault() + } + else if (e.key === "ArrowUp") { + if (e.target.previousElementSibling != null) { + e.target.previousElementSibling.focus(); + } + else { + query.focus(); + } + e.preventDefault() + } + else if (e.key === "Escape") { + var suggestions = document.getElementById('suggestions'); + if (suggestions != null) { + suggestions.remove(); + } + query.focus(); + e.preventDefault(); + } + }); + item.addEventListener('keypress', e=> { + if (e.key === "Enter") { + suggestionClickHandler(e); + } + }); + suggestions.appendChild(item); + } + queryBox.prepend(suggestions); + } + + req.open("GET", "/suggest/?partial="+encodeURIComponent(query.value)); + req.send(); + }, 250); + } + query.addEventListener("input", fetchSuggestions); + query.addEventListener("click", e=> { 
+ const suggestions = document.getElementById('suggestions'); + if (suggestions != null) { + suggestions.remove(); + } + }); + query.addEventListener("keydown", e => { + if (e.key === "ArrowDown") { + const suggestions = document.getElementById('suggestions'); + if (suggestions != null) { + suggestions.childNodes[0].focus(); + } + else { + fetchSuggestions(e); + } + e.preventDefault() + } + else if (e.key === "Escape") { + const suggestions = document.getElementById('suggestions'); + if (suggestions != null) { + suggestions.remove(); + } + query.focus(); + e.preventDefault(); + } + }); +} + +if(!window.matchMedia("(pointer: coarse)").matches) { + setupTypeahead(); +} diff --git a/code/services-application/search-service-legacy/resources/templates/search/browse-result.hdb b/code/services-application/search-service-legacy/resources/templates/search/browse-result.hdb new file mode 100644 index 00000000..fa7a06f1 --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/browse-result.hdb @@ -0,0 +1,12 @@ +
+

{{displayDomain}}

+ + + {{displayDomain}} screenshot + + + +
\ No newline at end of file diff --git a/code/services-application/search-service-legacy/resources/templates/search/browse-results.hdb b/code/services-application/search-service-legacy/resources/templates/search/browse-results.hdb new file mode 100644 index 00000000..0c674c10 --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/browse-results.hdb @@ -0,0 +1,34 @@ + + + + + Marginalia Search - {{query}} + + + + + + + + + +{{>search/parts/search-header}} +{{>search/parts/search-form}} + +
+{{#if focusDomain}} + Showing domains similar to {{focusDomain}}. +{{/if}} +{{#unless focusDomain}} +This list of domains is random. Refresh to get +new domains, or click Similar Domains to +take the helm. +{{/unless}} +
+ +
+{{#each results}}{{>search/browse-result}}{{/each}} +
+ +{{>search/parts/search-footer}} + diff --git a/code/services-application/search-service-legacy/resources/templates/search/conversion-results.hdb b/code/services-application/search-service-legacy/resources/templates/search/conversion-results.hdb new file mode 100644 index 00000000..6c58b4a4 --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/conversion-results.hdb @@ -0,0 +1,23 @@ + + + + + Marginalia Search - {{query}} + + + + + + + + +{{>search/parts/search-header}} +{{>search/parts/search-form}} + +
+ {{query}} = {{result}} +
+ + +{{>search/parts/search-footer}} + \ No newline at end of file diff --git a/code/services-application/search-service-legacy/resources/templates/search/dictionary-results.hdb b/code/services-application/search-service-legacy/resources/templates/search/dictionary-results.hdb new file mode 100644 index 00000000..12a25be9 --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/dictionary-results.hdb @@ -0,0 +1,40 @@ + + + + + Marginalia Search - {{query}} + + + + + + + + +{{>search/parts/search-header}} +{{>search/parts/search-form}} + +
+{{#unless entries}} +No definitions were found for that word +{{/unless}} + +{{#if entries}} +
    +{{#each entries}} +
  • {{word}}, {{type}}: {{definition}}
  • +{{/each}} +
+{{/if}} +
+ +{{#if entries}} +
+

Legal

+This data is derived from wiktionary, +available under GFDL and CC BY-SA 3.0. More Information. +
+{{/if}} + +{{>search/parts/search-footer}} + diff --git a/code/services-application/search-service-legacy/resources/templates/search/error-page-search.hdb b/code/services-application/search-service-legacy/resources/templates/search/error-page-search.hdb new file mode 100644 index 00000000..a84ef2ab --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/error-page-search.hdb @@ -0,0 +1,24 @@ + + + + + Marginalia Search - {{title}} + + + + + + + + + +{{>search/parts/search-header}} +{{>search/parts/search-form}} + +
+

{{ title }}

+
{{{message}}}
+
+ +{{>search/parts/search-footer}} + diff --git a/code/services-application/search-service-legacy/resources/templates/search/error-page.hdb b/code/services-application/search-service-legacy/resources/templates/search/error-page.hdb new file mode 100644 index 00000000..fbef861e --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/error-page.hdb @@ -0,0 +1,20 @@ + + + Error + + + + +
+

Error

+

Oops! It appears the index server is {{indexState}}.

+

The server was probably restarted to bring online some changes. Restarting the index typically takes + a few minutes, during which searches can't be served.

+ +

In the event of a longer outage, the @marginalianu feed + on Twitter may have details; otherwise, you can always send me an email at kontakt@marginalia.nu.

+ +

This page will attempt to refresh automatically every few seconds.

+
+ + \ No newline at end of file diff --git a/code/services-application/search-service-legacy/resources/templates/search/index/index-about.hdb b/code/services-application/search-service-legacy/resources/templates/search/index/index-about.hdb new file mode 100644 index 00000000..7328d2ee --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/index/index-about.hdb @@ -0,0 +1,22 @@ +
+

About

+
+

This is an independent DIY search engine that focuses on non-commercial content, and attempts to + show you sites you perhaps weren't aware of in favor of the sort of sites you probably already knew + existed.

+

+ The software for this search engine is all custom-built, and all crawling and indexing is + done in-house. The project is open source. Feel free to poke about in the source code or contribute + to the development! +

+

+ The search engine is currently serving about {{searchPerMinute}} queries/minute. +

+

Consider supporting the + project!

+
+
+ Read More +
+
diff --git a/code/services-application/search-service-legacy/resources/templates/search/index/index-news.hdb b/code/services-application/search-service-legacy/resources/templates/search/index/index-news.hdb new file mode 100644 index 00000000..286c4451 --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/index/index-news.hdb @@ -0,0 +1,17 @@ + +{{#if news}} +
+

Publicity, Discussion and Events

+
+
+ {{#each news}} +
{{title}}
+
{{date}} {{source}}
+ {{/each}} +
+
+ +
+{{/if}} \ No newline at end of file diff --git a/code/services-application/search-service-legacy/resources/templates/search/index/index-redesign.hdb b/code/services-application/search-service-legacy/resources/templates/search/index/index-redesign.hdb new file mode 100644 index 00000000..04b688c1 --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/index/index-redesign.hdb @@ -0,0 +1,14 @@ +
+

Public Beta Available

+
+

+ A redesigned version of the search engine UI is available for beta testing. + Feel free to give it a spin, feedback is welcome! + The old one will also keep being available if you hate it, + or have compatibility issues. +

+

+ Try it out! +

+
+
\ No newline at end of file diff --git a/code/services-application/search-service-legacy/resources/templates/search/index/index-tips.hdb b/code/services-application/search-service-legacy/resources/templates/search/index/index-tips.hdb new file mode 100644 index 00000000..c50273d0 --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/index/index-tips.hdb @@ -0,0 +1,21 @@ +
+

Tips

+
+

+ This search engine isn't particularly well equipped to answer queries + posed like questions; instead, try to imagine some text that might appear + in the website you are looking for, and search for that.

+

+ Where this search engine really shines is finding small, old and obscure websites about some + given topic, perhaps + old video games, + a mystery, + theology, + the occult, + knitting, + computer science, + or art. +

+ +
+
\ No newline at end of file diff --git a/code/services-application/search-service-legacy/resources/templates/search/index/index.hdb b/code/services-application/search-service-legacy/resources/templates/search/index/index.hdb new file mode 100644 index 00000000..6fb98ef6 --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/index/index.hdb @@ -0,0 +1,31 @@ + + + + + Marginalia Search + + + + + + + + + + + + + + + + +{{>search/parts/search-header}} +{{>search/parts/search-form}} +
+{{>search/index/index-news}} +{{>search/index/index-about}} +{{>search/index/index-redesign}} +
+ +{{>search/parts/search-footer}} + diff --git a/code/services-application/search-service-legacy/resources/templates/search/parts/search-filters.hdb b/code/services-application/search-service-legacy/resources/templates/search/parts/search-filters.hdb new file mode 100644 index 00000000..efb020cf --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/parts/search-filters.hdb @@ -0,0 +1,46 @@ +

Filters

+
    + {{#with removeJsOption}} + + {{/with}} + {{#with reduceAdtechOption}} + + {{/with}} + {{#with showRecentOption}} + + {{/with}} + {{#with searchTitleOption}} + + {{/with}} +
+

Domains

+
    + {{#each filterGroups}} + {{#each .}} +
  • {{displayName}}
  • + {{/each}} +
    + {{/each}} +
+ + \ No newline at end of file diff --git a/code/services-application/search-service-legacy/resources/templates/search/parts/search-footer.hdb b/code/services-application/search-service-legacy/resources/templates/search/parts/search-footer.hdb new file mode 100644 index 00000000..747e7dd0 --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/parts/search-footer.hdb @@ -0,0 +1,124 @@ +
+
+

Syntax

+ This is a keyword-based search engine. When entering multiple search terms, the search engine will + attempt to match them against documents where the terms occur in close proximity.

+ + Search terms can be excluded with a hyphen.

+ + While the search engine at present does not allow full text search, quotes can be used to + specifically search for names or terms in the title. Using quotes will also cause the search engine + to be as literal as possible in interpreting the query.

+ + Parentheses can be used to add terms to the query without giving weight to the terms when ranking + the search results.

+ +

Samples

+
+
soup -chicken
+
Look for keywords that contain soup, but not + chicken.
+
"keyboard"
+
Look for pages containing the exact word + keyboard, not keyboards or the like.
+
"steve mcqueen"
+
Look for pages containing the exact words steve mcqueen + in that order, with no words in between.
+
apology (plato)
+
Look for pages containing apology and plato, but only rank them + based on their relevance to apology
+
+
+
+

Special Keywords

+ Several special keywords are supported by the search engine. +

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
KeywordMeaning
site:example.comDisplay site information about example.com
site:example.com keywordSearch example.com for keyword
browse:example.comShow similar websites to example.com
ip:127.0.0.1Search documents hosted at 127.0.0.1
links:example.comSearch documents linking to example.com
tld:edu keywordSearch documents with the top level domain edu.
?tld:edu keywordPrefer but do not require results with the top level domain edu. + This syntax is also possible for links:..., ip:... and site:...
q>5The amount of javascript and modern features is at least 5 (on a scale 0 to 25)
q<5The amount of javascript and modern features is at most 5 (on a scale 0 to 25)
year>2005(beta) The document was ostensibly published in or after 2005
year=2005(beta) The document was ostensibly published in 2005
year<2005(beta) The document was ostensibly published in or before 2005
rank>50The ranking of the website is at least 50 in a span of 1 - 255
rank<50The ranking of the website is at most 50 in a span of 1 - 255
count>10 The search term must appear in at least 10 results from the domain
count<10 The search term must appear in at most 10 results from the domain
format:html5Filter documents using the HTML5 standard. This is typically modern websites.
format:xhtmlFilter documents using the XHTML standard
format:html123Filter documents using the HTML standards 1, 2, and 3. This is typically very old websites.
generator:wordpressFilter documents with the specified generator, in this case wordpress
file:zipFilter documents containing a link to a zip file (most file-endings work)
file:audioFilter documents containing a link to an audio file
file:videoFilter documents containing a link to a video file
file:archiveFilter documents containing a link to a compressed archive
file:documentFilter documents containing a link to a document
-special:mediaFilter out documents with audio or video tags
-special:scriptsFilter out documents with javascript
-special:affiliateFilter out documents with likely Amazon affiliate links
-special:trackingFilter out documents with analytics or tracking code
-special:cookiesFilter out documents with cookies
+

+
+

Results Legend

+

+ The estimated relevance of the search result is indicated using the color saturation + of the color of the search result, as well as the order the results are presented. +

+

+ Information about the position of the match is indicated using a dot matrix + in the bottom bar of each search result. Each dot represents four sentences, + and the dots are presented in top-to-bottom, left-to-right order. + +

⣿⠃⠀⠀   — The terms occur heavily toward the beginning of the document. +

⠠⠀⡄⠁   — The terms occur sparsely throughout the document. +

⠀⠁⠀⠀   — The terms occur only in a single sentence. +

+

Potential problems with the document are presented with a warning triangle, e.g. ⚠ 3. + Desktop users can mouse-over this to get a detailed breakdown. +

+ + +
+ + diff --git a/code/services-application/search-service-legacy/resources/templates/search/parts/search-form.hdb b/code/services-application/search-service-legacy/resources/templates/search/parts/search-form.hdb new file mode 100644 index 00000000..82b525e5 --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/parts/search-form.hdb @@ -0,0 +1,18 @@ +
+ +
+ + \ No newline at end of file diff --git a/code/services-application/search-service-legacy/resources/templates/search/parts/search-header.hdb b/code/services-application/search-service-legacy/resources/templates/search/parts/search-header.hdb new file mode 100644 index 00000000..805ea8a9 --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/parts/search-header.hdb @@ -0,0 +1,21 @@ + +
+ +
+ + +
+
+ + + \ No newline at end of file diff --git a/code/services-application/search-service-legacy/resources/templates/search/parts/search-result-rest.hdb b/code/services-application/search-service-legacy/resources/templates/search/parts/search-result-rest.hdb new file mode 100644 index 00000000..d0dd1a4e --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/parts/search-result-rest.hdb @@ -0,0 +1,32 @@ + +
+{{#with first}} + +

{{title}}

+

{{description}}

+ +{{/with}} +
+ Also from {{first.url.domain}} +
+ +{{#with first}} +
+ Info + {{resultsFromSameDomain}}+ +
+ {{#each problems}} + {{name}} + {{/each}} + +
+
+{{/with}} +
+ +
\ No newline at end of file diff --git a/code/services-application/search-service-legacy/resources/templates/search/parts/search-result.hdb b/code/services-application/search-service-legacy/resources/templates/search/parts/search-result.hdb new file mode 100644 index 00000000..e448fcdb --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/parts/search-result.hdb @@ -0,0 +1,22 @@ + +
+ + +

{{title}}

+

{{description}}

+ +
+ {{#unless focusDomain}} + Info + {{#if hasMoreResults}}{{resultsFromSameDomain}}+{{/if}}{{/unless}} +
+ {{#each problems}} + {{name}} + {{/each}} + +
Terms appear in {{positionsCount}} positions
+
+
+
+
\ No newline at end of file diff --git a/code/services-application/search-service-legacy/resources/templates/search/search-results.hdb b/code/services-application/search-service-legacy/resources/templates/search/search-results.hdb new file mode 100644 index 00000000..fd4ce717 --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/search-results.hdb @@ -0,0 +1,75 @@ + + + + + Marginalia Search - {{query}} + + + + + + + + + +{{#if newFilter}}
Search Filters Updated
{{/if}} + + + +{{>search/parts/search-header}} +{{>search/parts/search-form}} + + + + + +{{>search/parts/search-footer}} + + \ No newline at end of file diff --git a/code/services-application/search-service-legacy/resources/templates/search/site-info/site-crosstalk.hdb b/code/services-application/search-service-legacy/resources/templates/search/site-info/site-crosstalk.hdb new file mode 100644 index 00000000..8d70b42d --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/site-info/site-crosstalk.hdb @@ -0,0 +1,40 @@ + + + + + Marginalia Search - {{domainA}} and {{domainB}} + + + + + + + + +{{>search/parts/search-header}} +{{>search/parts/search-form}} + + + +
+ Showing results containing links between {{domainA}} and {{domainB}}. +
+{{#each tests}}{{.}}{{/each}} +
+
+ {{#each forward}} + {{>search/parts/search-result}} + {{/each}} +
+
+ {{#each backward}} + {{>search/parts/search-result}} + {{/each}} +
+
+ + +{{>search/parts/search-footer}} + + + diff --git a/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-feed.hdb b/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-feed.hdb new file mode 100644 index 00000000..c3e52d81 --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-feed.hdb @@ -0,0 +1,22 @@ +{{#if feed.items}} +{{#with feed}} +

Feed

+ +
+ {{#each items}} +
{{title}}
+
{{pubDay}}
{{{descriptionSafe}}}
+ {{/each}} +
+{{/with}} +{{/if}} + +{{#unless feed.items}}{{#if samples}} +

Sample

+
+{{#each samples}} +
{{title}}
+
{{description}}
+{{/each}} +
+{{/if}}{{/unless}} \ No newline at end of file diff --git a/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-index-blacklisted.hdb b/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-index-blacklisted.hdb new file mode 100644 index 00000000..4fc787de --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-index-blacklisted.hdb @@ -0,0 +1,8 @@ +

This website is blacklisted. This excludes it from crawling and indexing.

+ +

This is usually because of some form of misbehavior on the webmaster's end. + Either annoying search engine spam, or tasteless, bad-faith content.

+ +

Occasionally this is done hastily and in error. If you would like the decision + reviewed, you may use the report form to file an appeal. +

\ No newline at end of file diff --git a/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-index-indexed.hdb b/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-index-indexed.hdb new file mode 100644 index 00000000..d138452b --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-index-indexed.hdb @@ -0,0 +1,13 @@ +
+ Index + State: {{state}}
+ Domain ID: {{domainId}}
+ Node Affinity: {{nodeAffinity}}
+ Pages Known: {{pagesKnown}}
+ Pages Crawled: {{pagesFetched}}
+ Pages Indexed: {{pagesIndexed}}
+

+ IP: {{ip}} {{#if ipCountry}}{{getIpFlag}}{{/if}}
+ AS: {{#if asn}}{{asn}} {{asnOrg}} {{asnCountry}}{{/if}}
+
+
\ No newline at end of file diff --git a/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-index-suggest.hdb b/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-index-suggest.hdb new file mode 100644 index 00000000..f8ee2119 --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-index-suggest.hdb @@ -0,0 +1,12 @@ +
+
+ Crawling + This website is not queued for crawling. If you would like it to be crawled, + use the checkbox and button below.

+ + +
+
+ +

+
\ No newline at end of file diff --git a/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-index-unknown.hdb b/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-index-unknown.hdb new file mode 100644 index 00000000..00b4d279 --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-index-unknown.hdb @@ -0,0 +1,9 @@ +
+ Crawling + This website is not known to the search engine. + + To submit the website for crawling, follow these instructions. +
\ No newline at end of file diff --git a/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-index.hdb b/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-index.hdb new file mode 100644 index 00000000..43ed2450 --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-index.hdb @@ -0,0 +1,23 @@ +

Indexing Information

+{{#if domainState.blacklisted}} + {{>search/site-info/site-info-index-blacklisted}} +{{/if}} + +{{#if domainState.unknownDomain}} + {{>search/site-info/site-info-index-unknown}} +{{/if}} + +{{#if domainState.inCrawlQueue}} +

+This website is in the queue for crawling. +It may take up to a month before it is indexed. +

+{{/if}} + +{{#if domainState.suggestForCrawling}} + {{>search/site-info/site-info-index-suggest}} +{{/if}} + +{{#if domainState.indexed}} + {{>search/site-info/site-info-index-indexed}} +{{/if}} \ No newline at end of file diff --git a/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-links.hdb b/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-links.hdb new file mode 100644 index 00000000..fa869930 --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-links.hdb @@ -0,0 +1,7 @@ +

Links

+
+ Link Graph + Ranking: {{ranking}}%
+ Incoming Links: {{incomingLinks}}
+ Outbound Links: {{outboundLinks}}
+
\ No newline at end of file diff --git a/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-report.hdb b/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-report.hdb new file mode 100644 index 00000000..2b888a3e --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-report.hdb @@ -0,0 +1,60 @@ +
+ {{#if submitted}} +

Your complaint against {{domain}} has been submitted

+

The review process is manual and may take a while. If urgent action is necessary, + reach me at kontakt@marginalia.nu! +

+ {{/if}} + + {{#unless submitted}} +

Flag {{domain}} for review

+

+ Note, this is not intended to police acceptable thoughts or ideas. +

+ That said, offensive content in obvious bad faith is not tolerated, especially when designed + to crop up when you didn't go looking for it. How and where it is said is more + important than what is said. +

+ This form can also be used to appeal unfairly blacklisted sites. +

+ +

+
+ Flag for Review + +
+ +
+
+
+
+
+
+
+
+
+ +
+
+

+ Communicating through forms and tables is a bit impersonal, + you may also reach a human being through email at kontakt@marginalia.nu. + {{/unless}} + + {{#if complaints}} +


+

Complaints against {{domain}}

+ + + {{#each complaints}} + + + + + + {{/each}} +
CategorySubmittedReviewed
{{category}}{{submitTime}}{{#if reviewed}}✓{{/if}}
+ {{/if}} +
\ No newline at end of file diff --git a/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-summary.hdb b/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-summary.hdb new file mode 100644 index 00000000..d913a0a6 --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info-summary.hdb @@ -0,0 +1,124 @@ +
+ A visual exploration mode is also available. +
+ + +
+
+

🌎 {{domain}}

+ + + {{#if hasScreenshot}} + + Screenshot of {{domain}} + + {{/if}} + + {{#unless hasScreenshot}} +

Screenshot not yet available.

+ {{/unless}} + + {{#with domainInformation}} + {{> search/site-info/site-info-feed}} + {{> search/site-info/site-info-index}} + {{> search/site-info/site-info-links}} + {{/with}} +
+ + {{#if linking}} + + {{/if}} + + + {{#if similar}} +
+

Similar Domains

+ + + + + + + + + {{#each similar}} + + + + + + + + + {{/each}} +
MetaRankDomainSimilarity
+ {{#if indexed}} + {{#if active}} + 👀 + {{/if}} + {{#unless active}} + 🔥 + {{/unless}} + {{/if}} + + {{#if screenshot}}📷{{/if}} + + {{#if linkType.isLinked}} + {{{linkType}}} + {{/if}} + + {{{rankSymbols}}} + + {{url.domain}} + {{relatedness}}
+ +
+

Note: Because two domains are considered similar does not always mean they're in + cahoots. Similarity is a measure of how often they appear in the same contexts, + which may be an association like peas and carrots, but some pairings are also defined by their + contrasting opposition, like Sparta and Athens.

+
+ {{/if}} + +
\ No newline at end of file diff --git a/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info.hdb b/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info.hdb new file mode 100644 index 00000000..5a8a3e89 --- /dev/null +++ b/code/services-application/search-service-legacy/resources/templates/search/site-info/site-info.hdb @@ -0,0 +1,58 @@ + + + + + Marginalia Search - {{domain}} + + + + + + + + +{{>search/parts/search-header}} + +{{>search/parts/search-form}} + + +{{#with view}} + +{{/with}} + +{{#if view.links}} +
+ Showing search results with links to {{domain}}. +
+ {{#each results}}{{>search/parts/search-result}}{{/each}} +{{/if}} + +{{#if view.docs}} +
+ Showing documents found in {{domain}}. +
+ + {{#each results}}{{>search/parts/search-result}}{{/each}} +{{/if}} + +{{#if view.report}} + {{>search/site-info/site-info-report}} +{{/if}} + +{{#if view.info}} + {{>search/site-info/site-info-summary}} +{{/if}} + +{{>search/parts/search-footer}} + + + diff --git a/code/services-application/search-service-legacy/test/nu/marginalia/search/command/commands/BangCommandTest.java b/code/services-application/search-service-legacy/test/nu/marginalia/search/command/commands/BangCommandTest.java new file mode 100644 index 00000000..d8d91654 --- /dev/null +++ b/code/services-application/search-service-legacy/test/nu/marginalia/search/command/commands/BangCommandTest.java @@ -0,0 +1,52 @@ +package nu.marginalia.search.command.commands; + +import nu.marginalia.search.command.SearchParameters; +import nu.marginalia.search.exceptions.RedirectException; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class BangCommandTest { + public BangCommand bangCommand = new BangCommand(); + + @Test + public void testG() { + try { + bangCommand.process(null, + new SearchParameters(" !g test", + null, null, null, null, null, false, 1) + ); + Assertions.fail("Should have thrown RedirectException"); + } + catch (RedirectException ex) { + assertEquals("https://www.google.com/search?q=test", ex.newUrl); + } + } + + @Test + public void testMatchPattern() { + var match = bangCommand.matchBangPattern("!g test", "!g"); + + assertTrue(match.isPresent()); + assertEquals(match.get(), "test"); + } + + @Test + public void testMatchPattern2() { + var match = bangCommand.matchBangPattern("test !g", "!g"); + + assertTrue(match.isPresent()); + assertEquals(match.get(), "test"); + } + + @Test + public void testMatchPattern3() { + var match = bangCommand.matchBangPattern("hello !g world", "!g"); + + assertTrue(match.isPresent()); + 
assertEquals(match.get(), "hello world"); + } + +} \ No newline at end of file diff --git a/code/services-application/search-service-legacy/test/nu/marginalia/search/paperdoll/SearchServicePaperDoll.java b/code/services-application/search-service-legacy/test/nu/marginalia/search/paperdoll/SearchServicePaperDoll.java new file mode 100644 index 00000000..a85d2dec --- /dev/null +++ b/code/services-application/search-service-legacy/test/nu/marginalia/search/paperdoll/SearchServicePaperDoll.java @@ -0,0 +1,359 @@ +package nu.marginalia.search.paperdoll; + +import com.google.gson.Gson; +import com.google.inject.AbstractModule; +import com.google.inject.Guice; +import com.zaxxer.hikari.HikariConfig; +import com.zaxxer.hikari.HikariDataSource; +import nu.marginalia.api.domains.DomainInfoClient; +import nu.marginalia.api.domains.model.DomainInformation; +import nu.marginalia.api.domains.model.SimilarDomain; +import nu.marginalia.api.searchquery.QueryClient; +import nu.marginalia.api.searchquery.model.query.QueryResponse; +import nu.marginalia.api.searchquery.model.query.SearchQuery; +import nu.marginalia.api.searchquery.model.query.SearchSpecification; +import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; +import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; +import nu.marginalia.api.searchquery.model.results.SearchResultItem; +import nu.marginalia.index.query.limit.QueryLimits; +import nu.marginalia.index.query.limit.QueryStrategy; +import nu.marginalia.index.query.limit.SpecificationLimit; +import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.HtmlFeature; +import nu.marginalia.model.gson.GsonFactory; +import nu.marginalia.screenshot.ScreenshotService; +import nu.marginalia.search.SearchModule; +import nu.marginalia.search.SearchService; +import nu.marginalia.service.ServiceId; +import nu.marginalia.service.discovery.ServiceRegistryIf; +import 
nu.marginalia.service.discovery.property.ServiceEndpoint; +import nu.marginalia.service.module.ServiceConfigurationModule; +import nu.marginalia.test.TestMigrationLoader; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Tag; +import org.junit.jupiter.api.Test; +import org.mockito.Mockito; +import org.testcontainers.containers.MariaDBContainer; +import org.testcontainers.junit.jupiter.Container; +import org.testcontainers.junit.jupiter.Testcontainers; +import spark.Spark; + +import java.net.URISyntaxException; +import java.sql.SQLException; +import java.util.*; +import java.util.concurrent.CompletableFuture; + +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyInt; +import static org.mockito.Mockito.when; + + +/** This class is a special test class that sets up a search service + * and registers some search results, without actually starting the rest + * of the environment. This is used to test the search service in isolation + * when working on the frontend. + *

+ * It's not actually a test, but it's in the test directory because it's + * using test related classes. + *

+ * When using gradle, run ./gradlew paperDoll --info to run this test, + * the system will wait for you to kill the process to stop the test, + * and the UI is available at port 9999. + */ +@Testcontainers +@Tag("paperdoll") +public class SearchServicePaperDoll extends AbstractModule { + + @Container + static MariaDBContainer mariaDBContainer = new MariaDBContainer<>("mariadb") + .withDatabaseName("WMSA_prod") + .withUsername("wmsa") + .withPassword("wmsa") + .withNetworkAliases("mariadb"); + + private static HikariDataSource dataSource; + + private static List results = new ArrayList<>(); + private static List dummyLinks = new ArrayList<>(); + private static QueryResponse searchResponse; + private static final Gson gson = GsonFactory.get(); + + /** Appends one synthetic DecoratedSearchResultItem, built from the given url, title, description, features, quality, score and positions mask, to the static results list; any failure while building it is rethrown unchecked with the original exception attached as the cause. */ void registerSearchResult( + String url, + String title, + String description, + Collection features, + double quality, + double score, + long positions) + { + try { + results.add(new DecoratedSearchResultItem( + new SearchResultItem(url.hashCode(), 2, 3, score, 0), + new EdgeUrl(url), + title, + description, + quality, + "HTML5", + HtmlFeature.encode(features), + null, + url.hashCode(), + 400, + positions, + score, + 4, + null) + ); + } + catch (Exception e) { + throw new RuntimeException(e); /* keep the cause so a malformed URL or encoding failure stays diagnosable */ + } + } + + @BeforeAll + public static void setup() throws URISyntaxException { + if (!Boolean.getBoolean("runPaperDoll")) { + return; + } + + HikariConfig config = new HikariConfig(); + config.setJdbcUrl(mariaDBContainer.getJdbcUrl()); + config.setUsername("wmsa"); + config.setPassword("wmsa"); + + dataSource = new HikariDataSource(config); + + TestMigrationLoader.flywayMigration(dataSource); + + System.setProperty("service-name", "search"); + System.setProperty("search.websiteUrl", "http://localhost:9999/"); + + try (var conn = dataSource.getConnection(); + var newsStmt = conn.prepareStatement(""" + INSERT INTO SEARCH_NEWS_FEED(TITLE, LINK, SOURCE, LIST_DATE) + VALUES (?, ?, ?, ?) 
+ """); + var domainStmt = conn.prepareStatement(""" + INSERT INTO EC_DOMAIN(ID, DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY) + VALUES (?, ?, ?, ?) + """); + var randomStmt = conn.prepareStatement(""" + INSERT INTO EC_RANDOM_DOMAINS(DOMAIN_ID, DOMAIN_SET) + VALUES (?, ?) + """) + ) { + newsStmt.setString(1, "Lex Luthor elected president"); + newsStmt.setString(2, "https://www.example.com/foo"); + newsStmt.setString(3, "Daily Planet"); + newsStmt.setDate(4, new java.sql.Date(System.currentTimeMillis())); + newsStmt.execute(); + + newsStmt.setString(1, "Besieged Alesian onlookers confused as Caesar builds a wall around his wall around the city walls"); + newsStmt.setString(2, "https://www.example2.com/bar"); + newsStmt.setString(3, "The Gaulish Observer"); + newsStmt.setDate(4, new java.sql.Date(System.currentTimeMillis())); + newsStmt.execute(); + + newsStmt.setString(1, "Marginalia acquires Google"); + newsStmt.setString(2, "https://www.example3.com/baz"); + newsStmt.setString(3, "The Dependent"); + newsStmt.setDate(4, new java.sql.Date(System.currentTimeMillis())); + newsStmt.execute(); + + domainStmt.setInt(1, 1); + domainStmt.setString(2, "www.example.com"); + domainStmt.setString(3, "example.com"); + domainStmt.setInt(4, 1); + domainStmt.execute(); + + domainStmt.setInt(1, 2); + domainStmt.setString(2, "www.example2.com"); + domainStmt.setString(3, "example2.com"); + domainStmt.setInt(4, 2); + domainStmt.execute(); + + domainStmt.setInt(1, 3); + domainStmt.setString(2, "www.example3.com"); + domainStmt.setString(3, "example3.com"); + domainStmt.setInt(4, 3); + domainStmt.execute(); + + randomStmt.setInt(1, 1); + randomStmt.setInt(2, 0); + randomStmt.execute(); + + randomStmt.setInt(1, 2); + randomStmt.setInt(2, 0); + randomStmt.execute(); + + randomStmt.setInt(1, 3); + randomStmt.setInt(2, 0); + randomStmt.execute(); + } catch (SQLException e) { + e.printStackTrace(); + } + + searchResponse = new QueryResponse( + new SearchSpecification(new SearchQuery(), List.of(), 
"", "test", + SpecificationLimit.none(), + SpecificationLimit.none(), + SpecificationLimit.none(), + SpecificationLimit.none(), + new QueryLimits(10, 20, 3, 4), + QueryStrategy.AUTO, + ResultRankingParameters.sensibleDefaults() + ), + results, + List.of(), + List.of(), + 1, + 1, + null + ); + } + + @Test + public void run() throws Exception { + if (!Boolean.getBoolean("runPaperDoll")) { + return; + } + + var injector = Guice.createInjector( + new ServiceConfigurationModule(ServiceId.Search), + new SearchModule(), + this); + + injector.getInstance(SearchService.class); + + List suggestions = List.of("foo", "bar", "baz"); + + Spark.get("/suggest/", (rq, rsp) -> { + rsp.type("application/json"); + return gson.toJson(suggestions); + }); + + Spark.get("/screenshot/*", (rq, rsp) -> { + rsp.type("image/svg+xml"); + return """ + + + + + Placeholder + Lorem Ipsum As F + + + """; + }); + + registerSearchResult("https://www.example.com/foo", "Foo", "Lorem ipsum dolor sit amet", Set.of(), 0.5, 0.5, ~0L); + registerSearchResult("https://www.example2.com/bar", "Bar", "Some text goes here", Set.of(), 0.5, 0.5, 1L); + registerSearchResult("https://www.example3.com/baz", "All HTML Features", "This one's got every feature", EnumSet.allOf(HtmlFeature.class), 0.5, 0.5, 1L); + + + + + dummyLinks.add(new SimilarDomain( + new EdgeUrl("https://www.example.com/foo"), + 1, + 0.5, + 0.5, + true, + true, + true, + SimilarDomain.LinkType.FOWARD + )); + dummyLinks.add(new SimilarDomain( + new EdgeUrl("https://www.example2.com/foo"), + 2, + 0.5, + 1, + false, + false, + true, + SimilarDomain.LinkType.BACKWARD + )); + dummyLinks.add(new SimilarDomain( + new EdgeUrl("https://www.example3.com/foo"), + 3, + 0, + 0.5, + false, + false, + false, + SimilarDomain.LinkType.BIDIRECTIONAL + )); + + + for (;;); + } + + public void configure() { + try { + var serviceRegistry = Mockito.mock(ServiceRegistryIf.class); + when(serviceRegistry.registerService(any(), any(), any())).thenReturn(new 
ServiceEndpoint("localhost", 9999)); + + bind(ServiceRegistryIf.class).toInstance(serviceRegistry); + bind(HikariDataSource.class).toInstance(dataSource); + + var qsMock = Mockito.mock(QueryClient.class); + when(qsMock.search(any())).thenReturn(searchResponse); /* every query yields the same canned response */ + bind(QueryClient.class).toInstance(qsMock); + + var asMock = Mockito.mock(DomainInfoClient.class); + + when(asMock.isAccepting()).thenReturn(true); + when(asMock.linkedDomains(anyInt(), anyInt())).thenReturn(CompletableFuture.completedFuture(dummyLinks)); + when(asMock.similarDomains(anyInt(), anyInt())).thenReturn(CompletableFuture.completedFuture(dummyLinks)); /* linked and similar lookups share the same dummyLinks list */ + when(asMock.domainInformation(anyInt())).thenReturn(CompletableFuture.completedFuture( + new DomainInformation(new EdgeDomain("www.example.com"), + false, + 123, + 123, + 123, + 123, + 123, + 1, + 0.5, + false, + false, + false, + "127.0.0.1", + 1, + "ACME", + "CA", + "CA", + "Exemplary") + )); + + bind(DomainInfoClient.class).toInstance(asMock); + + var sss = Mockito.mock(ScreenshotService.class); + when(sss.hasScreenshot(anyInt())).thenReturn(true); /* report a screenshot for every id */ + bind(ScreenshotService.class).toInstance(sss); + } + catch (Exception e) { + throw new RuntimeException(e); + } + } + +} diff --git a/code/services-application/search-service-legacy/test/nu/marginalia/util/TestLanguageModels.java b/code/services-application/search-service-legacy/test/nu/marginalia/util/TestLanguageModels.java new file mode 100644 index 00000000..a33d32ee --- /dev/null +++ b/code/services-application/search-service-legacy/test/nu/marginalia/util/TestLanguageModels.java @@ -0,0 +1,37 @@ +package nu.marginalia.util; + +import nu.marginalia.LanguageModels; +import nu.marginalia.WmsaHome; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Optional; +/** Test helper that locates the language model directory and builds a LanguageModels from the files inside it. */ +public class TestLanguageModels { + private static final Path LANGUAGE_MODELS_DEFAULT = WmsaHome.getHomePath().resolve("model"); + /** Returns the directory named by the LANGUAGE_MODELS_HOME environment variable, or the "model" directory under WmsaHome.getHomePath() when the variable is unset; throws IllegalStateException if the chosen path is not a directory. */ + public static Path getLanguageModelsPath() { + final Path languageModelsHome = 
Optional.ofNullable(System.getenv("LANGUAGE_MODELS_HOME")) + .map(Path::of) + .orElse(LANGUAGE_MODELS_DEFAULT); + + if (!Files.isDirectory(languageModelsHome)) { + throw new IllegalStateException("Could not find $LANGUAGE_MODELS_HOME, see doc/language-models.md"); /* NOTE(review): the message names the environment variable even when the default path was the one checked, and does not include the path */ + } + return languageModelsHome; + } + /** Builds a LanguageModels from six fixed file names resolved against getLanguageModelsPath(); the files themselves are not checked for existence here. */ + public static LanguageModels getLanguageModels() { + + var languageModelsHome = getLanguageModelsPath(); + + return new LanguageModels( + languageModelsHome.resolve("tfreq-new-algo3.bin"), + languageModelsHome.resolve("opennlp-sentence.bin"), + languageModelsHome.resolve("English.RDR"), + languageModelsHome.resolve("English.DICT"), + languageModelsHome.resolve("lid.176.ftz"), + languageModelsHome.resolve("segments.bin") + ); + } +} diff --git a/code/services-application/search-service/java/nu/marginalia/search/SearchService.java b/code/services-application/search-service/java/nu/marginalia/search/SearchService.java index f5639a74..fe239a46 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/SearchService.java +++ b/code/services-application/search-service/java/nu/marginalia/search/SearchService.java @@ -17,8 +17,6 @@ import java.util.List; public class SearchService extends JoobyService { - private final WebsiteUrl websiteUrl; - private final StaticResources staticResources; private final SearchSiteSubscriptionService siteSubscriptionService; private static final Logger logger = LoggerFactory.getLogger(SearchService.class); @@ -48,7 +46,7 @@ public class SearchService extends JoobyService { throws Exception { super(params, ServicePartition.any(), - List.of(), + List.of(), // No GRPC services List.of(new SearchFrontPageService_(frontPageService), new SearchQueryService_(searchQueryService), new SearchSiteInfoService_(siteInfoService), @@ -57,9 +55,6 @@ public class SearchService extends JoobyService { new SearchBrowseService_(searchBrowseService) )); - this.websiteUrl = websiteUrl; - this.staticResources = staticResources; - 
this.siteSubscriptionService = siteSubscriptionService; } @@ -69,82 +64,6 @@ public class SearchService extends JoobyService { jooby.get("/export-opml", siteSubscriptionService::exportOpml); } -// -// SearchServiceMetrics.get("/search", searchQueryService::pathSearch); -// SearchServiceMetrics.get("/", frontPageService::render); -// SearchServiceMetrics.get("/news.xml", frontPageService::renderNewsFeed); -// -// SearchServiceMetrics.post("/site/suggest/", addToCrawlQueueService::suggestCrawling); -// -// SearchServiceMetrics.get("/site-search/:site/*", this::siteSearchRedir); -// -// SearchServiceMetrics.get("/site", siteInfoService::handleOverview); -// SearchServiceMetrics.get("/site/:site", siteInfoService::handle); -// SearchServiceMetrics.post("/site/:site", siteInfoService::handlePost); -// -// SearchServiceMetrics.get("/explore", searchBrowseService::handleBrowseRandom); -// SearchServiceMetrics.get("/explore/:site", searchBrowseService::handleBrowseSite); -// -// SearchServiceMetrics.get("/crosstalk/", crosstalkService::handle); -// -// SearchServiceMetrics.get("/:resource", this::serveStatic); -// Spark.exception(Exception.class, (e,p,q) -> { -// logger.error("Error during processing", e); -// wmsa_search_service_error_count.labels(p.pathInfo(), p.requestMethod()).inc(); -// errorPageService.serveError(p, q); -// }); -// -// // Add compression -// Spark.after((rq, rs) -> { -// rs.header("Content-Encoding", "gzip"); -// }); -// -// Spark.awaitInitialization(); -// -// -// /** Wraps a route with a timer and a counter */ -// private static class SearchServiceMetrics implements Route { -// private final Route delegatedRoute; -// -// static void get(String path, Route route) { -// Spark.get(path, new SearchServiceMetrics(route)); -// } -// static void post(String path, Route route) { -// Spark.post(path, new SearchServiceMetrics(route)); -// } -// -// private SearchServiceMetrics(Route delegatedRoute) { -// this.delegatedRoute = delegatedRoute; -// } -// -// 
@Override -// public Object handle(Request request, Response response) throws Exception { -// return wmsa_search_service_request_time -// .labels(request.matchedPath(), request.requestMethod()) -// .time(() -> delegatedRoute.handle(request, response)); -// } -// } -// -// private Object serveStatic(Request request, Response response) { -// String resource = request.params("resource"); -// staticResources.serveStatic("search", resource, request, response); -// return ""; -// } -// -// private Object siteSearchRedir(Request request, Response response) { -// final String site = request.params("site"); -// final String searchTerms; -// -// if (request.splat().length == 0) searchTerms = ""; -// else searchTerms = request.splat()[0]; -// -// final String query = URLEncoder.encode(String.format("%s site:%s", searchTerms, site), StandardCharsets.UTF_8).trim(); -// final String profile = request.queryParamOrDefault("profile", "yolo"); -// -// response.redirect(websiteUrl.withPath("search?query="+query+"&profile="+profile)); -// -// return ""; -// } } diff --git a/settings.gradle b/settings.gradle index 04f94630..3abe3512 100644 --- a/settings.gradle +++ b/settings.gradle @@ -8,6 +8,7 @@ include 'code:services-core:executor-service' include 'code:services-core:single-service-runner' include 'code:services-application:search-service' +include 'code:services-application:search-service-legacy' include 'code:services-application:api-service' include 'code:services-application:dating-service' include 'code:services-application:explorer-service'