(search) Add a copy of the old UI as a separate service, search-service-legacy

This commit is contained in:
Viktor Lofgren 2025-01-02 18:02:17 +01:00
parent 8b05c788fd
commit 1b27c5cf06
86 changed files with 6103 additions and 82 deletions

View File

@ -0,0 +1,94 @@
// Build script for the legacy copy of the search UI, packaged as the
// 'search-service-legacy' application.
plugins {
id 'java'
// Sass plugins; their behaviour is configured in the sass { } block below
id 'io.freefair.sass-base' version '8.4'
id 'io.freefair.sass-java' version '8.4'
id 'application'
id 'jvm-test-suite'
id 'com.google.cloud.tools.jib' version '3.4.3'
}
// Entry point and name of the generated application
application {
mainClass = 'nu.marginalia.search.SearchMain'
applicationName = 'search-service-legacy'
}
// The zip distribution is not built for this service
tasks.distZip.enabled = false
// Use the JVM version that is configured on the root project
java {
toolchain {
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
}
}
// Sass output is left expanded, with source maps enabled and embedded
sass {
sourceMapEnabled = true
sourceMapEmbed = true
outputStyle = EXPANDED
}
// Shared source set and docker configuration from the root project
apply from: "$rootProject.projectDir/srcsets.gradle"
apply from: "$rootProject.projectDir/docker.gradle"
dependencies {
// Project modules
implementation project(':code:common:db')
implementation project(':code:common:model')
implementation project(':code:common:service')
implementation project(':code:common:config')
implementation project(':code:index:query')
implementation project(':code:libraries:easy-lsh')
implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:braille-block-punch-cards')
implementation project(':code:libraries:term-frequency-dict')
implementation project(':code:functions:live-capture:api')
implementation project(':code:functions:math:api')
implementation project(':code:functions:domain-info:api')
implementation project(':code:functions:search-query:api')
implementation project(':code:index:api')
implementation project(':code:common:renderer')
implementation project(':code:features-search:screenshots')
implementation project(':code:features-search:random-websites')
// Third-party libraries from the version catalog
implementation libs.bundles.slf4j
implementation libs.roaringbitmap
implementation libs.prometheus
implementation libs.notnull
implementation libs.guava
// Guice is added without its transitive Guava; Guava is declared directly above
implementation dependencies.create(libs.guice.get()) {
exclude group: 'com.google.guava'
}
implementation libs.handlebars
// Spark is added without its transitive Jetty; Jetty comes from the bundle below
implementation dependencies.create(libs.spark.get()) {
exclude group: 'org.eclipse.jetty'
}
implementation libs.bundles.jetty
implementation libs.opencsv
implementation libs.trove
implementation libs.fastutil
implementation libs.bundles.gson
implementation libs.bundles.mariadb
implementation libs.bundles.nlp
// Test-only dependencies
testImplementation libs.bundles.slf4j.test
testImplementation libs.bundles.junit
testImplementation libs.mockito
// The Testcontainers version (1.17.4) is repeated in the BOM and in both artifacts below
testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
testImplementation libs.commons.codec
testImplementation 'org.testcontainers:mariadb:1.17.4'
testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
testImplementation project(':code:libraries:test-helpers')
}
// Separate test task that runs only the tests tagged "paperdoll", with the
// runPaperDoll system property set and JVM preview features enabled
tasks.register('paperDoll', Test) {
useJUnitPlatform {
includeTags "paperdoll"
}
jvmArgs = [ '-DrunPaperDoll=true', '--enable-preview' ]
}

View File

@ -0,0 +1,12 @@
package nu.marginalia.search;
import com.github.jknack.handlebars.Handlebars;
import nu.marginalia.renderer.config.HandlebarsConfigurator;
/**
 * {@link HandlebarsConfigurator} implementation for the search service.
 * It currently applies no additional configuration to the Handlebars instance.
 */
public class SearchHandlebarsConfigurator implements HandlebarsConfigurator {
/**
 * Intentionally empty: no helpers or other settings are registered.
 *
 * @param handlebars the Handlebars instance offered for configuration; left untouched
 */
@Override
public void configure(Handlebars handlebars) {
}
}

View File

@ -0,0 +1,47 @@
package nu.marginalia.search;
import com.google.inject.Guice;
import com.google.inject.Inject;
import com.google.inject.Injector;
import nu.marginalia.service.MainClass;
import nu.marginalia.service.discovery.ServiceRegistryIf;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.service.module.ServiceDiscoveryModule;
import nu.marginalia.service.ServiceId;
import nu.marginalia.service.module.ServiceConfigurationModule;
import nu.marginalia.service.module.DatabaseModule;
import nu.marginalia.service.server.Initialization;
import spark.Spark;
/**
 * Entry point of the search service. Builds the Guice injector, orchestrates
 * the boot order, instantiates the service and finally flags it as ready.
 */
public class SearchMain extends MainClass {
// Never read within this class.
// NOTE(review): presumably injected only so that creating SearchMain also creates SearchService — confirm
private final SearchService service;
@Inject
public SearchMain(SearchService service) {
this.service = service;
}
/**
 * Starts the service.
 *
 * @param args command line arguments, forwarded to {@code init}
 */
public static void main(String... args) {
init(ServiceId.Search, args);
// Static files are served from this classpath location; set before the injector (and thereby the service) is created
Spark.staticFileLocation("/static/search/");
// NOTE(review): the meaning of the boolean passed to DatabaseModule is not visible here — confirm
Injector injector = Guice.createInjector(
new SearchModule(),
new ServiceConfigurationModule(ServiceId.Search),
new ServiceDiscoveryModule(),
new DatabaseModule(false)
);
// Orchestrate the boot order for the services
var registry = injector.getInstance(ServiceRegistryIf.class);
var configuration = injector.getInstance(ServiceConfiguration.class);
orchestrateBoot(registry, configuration);
// Requesting SearchMain makes Guice construct its SearchService dependency; the returned instance is not used
injector.getInstance(SearchMain.class);
// Signal readiness only after the service has been constructed
injector.getInstance(Initialization.class).setReady();
}
}

View File

@ -0,0 +1,20 @@
package nu.marginalia.search;
import com.google.inject.AbstractModule;
import nu.marginalia.LanguageModels;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.WmsaHome;
import nu.marginalia.renderer.config.HandlebarsConfigurator;
public class SearchModule extends AbstractModule {
public void configure() {
bind(HandlebarsConfigurator.class).to(SearchHandlebarsConfigurator.class);
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
bind(WebsiteUrl.class).toInstance(new WebsiteUrl(
System.getProperty("search.websiteUrl", "https://search.marginalia.nu/")));
}
}

View File

@ -0,0 +1,266 @@
package nu.marginalia.search;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.api.math.MathClient;
import nu.marginalia.api.searchquery.QueryClient;
import nu.marginalia.api.searchquery.model.query.QueryResponse;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.bbpc.BrailleBlockPunchCards;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.search.command.SearchParameters;
import nu.marginalia.search.model.ClusteredUrlDetails;
import nu.marginalia.search.model.DecoratedSearchResults;
import nu.marginalia.search.model.SearchFilters;
import nu.marginalia.search.model.UrlDetails;
import nu.marginalia.search.results.UrlDeduplicator;
import nu.marginalia.search.svc.SearchQueryCountService;
import nu.marginalia.search.svc.SearchUnitConversionService;
import org.apache.logging.log4j.util.Strings;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Marker;
import org.slf4j.MarkerFactory;
import javax.annotation.Nullable;
import java.time.Duration;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
/**
 * Runs queries through the {@link QueryClient} and converts the responses into
 * the result models that are rendered by the search UI.
 */
@Singleton
public class SearchOperator {

    private static final Logger logger = LoggerFactory.getLogger(SearchOperator.class);

    // Marker for filtering out sensitive content from the persistent logs
    private final Marker queryMarker = MarkerFactory.getMarker("QUERY");

    private final MathClient mathClient;
    private final DbDomainQueries domainQueries;
    private final QueryClient queryClient;
    private final SearchQueryParamFactory paramFactory;
    private final WebsiteUrl websiteUrl;
    private final SearchUnitConversionService searchUnitConversionService;
    private final SearchQueryCountService searchVisitorCount;

    @Inject
    public SearchOperator(MathClient mathClient,
                          DbDomainQueries domainQueries,
                          QueryClient queryClient,
                          SearchQueryParamFactory paramFactory,
                          WebsiteUrl websiteUrl,
                          SearchUnitConversionService searchUnitConversionService,
                          SearchQueryCountService searchVisitorCount
                          )
    {
        this.mathClient = mathClient;
        this.domainQueries = domainQueries;
        this.queryClient = queryClient;
        this.paramFactory = paramFactory;
        this.websiteUrl = websiteUrl;
        this.searchUnitConversionService = searchUnitConversionService;
        this.searchVisitorCount = searchVisitorCount;
    }

    /** Searches within a single domain and returns the deduplicated results. */
    public List<UrlDetails> doSiteSearch(String domain,
                                         int domainId,
                                         int count) {
        var queryParams = paramFactory.forSiteSearch(domain, domainId, count);
        var queryResponse = queryClient.search(queryParams);

        return getResultsFromQuery(queryResponse);
    }

    /** Searches for documents linking to the given domain. */
    public List<UrlDetails> doBacklinkSearch(String domain) {
        var queryParams = paramFactory.forBacklinkSearch(domain);
        var queryResponse = queryClient.search(queryParams);

        return getResultsFromQuery(queryResponse);
    }

    /** Searches for documents on {@code source} that link to {@code dest}. */
    public List<UrlDetails> doLinkSearch(String source, String dest) {
        var queryParams = paramFactory.forLinkSearch(source, dest);
        var queryResponse = queryClient.search(queryParams);

        return getResultsFromQuery(queryResponse);
    }

    /**
     * Performs the full user-facing search: runs the query, clusters the results,
     * collects problem hints and builds the pagination links.
     *
     * @param userParams the parameters of the user's search request
     * @return the decorated results, ready for rendering
     */
    public DecoratedSearchResults doSearch(SearchParameters userParams) throws InterruptedException {
        // The full user-facing search query does additional work to try to evaluate the query
        // e.g. as a unit conversion query. This is done in parallel with the regular search.
        Future<String> eval = searchUnitConversionService.tryEval(userParams.query());

        // Perform the regular search
        var queryParams = paramFactory.forRegularSearch(userParams);
        QueryResponse queryResponse = queryClient.search(queryParams);
        var queryResults = getResultsFromQuery(queryResponse);

        // Cluster the results based on the query response
        List<ClusteredUrlDetails> clusteredResults = SearchResultClusterer
                .selectStrategy(queryResponse)
                .clusterResults(queryResults, 25);

        // Log the query and results
        logger.info(queryMarker, "Human terms: {}", Strings.join(queryResponse.searchTermsHuman(), ','));
        logger.info(queryMarker, "Search Result Count: {}", queryResults.size());

        // Get the evaluation result and other data to return to the user;
        // a missing, late or failed evaluation is represented by the empty string
        String evalResult = getFutureOrDefault(eval, "");

        String focusDomain = queryResponse.domain();
        int focusDomainId = focusDomain == null
                ? -1
                : domainQueries.tryGetDomainId(new EdgeDomain(focusDomain)).orElse(-1);

        List<String> problems = getProblems(evalResult, queryResults, queryResponse);

        List<DecoratedSearchResults.Page> resultPages = IntStream.rangeClosed(1, queryResponse.totalPages())
                .mapToObj(number -> new DecoratedSearchResults.Page(
                        number,
                        number == userParams.page(),
                        userParams.withPage(number).renderUrl(websiteUrl)
                ))
                .toList();

        // Return the results to the user
        return DecoratedSearchResults.builder()
                .params(userParams)
                .problems(problems)
                .evalResult(evalResult)
                .results(clusteredResults)
                .filters(new SearchFilters(websiteUrl, userParams))
                .focusDomain(focusDomain)
                .focusDomainId(focusDomainId)
                .resultPages(resultPages)
                .build();
    }

    /**
     * Deduplicates the results of a query response, caps them at the total
     * result limit of the query and converts them to {@link UrlDetails}.
     * <p>
     * Each call also registers one query with the query counter.
     */
    public List<UrlDetails> getResultsFromQuery(QueryResponse queryResponse) {
        final QueryLimits limits = queryResponse.specs().queryLimits;
        final UrlDeduplicator deduplicator = new UrlDeduplicator(limits.resultsByDomain());

        // Update the query count (this is what you see on the front page)
        searchVisitorCount.registerQuery();

        return queryResponse.results().stream()
                .filter(deduplicator::shouldRetain)
                .limit(limits.resultsTotal())
                .map(SearchOperator::createDetails)
                .toList();
    }

    /** Converts a single search result item into the model used by the UI. */
    private static UrlDetails createDetails(DecoratedSearchResultItem item) {
        return new UrlDetails(
                item.documentId(),
                item.domainId(),
                cleanUrl(item.url),
                item.title,
                item.description,
                item.format,
                item.features,
                DomainIndexingState.ACTIVE,
                item.rankingScore, // termScore
                item.resultsFromDomain,
                BrailleBlockPunchCards.printBits(item.bestPositions, 64),
                Long.bitCount(item.bestPositions),
                item.rawIndexResult,
                item.rawIndexResult.keywordScores
        );
    }

    /** Replace nuisance domains with replacements where available */
    private static EdgeUrl cleanUrl(EdgeUrl url) {
        String topdomain = url.domain.topDomain;
        String subdomain = url.domain.subDomain;
        String path = url.path;

        if (topdomain.equals("fandom.com")) {
            int wikiIndex = path.indexOf("/wiki/");
            if (wikiIndex >= 0) {
                return new EdgeUrl("https", new EdgeDomain("breezewiki.com"), null, "/" + subdomain + path.substring(wikiIndex), null);
            }
        }
        else if (topdomain.equals("medium.com")) {
            if (!subdomain.isBlank()) {
                return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, path, null);
            }
            else {
                // Drop the first path segment and keep the rest. A path with only one
                // segment has nothing left to keep, so such URLs are left untouched
                // instead of failing on substring(-1).
                int articleIndex = path.indexOf("/", 1);
                if (articleIndex >= 0) {
                    String article = path.substring(articleIndex);
                    return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, article, null);
                }
            }
        }
        return url;
    }

    /**
     * Collects hints about the query that are shown alongside the results.
     *
     * @param evalResult   result of the query evaluation; null or blank when there is none
     * @param queryResults the results of the query
     * @param response     the raw query response
     * @return the hints; empty for site searches
     */
    private List<String> getProblems(String evalResult, List<UrlDetails> queryResults, QueryResponse response) throws InterruptedException {

        // We don't debug the query if it's a site search, that is, when the
        // response is focused on a domain
        String focusDomain = response.domain();
        if (focusDomain != null && !focusDomain.isBlank())
            return List.of();

        final List<String> problems = new ArrayList<>(response.problems());

        // doSearch() passes the empty string when no evaluation is available,
        // so a blank value has to count as "no eval result" as well
        boolean hasEvalResult = evalResult != null && !evalResult.isBlank();

        if (queryResults.size() <= 5 && !hasEvalResult) {
            problems.add("Try rephrasing the query, changing the word order or using synonyms to get different results.");

            // Try to spell check the search terms
            var suggestions = getFutureOrDefault(
                    mathClient.spellCheck(response.searchTermsHuman()),
                    Map.of()
            );

            suggestions.forEach((term, suggestion) -> {
                if (suggestion.size() > 1) {
                    String suggestionsStr = "\"%s\" could be spelled %s".formatted(term, suggestion.stream().map(s -> "\"" + s + "\"").collect(Collectors.joining(", ")));
                    problems.add(suggestionsStr);
                }
            });
        }

        Set<String> representativeKeywords = response.getAllKeywords();
        if (representativeKeywords.size() > 1 && (representativeKeywords.contains("definition") || representativeKeywords.contains("define") || representativeKeywords.contains("meaning")))
        {
            problems.add("Tip: Try using a query that looks like <tt>define:word</tt> if you want a dictionary definition");
        }

        return problems;
    }

    /** Waits up to 50 ms for the future, see {@link #getFutureOrDefault(Future, Duration, Object)}. */
    private <T> T getFutureOrDefault(@Nullable Future<T> fut, T defaultValue) {
        return getFutureOrDefault(fut, Duration.ofMillis(50), defaultValue);
    }

    /**
     * Returns the value of the future, or {@code defaultValue} if the future is
     * null, cancelled, fails, or does not complete within {@code timeout}.
     */
    private <T> T getFutureOrDefault(@Nullable Future<T> fut, Duration timeout, T defaultValue) {
        if (fut == null || fut.isCancelled()) {
            return defaultValue;
        }
        try {
            return fut.get(timeout.toMillis(), TimeUnit.MILLISECONDS);
        }
        catch (InterruptedException ex) {
            // Restore the interrupt flag so that callers further up can still observe it
            Thread.currentThread().interrupt();
            logger.warn("Error fetching eval result", ex);
            return defaultValue;
        }
        catch (Exception ex) {
            logger.warn("Error fetching eval result", ex);
            return defaultValue;
        }
    }
}

View File

@ -0,0 +1,104 @@
package nu.marginalia.search;
import nu.marginalia.api.searchquery.model.query.QueryParams;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.search.command.SearchParameters;
import java.util.List;
/** Creates the {@link QueryParams} for the different kinds of searches issued by the search service. */
public class SearchQueryParamFactory {

    /**
     * Parameters for a regular, user-facing search. The search profile and the
     * js/adtech filters contribute implicit terms, and the profile also supplies
     * the quality, year and size limits as well as the search set.
     */
    public QueryParams forRegularSearch(SearchParameters userParams) {
        var profile = userParams.profile();

        // Gather the implicit terms of the profile and the filters in one prototype query
        SearchQuery tacitTerms = new SearchQuery();
        profile.addTacitTerms(tacitTerms);
        userParams.js().addTacitTerms(tacitTerms);
        userParams.adtech().addTacitTerms(tacitTerms);

        QueryLimits limits = new QueryLimits(5, 100, 200, 8192);

        return new QueryParams(
                userParams.query(),
                null,
                tacitTerms.searchTermsInclude,
                tacitTerms.searchTermsExclude,
                tacitTerms.searchTermsPriority,
                tacitTerms.searchTermsAdvice,
                profile.getQualityLimit(),
                profile.getYearLimit(),
                profile.getSizeLimit(),
                SpecificationLimit.none(),
                List.of(),
                limits,
                profile.searchSetIdentifier.name(),
                userParams.strategy(),
                userParams.temporalBias(),
                userParams.page()
        );
    }

    /** Parameters for a {@code site:} search restricted to the given domain id. */
    public QueryParams forSiteSearch(String domain, int domainId, int count) {
        return unfilteredQuery(
                "site:"+domain,
                List.of(domainId),
                new QueryLimits(count, count, 100, 512));
    }

    /** Parameters for a {@code links:} search for the given domain. */
    public QueryParams forBacklinkSearch(String domain) {
        return unfilteredQuery(
                "links:"+domain,
                List.of(),
                new QueryLimits(100, 100, 100, 512));
    }

    /** Parameters for a combined {@code site:} and {@code links:} search. */
    public QueryParams forLinkSearch(String sourceDomain, String destDomain) {
        return unfilteredQuery(
                "site:" + sourceDomain + " links:" + destDomain,
                List.of(),
                new QueryLimits(100, 100, 100, 512));
    }

    /**
     * Shared shape of the site/link queries: no implicit terms, no specification
     * limits, no search set, automatic strategy, no temporal bias and the first page.
     */
    private static QueryParams unfilteredQuery(String humanQuery, List<Integer> domainIds, QueryLimits limits) {
        return new QueryParams(
                humanQuery,
                null,
                List.of(),
                List.of(),
                List.of(),
                List.of(),
                SpecificationLimit.none(),
                SpecificationLimit.none(),
                SpecificationLimit.none(),
                SpecificationLimit.none(),
                domainIds,
                limits,
                SearchSetIdentifier.NONE.name(),
                QueryStrategy.AUTO,
                ResultRankingParameters.TemporalBias.NONE,
                1
        );
    }
}

View File

@ -0,0 +1,53 @@
package nu.marginalia.search;
import nu.marginalia.api.searchquery.model.query.QueryResponse;
import nu.marginalia.search.model.ClusteredUrlDetails;
import nu.marginalia.search.model.UrlDetails;
import java.util.List;
import java.util.stream.Collectors;
/** Functions for clustering search results */
/** Strategies for grouping search results into clusters. */
public class SearchResultClusterer {

    // Static helpers only, not meant to be instantiated
    private SearchResultClusterer() {}

    /** Turns a flat list of results into at most a given number of clusters. */
    public interface SearchResultClusterStrategy {
        List<ClusteredUrlDetails> clusterResults(List<UrlDetails> results, int total);
    }

    /**
     * Picks the strategy for a response: responses with a non-blank domain are
     * left unclustered, all others are clustered by domain.
     */
    public static SearchResultClusterStrategy selectStrategy(QueryResponse response) {
        String domain = response.domain();
        boolean hasDomain = domain != null && !domain.isBlank();

        return hasDomain
                ? SearchResultClusterer::noOp
                : SearchResultClusterer::byDomain;
    }

    /** Wraps every result in a cluster of its own; {@code total} is ignored. */
    private static List<ClusteredUrlDetails> noOp(List<UrlDetails> results, int total) {
        if (results.isEmpty()) {
            return List.of();
        }

        return results.stream()
                .map(ClusteredUrlDetails::new)
                .toList();
    }

    /**
     * Groups the results by domain id, orders the clusters by the natural order
     * of {@link ClusteredUrlDetails} and keeps the first {@code total} of them.
     */
    private static List<ClusteredUrlDetails> byDomain(List<UrlDetails> results, int total) {
        if (results.isEmpty()) {
            return List.of();
        }

        var resultsByDomainId = results.stream()
                .collect(Collectors.groupingBy(details -> details.domainId));

        return resultsByDomainId.values().stream()
                .map(ClusteredUrlDetails::new)
                .sorted()
                .limit(total)
                .toList();
    }
}

View File

@ -0,0 +1,128 @@
package nu.marginalia.search;
import com.google.inject.Inject;
import io.prometheus.client.Counter;
import io.prometheus.client.Histogram;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.search.svc.*;
import nu.marginalia.service.server.BaseServiceParams;
import nu.marginalia.service.server.SparkService;
import nu.marginalia.service.server.StaticResources;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
import spark.Route;
import spark.Spark;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
/**
 * The HTTP front of the search service. Registers the Spark routes of the
 * search UI and records request timings and error counts as Prometheus metrics.
 */
public class SearchService extends SparkService {

    private final WebsiteUrl websiteUrl;
    private final StaticResources staticResources;

    private static final Logger logger = LoggerFactory.getLogger(SearchService.class);

    private static final Histogram wmsa_search_service_request_time = Histogram.build()
            .name("wmsa_search_service_request_time")
            .linearBuckets(0.05, 0.05, 15)
            .labelNames("matchedPath", "method")
            .help("Search service request time (seconds)")
            .register();
    private static final Counter wmsa_search_service_error_count = Counter.build()
            .name("wmsa_search_service_error_count")
            .labelNames("matchedPath", "method")
            .help("Search service error count")
            .register();

    @Inject
    public SearchService(BaseServiceParams params,
                         WebsiteUrl websiteUrl,
                         StaticResources staticResources,
                         SearchFrontPageService frontPageService,
                         SearchErrorPageService errorPageService,
                         SearchAddToCrawlQueueService addToCrawlQueueService,
                         SearchSiteInfoService siteInfoService,
                         SearchCrosstalkService crosstalkService,
                         SearchQueryService searchQueryService)
    throws Exception
    {
        super(params);

        this.websiteUrl = websiteUrl;
        this.staticResources = staticResources;

        Spark.staticFiles.expireTime(600);

        SearchServiceMetrics.get("/search", searchQueryService::pathSearch);

        SearchServiceMetrics.get("/", frontPageService::render);
        SearchServiceMetrics.get("/news.xml", frontPageService::renderNewsFeed);
        SearchServiceMetrics.get("/:resource", this::serveStatic);

        SearchServiceMetrics.post("/site/suggest/", addToCrawlQueueService::suggestCrawling);

        SearchServiceMetrics.get("/site-search/:site/*", this::siteSearchRedir);

        SearchServiceMetrics.get("/site/:site", siteInfoService::handle);
        SearchServiceMetrics.post("/site/:site", siteInfoService::handlePost);

        SearchServiceMetrics.get("/crosstalk/", crosstalkService::handle);

        // Log and count every exception, then render the error page.
        // NOTE(review): the counter's "matchedPath" label is filled with pathInfo()
        // here, while the histogram uses matchedPath() — confirm that this is intended
        Spark.exception(Exception.class, (e,p,q) -> {
            logger.error("Error during processing", e);
            wmsa_search_service_error_count.labels(p.pathInfo(), p.requestMethod()).inc();
            errorPageService.serveError(p, q);
        });

        Spark.awaitInitialization();
    }

    /** Wraps a route so that its execution time is recorded in the request time histogram */
    private static class SearchServiceMetrics implements Route {
        private final Route delegatedRoute;

        /** Registers {@code route} for GET requests on {@code path}, wrapped with timing. */
        static void get(String path, Route route) {
            Spark.get(path, new SearchServiceMetrics(route));
        }

        /** Registers {@code route} for POST requests on {@code path}, wrapped with timing. */
        static void post(String path, Route route) {
            Spark.post(path, new SearchServiceMetrics(route));
        }

        private SearchServiceMetrics(Route delegatedRoute) {
            this.delegatedRoute = delegatedRoute;
        }

        @Override
        public Object handle(Request request, Response response) throws Exception {
            return wmsa_search_service_request_time
                    .labels(request.matchedPath(), request.requestMethod())
                    .time(() -> delegatedRoute.handle(request, response));
        }
    }

    /** Serves the static resource named by the {@code :resource} path parameter. */
    private Object serveStatic(Request request, Response response) {
        String resource = request.params("resource");
        staticResources.serveStatic("search", resource, request, response);
        return "";
    }

    /**
     * Redirects {@code /site-search/:site/<terms>} to the equivalent
     * {@code site:} query on the regular search endpoint.
     */
    private Object siteSearchRedir(Request request, Response response) {
        final String site = request.params("site");

        final String[] splat = request.splat();
        final String searchTerms = splat.length == 0 ? "" : splat[0];

        // Trim before encoding: once encoded, surrounding spaces have become '+'
        // and can no longer be trimmed away
        final String rawQuery = String.format("%s site:%s", searchTerms, site).trim();
        final String query = URLEncoder.encode(rawQuery, StandardCharsets.UTF_8);

        // The profile comes straight from the request and must be encoded as well,
        // otherwise it could smuggle additional parameters into the redirect URL
        final String profile = URLEncoder.encode(
                request.queryParamOrDefault("profile", "yolo"),
                StandardCharsets.UTF_8);

        response.redirect(websiteUrl.withPath("search?query="+query+"&profile="+profile));

        return "";
    }
}

View File

@ -0,0 +1,43 @@
package nu.marginalia.search.command;
import com.google.inject.Inject;
import nu.marginalia.search.command.commands.*;
import spark.Response;
import java.util.ArrayList;
import java.util.List;
/**
 * Dispatches a search request to the first special command that handles it,
 * and to the regular search command when none of them does.
 */
public class CommandEvaluator {

    // Tried in this order; the first command that yields a result wins
    private final List<SearchCommandInterface> specialCommands = new ArrayList<>();
    private final SearchCommand defaultCommand;

    @Inject
    public CommandEvaluator(
            BrowseCommand browse,
            ConvertCommand convert,
            DefinitionCommand define,
            BangCommand bang,
            SiteRedirectCommand siteRedirect,
            SearchCommand search
    ) {
        this.defaultCommand = search;

        this.specialCommands.addAll(List.of(browse, convert, define, bang, siteRedirect));
    }

    /**
     * Evaluates the request.
     *
     * @return the output of the first special command that produced one, otherwise the
     *         output of the default command, or the empty string if that produced nothing
     */
    public Object eval(Response response, SearchParameters parameters) {
        for (SearchCommandInterface command : specialCommands) {
            Optional<Object> outcome = command.process(response, parameters);

            if (outcome.isEmpty()) {
                continue;
            }

            return outcome.get();
        }

        Optional<Object> fallback = defaultCommand.process(response, parameters);
        return fallback.orElse("");
    }
}

View File

@ -0,0 +1,29 @@
package nu.marginalia.search.command;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import javax.annotation.Nullable;
import java.util.Arrays;
/** The {@code adtech} request parameter, which can exclude results with ads or affiliate links. */
public enum SearchAdtechParameter {
    DEFAULT("default"),
    REDUCE("reduce", "special:ads", "special:affiliate");

    /** The value of the request parameter that selects this constant */
    public final String value;
    /** Search terms that are implicitly excluded while this constant is active */
    public final String[] implictExcludeSearchTerms;

    SearchAdtechParameter(String value, String... implictExcludeSearchTerms) {
        this.value = value;
        this.implictExcludeSearchTerms = implictExcludeSearchTerms;
    }

    /** Maps a request parameter value to a constant; null and unknown values yield {@link #DEFAULT}. */
    public static SearchAdtechParameter parse(@Nullable String value) {
        return REDUCE.value.equals(value) ? REDUCE : DEFAULT;
    }

    /** Appends the implicitly excluded terms to the exclude list of the given query. */
    public void addTacitTerms(SearchQuery subquery) {
        var excluded = Arrays.asList(implictExcludeSearchTerms);

        subquery.searchTermsExclude.addAll(excluded);
    }
}

View File

@ -0,0 +1,10 @@
package nu.marginalia.search.command;
import spark.Response;
import java.util.Optional;
/** A command that may handle a search request in place of the regular search. */
public interface SearchCommandInterface {
/**
 * Attempts to handle the request.
 *
 * @param response   the HTTP response of the current request
 * @param parameters the parsed search parameters
 * @return the output of the command, or an empty Optional when the command does not handle the request
 */
Optional<Object> process(Response response, SearchParameters parameters);
}

View File

@ -0,0 +1,31 @@
package nu.marginalia.search.command;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import javax.annotation.Nullable;
import java.util.Arrays;
/**
 * The {@code js} request parameter. {@link #DENY_JS} excludes the term
 * {@code js:true} and {@link #REQUIRE_JS} excludes the term {@code js:false}.
 */
public enum SearchJsParameter {
    DEFAULT("default"),
    DENY_JS("no-js", "js:true"),
    REQUIRE_JS("yes-js", "js:false");

    /** The value of the request parameter that selects this constant */
    public final String value;
    /** Search terms that are implicitly excluded while this constant is active */
    public final String[] implictExcludeSearchTerms;

    SearchJsParameter(String value, String... implictExcludeSearchTerms) {
        this.value = value;
        this.implictExcludeSearchTerms = implictExcludeSearchTerms;
    }

    /** Maps a request parameter value to a constant; null and unknown values yield {@link #DEFAULT}. */
    public static SearchJsParameter parse(@Nullable String value) {
        for (SearchJsParameter candidate : values()) {
            if (candidate.value.equals(value)) {
                return candidate;
            }
        }
        return DEFAULT;
    }

    /** Appends the implicitly excluded terms to the exclude list of the given query. */
    public void addTacitTerms(SearchQuery subquery) {
        var excluded = Arrays.asList(implictExcludeSearchTerms);

        subquery.searchTermsExclude.addAll(excluded);
    }
}

View File

@ -0,0 +1,106 @@
package nu.marginalia.search.command;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.search.model.SearchProfile;
import spark.Request;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.Objects;
import static nu.marginalia.search.command.SearchRecentParameter.RECENT;
/**
 * The parameters of a search request, as parsed from the query string.
 *
 * @param query       the search query
 * @param profile     the selected search profile
 * @param js          the javascript filter
 * @param recent      the recency filter
 * @param searchTitle the title filter
 * @param adtech      the adtech filter
 * @param newFilter   true if the parameters were produced by changing a filter
 * @param page        the requested result page
 */
public record SearchParameters(String query,
                               SearchProfile profile,
                               SearchJsParameter js,
                               SearchRecentParameter recent,
                               SearchTitleParameter searchTitle,
                               SearchAdtechParameter adtech,
                               boolean newFilter,
                               int page
                               ) {

    /**
     * Reads the parameters from a request. Missing or unrecognized filter values
     * fall back to their defaults; a missing, malformed or non-positive page
     * number falls back to the first page.
     */
    public SearchParameters(String queryString, Request request) {
        this(
                queryString,
                SearchProfile.getSearchProfile(request.queryParams("profile")),
                SearchJsParameter.parse(request.queryParams("js")),
                SearchRecentParameter.parse(request.queryParams("recent")),
                SearchTitleParameter.parse(request.queryParams("searchTitle")),
                SearchAdtechParameter.parse(request.queryParams("adtech")),
                "true".equals(request.queryParams("newfilter")),
                parsePage(request.queryParams("page"))
        );
    }

    /**
     * Parses the page number leniently. The value comes straight from the request,
     * so bad input must not surface as a NumberFormatException.
     */
    private static int parsePage(String pageParam) {
        if (pageParam == null || pageParam.isBlank()) {
            return 1;
        }
        try {
            return Math.max(1, Integer.parseInt(pageParam.strip()));
        }
        catch (NumberFormatException ex) {
            return 1;
        }
    }

    /** The filter id of the selected profile. */
    public String profileStr() {
        return profile.filterId;
    }

    // The with* methods for filters flag the copy as a new filter selection;
    // withPage() is the only one that clears the flag.

    public SearchParameters withProfile(SearchProfile profile) {
        return new SearchParameters(query, profile, js, recent, searchTitle, adtech, true, page);
    }

    public SearchParameters withJs(SearchJsParameter js) {
        return new SearchParameters(query, profile, js, recent, searchTitle, adtech, true, page);
    }
    public SearchParameters withAdtech(SearchAdtechParameter adtech) {
        return new SearchParameters(query, profile, js, recent, searchTitle, adtech, true, page);
    }

    public SearchParameters withRecent(SearchRecentParameter recent) {
        return new SearchParameters(query, profile, js, recent, searchTitle, adtech, true, page);
    }

    public SearchParameters withTitle(SearchTitleParameter title) {
        return new SearchParameters(query, profile, js, recent, title, adtech, true, page);
    }

    public SearchParameters withPage(int page) {
        return new SearchParameters(query, profile, js, recent, searchTitle, adtech, false, page);
    }

    /** Renders these parameters as a search URL below {@code baseUrl}, with all values URL-encoded. */
    public String renderUrl(WebsiteUrl baseUrl) {
        String path = String.format("/search?query=%s&profile=%s&js=%s&adtech=%s&recent=%s&searchTitle=%s&newfilter=%s&page=%d",
                URLEncoder.encode(query, StandardCharsets.UTF_8),
                URLEncoder.encode(profile.filterId, StandardCharsets.UTF_8),
                URLEncoder.encode(js.value, StandardCharsets.UTF_8),
                URLEncoder.encode(adtech.value, StandardCharsets.UTF_8),
                URLEncoder.encode(recent.value, StandardCharsets.UTF_8),
                URLEncoder.encode(searchTitle.value, StandardCharsets.UTF_8),
                newFilter,
                page
                );

        return baseUrl.withPath(path);
    }

    /** The temporal bias: the recency filter takes precedence over the vintage profile. */
    public ResultRankingParameters.TemporalBias temporalBias() {
        if (recent == RECENT) {
            return ResultRankingParameters.TemporalBias.RECENT;
        }
        else if (profile == SearchProfile.VINTAGE) {
            return ResultRankingParameters.TemporalBias.OLD;
        }

        return ResultRankingParameters.TemporalBias.NONE;
    }

    /** Requires a match in the title when the title filter is active. */
    public QueryStrategy strategy() {
        if (searchTitle == SearchTitleParameter.TITLE) {
            return QueryStrategy.REQUIRE_FIELD_TITLE;
        }

        return QueryStrategy.AUTO;
    }

    /** The year limit: later than 2018 when the recency filter is active, else the limit of the profile. */
    public SpecificationLimit yearLimit() {
        if (recent == RECENT)
            return SpecificationLimit.greaterThan(2018);

        return profile.getYearLimit();
    }
}

View File

@ -0,0 +1,21 @@
package nu.marginalia.search.command;
import javax.annotation.Nullable;
/** The {@code recent} request parameter. */
public enum SearchRecentParameter {
    DEFAULT("default"),
    RECENT("recent");

    /** The value of the request parameter that selects this constant */
    public final String value;

    SearchRecentParameter(String value) {
        this.value = value;
    }

    /** Maps a request parameter value to a constant; null and unknown values yield {@link #DEFAULT}. */
    public static SearchRecentParameter parse(@Nullable String value) {
        return RECENT.value.equals(value) ? RECENT : DEFAULT;
    }
}

View File

@ -0,0 +1,21 @@
package nu.marginalia.search.command;
import javax.annotation.Nullable;
/** The {@code searchTitle} request parameter. */
public enum SearchTitleParameter {
    DEFAULT("default"),
    TITLE("title");

    /** The value of the request parameter that selects this constant */
    public final String value;

    SearchTitleParameter(String value) {
        this.value = value;
    }

    /** Maps a request parameter value to a constant; null and unknown values yield {@link #DEFAULT}. */
    public static SearchTitleParameter parse(@Nullable String value) {
        return TITLE.value.equals(value) ? TITLE : DEFAULT;
    }
}

View File

@ -0,0 +1,104 @@
package nu.marginalia.search.command.commands;
import com.google.inject.Inject;
import nu.marginalia.search.command.SearchCommandInterface;
import nu.marginalia.search.command.SearchParameters;
import nu.marginalia.search.exceptions.RedirectException;
import spark.Response;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
/**
 * Handles "bang" keywords such as {@code !g}: when a query contains one as a
 * separate word, the rest of the query is redirected to another search engine.
 */
public class BangCommand implements SearchCommandInterface {
    // Bang keyword -> URL pattern in which %s is replaced by the URL-encoded remaining query
    private final Map<String, String> bangsToPattern = new HashMap<>();

    @Inject
    public BangCommand()
    {
        bangsToPattern.put("!g", "https://www.google.com/search?q=%s");
        bangsToPattern.put("!ddg", "https://duckduckgo.com/?q=%s");
        bangsToPattern.put("!w", "https://search.marginalia.nu/search?query=%s+site:en.wikipedia.org&profile=wiki");
    }

    /**
     * Looks for a bang keyword in the query.
     *
     * @return always an empty Optional; a match does not return but is signalled by the exception
     * @throws RedirectException carrying the target URL if a bang keyword matched
     */
    @Override
    public Optional<Object> process(Response response, SearchParameters parameters) {
        final String query = parameters.query();

        for (Map.Entry<String, String> bang : bangsToPattern.entrySet()) {
            Optional<String> remainder = matchBangPattern(query, bang.getKey());
            if (remainder.isEmpty()) {
                continue;
            }

            String encoded = URLEncoder.encode(remainder.get(), StandardCharsets.UTF_8);
            throw new RedirectException(String.format(bang.getValue(), encoded));
        }

        return Optional.empty();
    }

    /**
     * If the query contains bangKey as a separate word, returns the query with
     * that occurrence removed. Only the first such occurrence is considered, and
     * the result is empty if nothing but whitespace remains.
     */
    Optional<String> matchBangPattern(String query, String bangKey) {
        final int keyLength = bangKey.length();
        final BangMatcher matcher = new BangMatcher(query);

        while (matcher.findNext(bangKey)) {
            // The keyword must be delimited by a space or the string boundary on both sides
            boolean delimited = matcher.isRelativeSpaceOrInvalid(-1)
                    && matcher.isRelativeSpaceOrInvalid(keyLength);

            if (delimited) {
                String before = matcher.prefix().trim();
                String after = matcher.suffix(keyLength).trim();
                String remainder = (before + " " + after).trim();

                return remainder.isBlank() ? Optional.empty() : Optional.of(remainder);
            }
        }

        return Optional.empty();
    }

    /** Cursor over the occurrences of a pattern within a string. */
    private static class BangMatcher {
        private final String str;
        // Index of the current occurrence, or -1 if there is none
        private int pos = -1;

        public BangMatcher(String str) {
            this.str = str;
        }

        /** The part of the string before the current occurrence. */
        public String prefix() {
            return str.substring(0, pos);
        }

        /** The part of the string from offset characters past the current position, or "" beyond the end. */
        public String suffix(int offset) {
            int start = pos + offset;
            return start < str.length() ? str.substring(start) : "";
        }

        /** Moves to the next occurrence of pattern; returns false if there is none. */
        public boolean findNext(String pattern) {
            int from = pos + 1;
            if (from >= str.length()) {
                return false;
            }

            pos = str.indexOf(pattern, from);
            return pos >= 0;
        }

        /** True if the character at offset from the current position is a space, or lies outside the string. */
        public boolean isRelativeSpaceOrInvalid(int offset) {
            int index = pos + offset;
            boolean outOfBounds = index < 0 || index >= str.length();

            return outOfBounds || Character.isSpaceChar(str.charAt(index));
        }
    }
}

View File

@ -0,0 +1,78 @@
package nu.marginalia.search.command.commands;
import com.google.inject.Inject;
import nu.marginalia.browse.model.BrowseResultSet;
import nu.marginalia.renderer.MustacheRenderer;
import nu.marginalia.renderer.RendererFactory;
import nu.marginalia.search.command.SearchCommandInterface;
import nu.marginalia.search.command.SearchParameters;
import nu.marginalia.search.svc.SearchBrowseService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Response;
import java.io.IOException;
import java.util.Map;
import java.util.Optional;
import java.util.function.Predicate;
import java.util.regex.Pattern;
/**
 * Search command that handles queries of the form {@code browse:<word>}.
 *
 * <p>{@code browse:random} and {@code browse:random:<n>} ask the browse service
 * for random entries (set 0 or set {@code n}); any other word is passed to the
 * browse service as a related-entries lookup.
 */
public class BrowseCommand implements SearchCommandInterface {
    private static final String BROWSE_PREFIX = "browse:";

    private final SearchBrowseService browseService;
    private final MustacheRenderer<BrowseResultSet> browseResultsRenderer;
    private final Logger logger = LoggerFactory.getLogger(getClass());
    private final Predicate<String> queryPatternPredicate = Pattern.compile("^browse:[.A-Za-z\\-0-9:]+$").asPredicate();

    @Inject
    public BrowseCommand(SearchBrowseService browseService,
                         RendererFactory rendererFactory)
            throws IOException
    {
        this.browseService = browseService;
        browseResultsRenderer = rendererFactory.renderer("search/browse-results");
    }

    /**
     * Renders the browse results for a {@code browse:} query.
     *
     * @return the rendered page, or an empty Optional when the query is not a
     *         browse query or the lookup produced no model
     */
    @Override
    public Optional<Object> process(Response response, SearchParameters parameters) {
        if (!queryPatternPredicate.test(parameters.query())) {
            return Optional.empty();
        }

        var model = browseSite(parameters.query());
        if (null == model)
            return Optional.empty();

        return Optional.of(browseResultsRenderer.render(model,
                Map.of("query", parameters.query(),
                        "profile", parameters.profileStr(),
                        "focusDomain", model.focusDomain())
        ));
    }

    /**
     * Resolves the part of the query after {@code browse:} into a result set.
     *
     * @return the result set, or {@code null} if the lookup threw for any reason
     *         (including a malformed {@code random:<n>} set number)
     */
    private BrowseResultSet browseSite(String humanQuery) {
        // Locale.ROOT keeps the lower-casing independent of the JVM's default locale;
        // with e.g. a Turkish default locale, 'I' would otherwise become a dotless 'ı'
        // and the word would no longer match "random" or an ASCII domain name.
        String word = humanQuery.substring(BROWSE_PREFIX.length()).toLowerCase(java.util.Locale.ROOT);

        try {
            if ("random".equals(word)) {
                return browseService.getRandomEntries(0);
            }
            if (word.startsWith("random:")) {
                int set = Integer.parseInt(word.split(":")[1]);
                return browseService.getRandomEntries(set);
            }
            else {
                return browseService.getRelatedEntries(word);
            }
        }
        catch (Exception ex) {
            // Best effort: a failed lookup is reported to the caller as "no model"
            logger.info("No Results");
            return null;
        }
    }
}

View File

@ -0,0 +1,36 @@
package nu.marginalia.search.command.commands;
import com.google.inject.Inject;
import nu.marginalia.renderer.MustacheRenderer;
import nu.marginalia.renderer.RendererFactory;
import nu.marginalia.search.command.SearchCommandInterface;
import nu.marginalia.search.command.SearchParameters;
import nu.marginalia.search.svc.SearchUnitConversionService;
import spark.Response;
import java.io.IOException;
import java.util.Map;
import java.util.Optional;
/**
 * Search command that offers the query to the unit conversion service and,
 * when a conversion is produced, renders it with the conversion results template.
 */
public class ConvertCommand implements SearchCommandInterface {
    private final SearchUnitConversionService searchUnitConversionService;
    private final MustacheRenderer<Map<String, String>> conversionRenderer;

    @Inject
    public ConvertCommand(SearchUnitConversionService searchUnitConversionService, RendererFactory rendererFactory) throws IOException {
        this.searchUnitConversionService = searchUnitConversionService;
        this.conversionRenderer = rendererFactory.renderer("search/conversion-results");
    }

    /**
     * @return the rendered conversion page, or an empty Optional when the
     *         conversion service yields no result for the query
     */
    @Override
    public Optional<Object> process(Response response, SearchParameters parameters) {
        var conversion = searchUnitConversionService.tryConversion(parameters.query());
        if (conversion.isEmpty()) {
            return Optional.empty();
        }

        var rendered = conversionRenderer.render(Map.of(
                "query", parameters.query(),
                "result", conversion.get(),
                "profile", parameters.profileStr()));
        return Optional.of(rendered);
    }
}

View File

@ -0,0 +1,70 @@
package nu.marginalia.search.command.commands;
import com.google.inject.Inject;
import nu.marginalia.api.math.MathClient;
import nu.marginalia.api.math.model.DictionaryResponse;
import nu.marginalia.renderer.MustacheRenderer;
import nu.marginalia.search.command.SearchCommandInterface;
import nu.marginalia.search.command.SearchParameters;
import nu.marginalia.renderer.RendererFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Response;
import java.io.IOException;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import java.util.function.Predicate;
import java.util.regex.Pattern;
/**
 * Search command that handles queries of the form {@code define:<word>} by
 * looking the word up through the math client's dictionary lookup and
 * rendering the response with the dictionary results template.
 */
public class DefinitionCommand implements SearchCommandInterface {
    private static final String DEFINE_PREFIX = "define:";

    private final Logger logger = LoggerFactory.getLogger(getClass());
    private final MustacheRenderer<DictionaryResponse> dictionaryRenderer;
    private final MathClient mathClient;
    private final Predicate<String> queryPatternPredicate = Pattern.compile("^define:[A-Za-z\\s-0-9]+$").asPredicate();

    @Inject
    public DefinitionCommand(RendererFactory rendererFactory, MathClient mathClient)
            throws IOException
    {
        dictionaryRenderer = rendererFactory.renderer("search/dictionary-results");
        this.mathClient = mathClient;
    }

    /**
     * Renders the dictionary page for a {@code define:} query.
     *
     * @return the rendered page, or an empty Optional when the query is not a
     *         define query
     * @throws RuntimeException if the dictionary lookup fails or times out
     */
    @Override
    public Optional<Object> process(Response response, SearchParameters parameters) {
        if (!queryPatternPredicate.test(parameters.query())) {
            return Optional.empty();
        }

        var results = lookupDefinition(parameters.query());

        return Optional.of(dictionaryRenderer.render(results,
                Map.of("query", parameters.query(),
                        "profile", parameters.profileStr())
        ));
    }

    /**
     * Looks up the word following {@code define:}, waiting at most 250 ms.
     *
     * @throws RuntimeException wrapping whatever made the lookup fail
     */
    private DictionaryResponse lookupDefinition(String humanQuery) {
        // Locale.ROOT keeps the lower-casing independent of the JVM's default locale
        String word = humanQuery.substring(DEFINE_PREFIX.length()).toLowerCase(java.util.Locale.ROOT);

        try {
            return mathClient
                    .dictionaryLookup(word)
                    .get(250, TimeUnit.MILLISECONDS);
        }
        catch (Exception e) {
            if (e instanceof InterruptedException) {
                // The broad catch would otherwise swallow the interrupt; restore the
                // flag so code further up the stack can still observe it.
                Thread.currentThread().interrupt();
            }
            logger.error("Failed to lookup definition for word: {}", word, e);
            throw new RuntimeException(e);
        }
    }
}

View File

@ -0,0 +1,39 @@
package nu.marginalia.search.command.commands;
import com.google.inject.Inject;
import nu.marginalia.renderer.MustacheRenderer;
import nu.marginalia.renderer.RendererFactory;
import nu.marginalia.search.SearchOperator;
import nu.marginalia.search.command.SearchCommandInterface;
import nu.marginalia.search.command.SearchParameters;
import nu.marginalia.search.model.DecoratedSearchResults;
import spark.Response;
import java.io.IOException;
import java.util.Optional;
/**
 * Search command that runs the query through the {@link SearchOperator} and
 * renders the decorated results with the search results template.
 */
public class SearchCommand implements SearchCommandInterface {
    private final SearchOperator searchOperator;
    private final MustacheRenderer<DecoratedSearchResults> searchResultsRenderer;

    @Inject
    public SearchCommand(SearchOperator searchOperator,
                         RendererFactory rendererFactory) throws IOException {
        this.searchOperator = searchOperator;
        this.searchResultsRenderer = rendererFactory.renderer("search/search-results");
    }

    /**
     * @return the rendered results page, or an empty Optional if the thread
     *         was interrupted (the interrupt flag is restored first)
     */
    @Override
    public Optional<Object> process(Response response, SearchParameters parameters) {
        try {
            var rendered = searchResultsRenderer.render(searchOperator.doSearch(parameters));
            return Optional.of(rendered);
        }
        catch (InterruptedException ex) {
            Thread.currentThread().interrupt();
            return Optional.empty();
        }
    }
}

View File

@ -0,0 +1,50 @@
package nu.marginalia.search.command.commands;
import com.google.inject.Inject;
import nu.marginalia.search.command.SearchCommandInterface;
import nu.marginalia.search.command.SearchParameters;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Response;
import java.util.Optional;
import java.util.function.Predicate;
import java.util.regex.Pattern;
/**
 * Search command that turns {@code site:<domain>} and {@code links:<domain>}
 * queries into a redirect to the corresponding {@code /site/<domain>} view.
 */
public class SiteRedirectCommand implements SearchCommandInterface {
    private final Logger logger = LoggerFactory.getLogger(getClass());
    private final Predicate<String> queryPatternPredicate = Pattern.compile("^(site|links):[.A-Za-z\\-0-9]+$").asPredicate();

    @Inject
    public SiteRedirectCommand() {
    }

    /**
     * @return a small HTML document that meta-refreshes to the site page, or
     *         an empty Optional when the query is not a site/links query
     */
    @Override
    public Optional<Object> process(Response response, SearchParameters parameters) {
        String query = parameters.query();
        if (!queryPatternPredicate.test(query)) {
            return Optional.empty();
        }

        int idx = query.indexOf(':');
        String prefix = query.substring(0, idx);
        // Locale.ROOT keeps the lower-casing independent of the JVM's default locale;
        // with e.g. a Turkish default locale, 'I' in a domain would become a dotless 'ı'.
        String domain = query.substring(idx + 1).toLowerCase(java.util.Locale.ROOT);

        // Use an HTML redirect here, so we can use relative URLs
        String view = switch (prefix) {
            case "links" -> "links";
            default -> "info";
        };

        // The pattern above restricts the domain to [.A-Za-z0-9-], so it can be
        // placed in the markup without further escaping.
        return Optional.of("""
                <!DOCTYPE html>
                <html lang="en">
                <meta charset="UTF-8">
                <title>Redirecting...</title>
                <meta http-equiv="refresh" content="0; url=/site/%s?view=%s">
                """.formatted(domain, view)
        );
    }
}

View File

@ -0,0 +1,66 @@
package nu.marginalia.search.db;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import java.sql.ResultSet;
import java.util.ArrayList;
import java.util.List;
import java.util.function.Consumer;
/**
 * Database query that finds the ids of domains neighboring a named domain.
 */
public class DbNearDomainsQuery {

    private final HikariDataSource dataSource;

    @Inject
    public DbNearDomainsQuery(HikariDataSource dataSource) {
        this.dataSource = dataSource;
    }

    /**
     * Looks up the id of the domain named {@code term} and the ids of its
     * neighbors in {@code EC_DOMAIN_NEIGHBORS_2}.
     *
     * <p>The domain's own id, when found, comes first in the list. A neighbor
     * is included only if it is indexed and its state is ACTIVE, SOCIAL_MEDIA
     * or SPECIAL.
     *
     * @param term      the domain name to look up
     * @param onProblem receives a message when no ids at all were found
     * @return the collected domain ids, possibly empty
     * @throws RuntimeException wrapping any failure while querying the database
     */
    public List<Integer> getRelatedDomains(String term, Consumer<String> onProblem) {
        List<Integer> ret = new ArrayList<>();

        try (var conn = dataSource.getConnection();
             var selfStmt = conn.prepareStatement("""
                     SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=?
                     """);
             var stmt = conn.prepareStatement("""
                     SELECT NEIGHBOR_ID, ND.INDEXED, ND.STATE FROM EC_DOMAIN_NEIGHBORS_2
                     INNER JOIN EC_DOMAIN ND ON ND.ID=NEIGHBOR_ID
                     WHERE DOMAIN_ID=?
                     """)) {

            // -1 is the sentinel for "domain not found"
            int domainId = -1;

            selfStmt.setString(1, term);
            // Each result set gets its own scope so it is closed as soon as it is consumed
            try (ResultSet selfRs = selfStmt.executeQuery()) {
                if (selfRs.next()) {
                    domainId = selfRs.getInt(1);
                    ret.add(domainId);
                }
            }

            stmt.setInt(1, domainId);
            try (ResultSet neighborRs = stmt.executeQuery()) {
                while (neighborRs.next()) {
                    int id = neighborRs.getInt(1);
                    int indexed = neighborRs.getInt(2);
                    String state = neighborRs.getString(3);

                    if (indexed > 0 && ("ACTIVE".equalsIgnoreCase(state) || "SOCIAL_MEDIA".equalsIgnoreCase(state) || "SPECIAL".equalsIgnoreCase(state))) {
                        ret.add(id);
                    }
                }
            }
        }
        catch (Exception ex) {
            throw new RuntimeException(ex);
        }

        if (ret.isEmpty()) {
            onProblem.accept("Could not find domains adjacent " + term);
        }

        return ret;
    }
}

View File

@ -0,0 +1,14 @@
package nu.marginalia.search.exceptions;
/**
 * Thrown to abort request processing and redirect the client to another URL.
 *
 * <p>This exception is used for control flow, so it carries no stack trace:
 * none is captured when it is created, and {@link #getStackTrace()} always
 * returns an empty array.
 */
public class RedirectException extends RuntimeException {
    /** The URL the client should be redirected to. */
    public final String newUrl;

    /**
     * @param newUrl the URL the client should be redirected to
     */
    public RedirectException(String newUrl) {
        this.newUrl = newUrl;
    }

    /**
     * Skips stack trace capture. The trace is never exposed, so walking the
     * stack on every redirect would be wasted work, and without this override
     * {@code printStackTrace} would still print the captured frames.
     */
    @Override
    public synchronized Throwable fillInStackTrace() {
        return this;
    }

    @Override
    public StackTraceElement[] getStackTrace() {
        return new StackTraceElement[0];
    }
}

View File

@ -0,0 +1,102 @@
package nu.marginalia.search.model;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.idx.WordFlags;
import org.jetbrains.annotations.NotNull;
import java.util.*;
/**
 * Groups the UrlDetails of a single domain for summary display: one main
 * result ({@link #first}) followed by any additional results ({@link #rest}).
 */
public class ClusteredUrlDetails implements Comparable<ClusteredUrlDetails> {

    @NotNull
    public final UrlDetails first;

    @NotNull
    public final List<UrlDetails> rest;

    /**
     * Builds a cluster from a collection of UrlDetails. The details are put in
     * natural order; the head of that order becomes "first" and the others
     * become the "rest", after pruning.
     *
     * @param details A collection of UrlDetails, which must not be empty.
     * @throws IllegalArgumentException if {@code details} is empty
     */
    public ClusteredUrlDetails(Collection<UrlDetails> details) {
        var ordered = new ArrayList<>(details);
        if (ordered.isEmpty()) {
            throw new IllegalArgumentException("Empty list of details");
        }
        Collections.sort(ordered);

        this.first = ordered.remove(0);
        this.rest = ordered;

        final double limit = Math.min(4.0, first.termScore * 1.25);
        this.rest.removeIf(candidate -> isPrunable(candidate, limit));
    }

    /** Builds a cluster holding a single result and no additional ones. */
    public ClusteredUrlDetails(@NotNull UrlDetails onlyFirst) {
        this.first = onlyFirst;
        this.rest = Collections.emptyList();
    }

    /**
     * Decides whether an additional result is dropped from the cluster.
     *
     * <p>A result whose term score exceeds {@code scoreLimit} is never dropped.
     * Otherwise it is dropped unless one of its non-special keyword scores has
     * a Title, ExternalLink, UrlDomain, UrlPath or Subjects flag.
     */
    private static boolean isPrunable(UrlDetails candidate, double scoreLimit) {
        if (candidate.termScore > scoreLimit) {
            return false;
        }

        for (var score : candidate.resultItem.keywordScores) {
            if (score.isKeywordSpecial()) {
                continue;
            }

            boolean hasSalientFlag = score.hasTermFlag(WordFlags.Title)
                    || score.hasTermFlag(WordFlags.ExternalLink)
                    || score.hasTermFlag(WordFlags.UrlDomain)
                    || score.hasTermFlag(WordFlags.UrlPath)
                    || score.hasTermFlag(WordFlags.Subjects);
            if (hasSalientFlag) {
                return false;
            }
        }

        return true;
    }

    // For renderer use, do not remove
    public @NotNull UrlDetails getFirst() {
        return first;
    }

    // For renderer use, do not remove
    public @NotNull List<UrlDetails> getRest() {
        return rest;
    }

    /** Returns the domain of the main result. */
    public EdgeDomain getDomain() {
        return first.url.getDomain();
    }

    /** Returns true if there is at least one additional result. */
    public boolean hasMultiple() {
        return !rest.isEmpty();
    }

    /** Returns the total number of results from the same domain,
     * including such results that are not included here. */
    public int totalCount() {
        return first.resultsFromSameDomain;
    }

    /** Returns the number of same-domain results that are not held by this cluster. */
    public int remainingCount() {
        return totalCount() - 1 - rest.size();
    }

    /** Clusters are ordered by their main result. */
    @Override
    public int compareTo(@NotNull ClusteredUrlDetails o) {
        if (first == o.first) {
            return 0;
        }
        return first.compareTo(o.first);
    }
}

View File

@ -0,0 +1,186 @@
package nu.marginalia.search.model;
import nu.marginalia.search.command.SearchParameters;
import java.util.List;
/**
 * View model for the search results page. It bundles the search parameters,
 * the clustered results and the filter/pagination state for the template
 * engine that renders the page.
 */
public class DecoratedSearchResults {
    private final SearchParameters params;
    private final List<String> problems;
    private final String evalResult;

    public final List<ClusteredUrlDetails> results;

    private final String focusDomain;
    private final int focusDomainId;
    private final SearchFilters filters;
    private final List<Page> resultPages;

    public DecoratedSearchResults(SearchParameters params,
                                  List<String> problems,
                                  String evalResult,
                                  List<ClusteredUrlDetails> results,
                                  String focusDomain,
                                  int focusDomainId,
                                  SearchFilters filters,
                                  List<Page> resultPages) {
        this.params = params;
        this.problems = problems;
        this.evalResult = evalResult;
        this.results = results;
        this.focusDomain = focusDomain;
        this.focusDomainId = focusDomainId;
        this.filters = filters;
        this.resultPages = resultPages;
    }

    /** Creates an empty builder. */
    public static DecoratedSearchResultsBuilder builder() {
        return new DecoratedSearchResultsBuilder();
    }

    /** One entry in the pagination list. */
    public record Page(int number, boolean current, String href) {
    }

    public SearchParameters getParams() {
        return params;
    }

    public List<String> getProblems() {
        return problems;
    }

    public String getEvalResult() {
        return evalResult;
    }

    public List<ClusteredUrlDetails> getResults() {
        return results;
    }

    public String getFocusDomain() {
        return focusDomain;
    }

    public int getFocusDomainId() {
        return focusDomainId;
    }

    public SearchFilters getFilters() {
        return filters;
    }

    public List<Page> getResultPages() {
        return resultPages;
    }

    /** Returns true when there is more than one result page. */
    public boolean isMultipage() {
        return resultPages.size() > 1;
    }

    // The accessors below are used by the search form. They look unused in the IDE
    // but are referenced from the mustache template -- DO NOT REMOVE THEM

    public int getResultCount() {
        return results.size();
    }

    public String getQuery() {
        return params.query();
    }

    public String getProfile() {
        return params.profile().filterId;
    }

    public String getJs() {
        return params.js().value;
    }

    public String getAdtech() {
        return params.adtech().value;
    }

    public String getRecent() {
        return params.recent().value;
    }

    public String getSearchTitle() {
        return params.searchTitle().value;
    }

    public int page() {
        return params.page();
    }

    public Boolean isNewFilter() {
        return params.newFilter();
    }

    /** Fluent builder for {@link DecoratedSearchResults}. */
    public static class DecoratedSearchResultsBuilder {
        private SearchParameters params;
        private List<String> problems;
        private String evalResult;
        private List<ClusteredUrlDetails> results;
        private String focusDomain;
        private int focusDomainId;
        private SearchFilters filters;
        private List<Page> resultPages;

        DecoratedSearchResultsBuilder() {
        }

        public DecoratedSearchResultsBuilder params(SearchParameters params) {
            this.params = params;
            return this;
        }

        public DecoratedSearchResultsBuilder problems(List<String> problems) {
            this.problems = problems;
            return this;
        }

        public DecoratedSearchResultsBuilder evalResult(String evalResult) {
            this.evalResult = evalResult;
            return this;
        }

        public DecoratedSearchResultsBuilder results(List<ClusteredUrlDetails> results) {
            this.results = results;
            return this;
        }

        public DecoratedSearchResultsBuilder focusDomain(String focusDomain) {
            this.focusDomain = focusDomain;
            return this;
        }

        public DecoratedSearchResultsBuilder focusDomainId(int focusDomainId) {
            this.focusDomainId = focusDomainId;
            return this;
        }

        public DecoratedSearchResultsBuilder filters(SearchFilters filters) {
            this.filters = filters;
            return this;
        }

        public DecoratedSearchResultsBuilder resultPages(List<Page> resultPages) {
            this.resultPages = resultPages;
            return this;
        }

        /** Creates the results object from the values set so far. */
        public DecoratedSearchResults build() {
            return new DecoratedSearchResults(
                    params,
                    problems,
                    evalResult,
                    results,
                    focusDomain,
                    focusDomainId,
                    filters,
                    resultPages);
        }

        public String toString() {
            return "DecoratedSearchResults.DecoratedSearchResultsBuilder(params=" + params
                    + ", problems=" + problems
                    + ", evalResult=" + evalResult
                    + ", results=" + results
                    + ", focusDomain=" + focusDomain
                    + ", focusDomainId=" + focusDomainId
                    + ", filters=" + filters
                    + ", resultPages=" + resultPages
                    + ")";
        }
    }
}

View File

@ -0,0 +1,223 @@
package nu.marginalia.search.model;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.search.command.*;
import java.util.List;
/**
 * Models the search filters displayed next to the search results: four
 * toggleable options and three groups of profile filters, each carrying the
 * URL that applies it to the current search.
 */
public class SearchFilters {
    private final WebsiteUrl url;

    public final String currentFilter;

    // Public so that the renderer can access the data
    public final RemoveJsOption removeJsOption;
    public final ReduceAdtechOption reduceAdtechOption;
    public final ShowRecentOption showRecentOption;
    public final SearchTitleOption searchTitleOption;
    public final List<List<Filter>> filterGroups;

    public SearchFilters(WebsiteUrl url, SearchParameters parameters) {
        // Must be assigned first: the nested option and filter classes read it
        // while they are being constructed below.
        this.url = url;

        this.removeJsOption = new RemoveJsOption(parameters);
        this.reduceAdtechOption = new ReduceAdtechOption(parameters);
        this.showRecentOption = new ShowRecentOption(parameters);
        this.searchTitleOption = new SearchTitleOption(parameters);

        this.currentFilter = parameters.profile().filterId;

        List<Filter> generalGroup = List.of(
                new Filter("No Filter", SearchProfile.NO_FILTER, parameters),
                // new Filter("Popular", SearchProfile.POPULAR, parameters),
                new Filter("Small Web", SearchProfile.SMALLWEB, parameters),
                new Filter("Blogosphere", SearchProfile.BLOGOSPHERE, parameters),
                new Filter("Academia", SearchProfile.ACADEMIA, parameters)
        );
        List<Filter> formatGroup = List.of(
                new Filter("Vintage", SearchProfile.VINTAGE, parameters),
                new Filter("Plain Text", SearchProfile.PLAIN_TEXT, parameters),
                new Filter("~tilde", SearchProfile.TILDE, parameters)
        );
        List<Filter> topicGroup = List.of(
                new Filter("Wiki", SearchProfile.WIKI, parameters),
                new Filter("Forum", SearchProfile.FORUM, parameters),
                new Filter("Docs", SearchProfile.DOCS, parameters),
                new Filter("Recipes", SearchProfile.FOOD, parameters)
        );

        this.filterGroups = List.of(generalGroup, formatGroup, topicGroup);
    }

    // The getters exist so that the renderer can access the data

    public String getCurrentFilter() {
        return currentFilter;
    }

    public RemoveJsOption getRemoveJsOption() {
        return removeJsOption;
    }

    public ReduceAdtechOption getReduceAdtechOption() {
        return reduceAdtechOption;
    }

    public ShowRecentOption getShowRecentOption() {
        return showRecentOption;
    }

    public SearchTitleOption getSearchTitleOption() {
        return searchTitleOption;
    }

    public List<List<Filter>> getFilterGroups() {
        return filterGroups;
    }

    /** Toggle for the "Remove Javascript" option; {@link #url} flips its state. */
    public class RemoveJsOption {
        private final SearchJsParameter value;

        public final String url;

        public RemoveJsOption(SearchParameters parameters) {
            this.value = parameters.js();

            SearchJsParameter flipped = switch (value) {
                case DENY_JS -> SearchJsParameter.DEFAULT;
                default -> SearchJsParameter.DENY_JS;
            };

            this.url = parameters.withJs(flipped).renderUrl(SearchFilters.this.url);
        }

        public String getUrl() {
            return url;
        }

        public boolean isSet() {
            return value == SearchJsParameter.DENY_JS;
        }

        public String name() {
            return "Remove Javascript";
        }
    }

    /** Toggle for the "Reduce Adtech" option; {@link #url} flips its state. */
    public class ReduceAdtechOption {
        private final SearchAdtechParameter value;

        public final String url;

        public ReduceAdtechOption(SearchParameters parameters) {
            this.value = parameters.adtech();

            SearchAdtechParameter flipped = switch (value) {
                case REDUCE -> SearchAdtechParameter.DEFAULT;
                default -> SearchAdtechParameter.REDUCE;
            };

            this.url = parameters.withAdtech(flipped).renderUrl(SearchFilters.this.url);
        }

        public String getUrl() {
            return url;
        }

        public boolean isSet() {
            return value == SearchAdtechParameter.REDUCE;
        }

        public String name() {
            return "Reduce Adtech";
        }
    }

    /** Toggle for the "Recent Results" option; {@link #url} flips its state. */
    public class ShowRecentOption {
        private final SearchRecentParameter value;

        public final String url;

        public ShowRecentOption(SearchParameters parameters) {
            this.value = parameters.recent();

            SearchRecentParameter flipped = switch (value) {
                case RECENT -> SearchRecentParameter.DEFAULT;
                default -> SearchRecentParameter.RECENT;
            };

            this.url = parameters.withRecent(flipped).renderUrl(SearchFilters.this.url);
        }

        public String getUrl() {
            return url;
        }

        public boolean isSet() {
            return value == SearchRecentParameter.RECENT;
        }

        public String name() {
            return "Recent Results";
        }
    }

    /** Toggle for the "Search In Title" option; {@link #url} flips its state. */
    public class SearchTitleOption {
        private final SearchTitleParameter value;

        public final String url;

        public SearchTitleOption(SearchParameters parameters) {
            this.value = parameters.searchTitle();

            SearchTitleParameter flipped = switch (value) {
                case TITLE -> SearchTitleParameter.DEFAULT;
                default -> SearchTitleParameter.TITLE;
            };

            this.url = parameters.withTitle(flipped).renderUrl(SearchFilters.this.url);
        }

        public String getUrl() {
            return url;
        }

        public boolean isSet() {
            return value == SearchTitleParameter.TITLE;
        }

        public String name() {
            return "Search In Title";
        }
    }

    /** A selectable search profile, with the URL that switches the search to it. */
    public class Filter {
        public final SearchProfile profile;

        public final String displayName;
        public final boolean current;
        public final String url;

        public Filter(String displayName, SearchProfile profile, SearchParameters parameters) {
            this.displayName = displayName;
            this.profile = profile;

            // Marks the filter matching the profile of the current search
            this.current = profile.equals(parameters.profile());

            this.url = parameters.withProfile(profile).renderUrl(SearchFilters.this.url);
        }

        public String getDisplayName() {
            return displayName;
        }

        public boolean isCurrent() {
            return current;
        }

        public String getUrl() {
            return url;
        }
    }
}

View File

@ -0,0 +1,105 @@
package nu.marginalia.search.model;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier;
import java.util.Objects;
/**
 * The search profiles a user can pick. Each has a filter id, a search set
 * identifier, and optionally extra query terms and year/size/quality limits.
 */
public enum SearchProfile {
    POPULAR("default", SearchSetIdentifier.POPULAR),
    SMALLWEB("modern", SearchSetIdentifier.SMALLWEB),
    BLOGOSPHERE("blogosphere", SearchSetIdentifier.BLOGS),
    NO_FILTER("corpo", SearchSetIdentifier.NONE),
    VINTAGE("vintage", SearchSetIdentifier.NONE),
    TILDE("tilde", SearchSetIdentifier.NONE),
    CORPO_CLEAN("corpo-clean", SearchSetIdentifier.NONE),
    ACADEMIA("academia", SearchSetIdentifier.NONE),
    PLAIN_TEXT("plain-text", SearchSetIdentifier.NONE),
    FOOD("food", SearchSetIdentifier.POPULAR),
    FORUM("forum", SearchSetIdentifier.NONE),
    WIKI("wiki", SearchSetIdentifier.NONE),
    DOCS("docs", SearchSetIdentifier.NONE),
    ;

    public final String filterId;
    public final SearchSetIdentifier searchSetIdentifier;

    SearchProfile(String filterId, SearchSetIdentifier searchSetIdentifier) {
        this.filterId = filterId;
        this.searchSetIdentifier = searchSetIdentifier;
    }

    // Cached so that lookups do not clone the values array on every call
    private final static SearchProfile[] values = values();

    /**
     * Finds the profile with the given filter id.
     *
     * @param param the filter id, may be null
     * @return the matching profile, or {@link #NO_FILTER} if the id is null
     *         or matches no profile
     */
    public static SearchProfile getSearchProfile(String param) {
        if (param != null) {
            for (SearchProfile candidate : values) {
                if (Objects.equals(candidate.filterId, param)) {
                    return candidate;
                }
            }
        }
        return NO_FILTER;
    }

    /**
     * Adds the terms implied by this profile to the query. Profiles without
     * an entry below add nothing.
     */
    public void addTacitTerms(SearchQuery subquery) {
        switch (this) {
            case ACADEMIA -> subquery.searchTermsAdvice.add("special:academia");
            case VINTAGE -> {
                subquery.searchTermsPriority.add("format:html123");
                subquery.searchTermsPriority.add("js:false");
            }
            case TILDE -> subquery.searchTermsAdvice.add("special:tilde");
            case PLAIN_TEXT -> subquery.searchTermsAdvice.add("format:plain");
            case WIKI -> subquery.searchTermsAdvice.add("generator:wiki");
            case FORUM -> subquery.searchTermsAdvice.add("generator:forum");
            case DOCS -> subquery.searchTermsAdvice.add("generator:docs");
            case FOOD -> {
                subquery.searchTermsAdvice.add(HtmlFeature.CATEGORY_FOOD.getKeyword());
                subquery.searchTermsExclude.add("special:ads");
            }
            default -> {
                // no tacit terms for the remaining profiles
            }
        }
    }

    /** Returns the year limit of this profile; only SMALLWEB and VINTAGE have one. */
    public SpecificationLimit getYearLimit() {
        return switch (this) {
            case SMALLWEB -> SpecificationLimit.greaterThan(2015);
            case VINTAGE -> SpecificationLimit.lessThan(2003);
            default -> SpecificationLimit.none();
        };
    }

    /** Returns the size limit of this profile; only SMALLWEB has one. */
    public SpecificationLimit getSizeLimit() {
        return this == SMALLWEB
                ? SpecificationLimit.lessThan(500)
                : SpecificationLimit.none();
    }

    /** Returns the quality limit of this profile; only SMALLWEB has one. */
    public SpecificationLimit getQualityLimit() {
        return this == SMALLWEB
                ? SpecificationLimit.lessThan(5)
                : SpecificationLimit.none();
    }
}

View File

@ -0,0 +1,293 @@
package nu.marginalia.search.model;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.DomainIndexingState;
import nu.marginalia.model.crawl.HtmlFeature;
import java.util.ArrayList;
import java.util.List;
/**
* A class to hold details about a single search result.
*/
public class UrlDetails implements Comparable<UrlDetails> {
public long id;
public int domainId;
public EdgeUrl url;
public String title;
public String description;
public String format;
public int features;
public DomainIndexingState domainState;
public double termScore;
public int resultsFromSameDomain;
public String positions;
public int positionsCount;
public SearchResultItem resultItem;
public List<SearchResultKeywordScore> keywordScores;
public UrlDetails(long id, int domainId, EdgeUrl url, String title, String description, String format, int features, DomainIndexingState domainState, double termScore, int resultsFromSameDomain, String positions, int positionsCount, SearchResultItem resultItem, List<SearchResultKeywordScore> keywordScores) {
this.id = id;
this.domainId = domainId;
this.url = url;
this.title = title;
this.description = description;
this.format = format;
this.features = features;
this.domainState = domainState;
this.termScore = termScore;
this.resultsFromSameDomain = resultsFromSameDomain;
this.positions = positions;
this.positionsCount = positionsCount;
this.resultItem = resultItem;
this.keywordScores = keywordScores;
}
public UrlDetails() {
}
public boolean hasMoreResults() {
return resultsFromSameDomain > 1;
}
public String getFormat() {
if (null == format) {
return "?";
}
switch (format) {
case "HTML123":
return "HTML 1-3";
case "HTML4":
return "HTML 4";
case "XHTML":
return "XHTML";
case "HTML5":
return "HTML 5";
case "PLAIN":
return "Plain Text";
default:
return "?";
}
}
public int hashCode() {
return Long.hashCode(id);
}
@Override
public int compareTo(UrlDetails other) {
int result = Double.compare(getTermScore(), other.getTermScore());
if (result == 0) result = Long.compare(getId(), other.getId());
return result;
}
public boolean equals(Object other) {
if (other == null) {
return false;
}
if (other == this) {
return true;
}
if (other instanceof UrlDetails) {
return ((UrlDetails) other).id == id;
}
return false;
}
public String getTitle() {
if (title == null || title.isBlank()) {
return url.toString();
}
return title;
}
public boolean isPlainText() {
return "PLAIN".equals(format);
}
public int getProblemCount() {
int mask = HtmlFeature.JS.getFeatureBit()
| HtmlFeature.COOKIES.getFeatureBit()
| HtmlFeature.TRACKING.getFeatureBit()
| HtmlFeature.AFFILIATE_LINK.getFeatureBit()
| HtmlFeature.TRACKING_ADTECH.getFeatureBit()
| HtmlFeature.ADVERTISEMENT.getFeatureBit();
return Integer.bitCount(features & mask);
}
public List<UrlProblem> getProblems() {
List<UrlProblem> problems = new ArrayList<>();
if (isScripts()) {
problems.add(new UrlProblem("Js", "The page uses Javascript"));
}
if (isCookies()) {
problems.add(new UrlProblem("Co", "The page uses Cookies"));
}
if (isTracking()) {
problems.add(new UrlProblem("Tr", "The page uses Tracking/Analytics"));
}
if (isAffiliate()) {
problems.add(new UrlProblem("Af", "The page may use Affiliate Linking"));
}
if (isAds()) {
problems.add(new UrlProblem("Ad", "The page uses Ads/Adtech Tracking"));
}
return problems;
}
public boolean isScripts() {
return HtmlFeature.hasFeature(features, HtmlFeature.JS);
}
public boolean isTracking() {
return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING);
}
public boolean isAffiliate() {
return HtmlFeature.hasFeature(features, HtmlFeature.AFFILIATE_LINK);
}
public boolean isMedia() {
return HtmlFeature.hasFeature(features, HtmlFeature.MEDIA);
}
public boolean isCookies() {
return HtmlFeature.hasFeature(features, HtmlFeature.COOKIES);
}
public boolean isAds() {
return HtmlFeature.hasFeature(features, HtmlFeature.TRACKING_ADTECH);
}
public int getMatchRank() {
if (termScore <= 1) return 1;
if (termScore <= 2) return 2;
if (termScore <= 3) return 3;
if (termScore <= 5) return 5;
return 10;
}
public long getId() {
return this.id;
}
public int getDomainId() {
return this.domainId;
}
public EdgeUrl getUrl() {
return this.url;
}
public String getDescription() {
return this.description;
}
public int getFeatures() {
return this.features;
}
public DomainIndexingState getDomainState() {
return this.domainState;
}
public double getTermScore() {
return this.termScore;
}
public int getResultsFromSameDomain() {
return this.resultsFromSameDomain;
}
public String getPositions() {
return this.positions;
}
public int getPositionsCount() {
return this.positionsCount;
}
public SearchResultItem getResultItem() {
return this.resultItem;
}
public List<SearchResultKeywordScore> getKeywordScores() {
return this.keywordScores;
}
public UrlDetails withId(long id) {
return this.id == id ? this : new UrlDetails(id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
}
public UrlDetails withDomainId(int domainId) {
return this.domainId == domainId ? this : new UrlDetails(this.id, domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
}
public UrlDetails withUrl(EdgeUrl url) {
return this.url == url ? this : new UrlDetails(this.id, this.domainId, url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
}
public UrlDetails withTitle(String title) {
return this.title == title ? this : new UrlDetails(this.id, this.domainId, this.url, title, this.description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
}
public UrlDetails withDescription(String description) {
return this.description == description ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, description, this.format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
}
public UrlDetails withFormat(String format) {
return this.format == format ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, format, this.features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
}
public UrlDetails withFeatures(int features) {
return this.features == features ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, features, this.domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
}
public UrlDetails withDomainState(DomainIndexingState domainState) {
return this.domainState == domainState ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, domainState, this.termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
}
public UrlDetails withTermScore(double termScore) {
return this.termScore == termScore ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, termScore, this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
}
public UrlDetails withResultsFromSameDomain(int resultsFromSameDomain) {
return this.resultsFromSameDomain == resultsFromSameDomain ? this : new UrlDetails(this.id, this.domainId, this.url, this.title, this.description, this.format, this.features, this.domainState, this.termScore, resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem, this.keywordScores);
}
/**
 * Returns a copy of this object with {@code positions} replaced.
 * The current instance is returned when the supplied reference is identical
 * ({@code ==}) to the stored one.
 */
public UrlDetails withPositions(String positions) {
    if (this.positions == positions) {
        return this;
    }
    return new UrlDetails(this.id, this.domainId, this.url, this.title, this.description,
            this.format, this.features, this.domainState, this.termScore,
            this.resultsFromSameDomain,
            positions,
            this.positionsCount, this.resultItem, this.keywordScores);
}
/**
 * Returns a copy of this object with {@code positionsCount} replaced,
 * or this instance when the value is unchanged.
 */
public UrlDetails withPositionsCount(int positionsCount) {
    if (this.positionsCount == positionsCount) {
        return this;
    }
    return new UrlDetails(this.id, this.domainId, this.url, this.title, this.description,
            this.format, this.features, this.domainState, this.termScore,
            this.resultsFromSameDomain, this.positions,
            positionsCount,
            this.resultItem, this.keywordScores);
}
/**
 * Returns a copy of this object with {@code resultItem} replaced,
 * or this instance when the supplied reference is identical to the stored one.
 */
public UrlDetails withResultItem(SearchResultItem resultItem) {
    if (this.resultItem == resultItem) {
        return this;
    }
    return new UrlDetails(this.id, this.domainId, this.url, this.title, this.description,
            this.format, this.features, this.domainState, this.termScore,
            this.resultsFromSameDomain, this.positions, this.positionsCount,
            resultItem,
            this.keywordScores);
}
/**
 * Returns a copy of this object with {@code keywordScores} replaced,
 * or this instance when the supplied list is the very same reference.
 */
public UrlDetails withKeywordScores(List<SearchResultKeywordScore> keywordScores) {
    if (this.keywordScores == keywordScores) {
        return this;
    }
    return new UrlDetails(this.id, this.domainId, this.url, this.title, this.description,
            this.format, this.features, this.domainState, this.termScore,
            this.resultsFromSameDomain, this.positions, this.positionsCount, this.resultItem,
            keywordScores);
}
/** Renders every property as {@code UrlDetails(name=value, ...)}, reading each one through its getter. */
public String toString() {
    StringBuilder text = new StringBuilder("UrlDetails(id=");
    text.append(getId());
    text.append(", domainId=").append(getDomainId());
    text.append(", url=").append(getUrl());
    text.append(", title=").append(getTitle());
    text.append(", description=").append(getDescription());
    text.append(", format=").append(getFormat());
    text.append(", features=").append(getFeatures());
    text.append(", domainState=").append(getDomainState());
    text.append(", termScore=").append(getTermScore());
    text.append(", resultsFromSameDomain=").append(getResultsFromSameDomain());
    text.append(", positions=").append(getPositions());
    text.append(", positionsCount=").append(getPositionsCount());
    text.append(", resultItem=").append(getResultItem());
    text.append(", keywordScores=").append(getKeywordScores());
    text.append(")");
    return text.toString();
}
/**
 * A named problem paired with a human-readable description.
 * Nested records are implicitly static, so no explicit modifier is needed.
 */
public record UrlProblem(String name, String description) {}
}

View File

@ -0,0 +1,27 @@
package nu.marginalia.search.results;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.browse.model.BrowseResult;
import nu.marginalia.screenshot.ScreenshotService;
import java.util.HashSet;
import java.util.Set;
import java.util.function.Predicate;
/**
 * Produces filters for cleaning up lists of {@link BrowseResult}s:
 * results lacking a screenshot and repeated domain hashes are marked for removal.
 */
@Singleton
public class BrowseResultCleaner {

    private final ScreenshotService screenshotService;

    @Inject
    public BrowseResultCleaner(ScreenshotService screenshotService) {
        this.screenshotService = screenshotService;
    }

    /**
     * Creates a new, stateful predicate that answers {@code true} for results that should be removed.
     * <p>
     * A result is removed when it has no screenshot, or when its domain hash has already
     * been seen by this particular predicate instance. Results without a screenshot are
     * not recorded as seen.
     *
     * @return a fresh predicate with its own set of seen domain hashes
     */
    public Predicate<BrowseResult> shouldRemoveResultPredicateBr() {
        final Set<String> seenDomainHashes = new HashSet<>(100);

        return candidate -> {
            if (!screenshotService.hasScreenshot(candidate.domainId())) {
                return true;
            }
            // Set.add() reports false when the hash was already present, i.e. a duplicate
            boolean firstOccurrence = seenDomainHashes.add(candidate.domainHash());
            return !firstOccurrence;
        };
    }
}

View File

@ -0,0 +1,69 @@
package nu.marginalia.search.results;
import gnu.trove.list.TLongList;
import gnu.trove.list.array.TLongArrayList;
import gnu.trove.map.hash.TObjectIntHashMap;
import gnu.trove.set.hash.TIntHashSet;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.lsh.EasyLSH;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Objects;
/**
 * Stateful filter that weeds out duplicate search results.
 * <p>
 * Three checks are applied in order, each remembering what it has seen:
 * a hash of URL path and title, a locality-sensitive hash comparison of the
 * document data, and a cap on the number of results per domain key.
 * Instances keep mutable state and are meant to be used for one result set.
 */
public class UrlDeduplicator {
    /** Hashes whose Hamming distance is below this value are treated as the same document. */
    private static final int LSH_SIMILARITY_THRESHOLD = 2;
    private static final Logger logger = LoggerFactory.getLogger(UrlDeduplicator.class);

    private final TIntHashSet seenSuperficialHashes = new TIntHashSet(200);
    private final TLongList seenLSHList = new TLongArrayList(200);
    private final TObjectIntHashMap<String> keyCount = new TObjectIntHashMap<>(200, 0.75f, 0);

    private final int resultsPerKey;

    /**
     * @param resultsPerKey maximum number of results accepted for any single domain key
     */
    public UrlDeduplicator(int resultsPerKey) {
        this.resultsPerKey = resultsPerKey;
    }

    /**
     * Decides whether the result should be dropped, updating the internal state
     * of the checks that get evaluated. The checks short-circuit: a later check
     * is only consulted (and only records the item) if the earlier ones passed.
     */
    public boolean shouldRemove(DecoratedSearchResultItem details) {
        boolean keep = deduplicateOnSuperficialHash(details)
                && deduplicateOnLSH(details)
                && limitResultsPerDomain(details);
        return !keep;
    }

    /** Negation of {@link #shouldRemove(DecoratedSearchResultItem)}. */
    public boolean shouldRetain(DecoratedSearchResultItem details) {
        return !shouldRemove(details);
    }

    /** @return true if the (path, title) combination has not been seen before */
    private boolean deduplicateOnSuperficialHash(DecoratedSearchResultItem details) {
        int superficialHash = Objects.hash(details.url.path, details.title);
        return seenSuperficialHashes.add(superficialHash);
    }

    /**
     * @return true if the data hash is absent (zero) or sufficiently far from
     *         every previously accepted hash; accepted hashes are remembered
     */
    private boolean deduplicateOnLSH(DecoratedSearchResultItem details) {
        final long thisHash = details.dataHash;

        if (thisHash == 0) {
            return true;
        }

        // Trove's forEach stops at, and reports, the first stored hash for which the procedure is false
        boolean distinctFromAll = seenLSHList.forEach(
                otherHash -> EasyLSH.hammingDistance(thisHash, otherHash) >= LSH_SIMILARITY_THRESHOLD);

        if (!distinctFromAll) {
            return false;
        }

        seenLSHList.add(thisHash);
        return true;
    }

    /** @return true while the running count for the item's domain key is within the limit */
    private boolean limitResultsPerDomain(DecoratedSearchResultItem details) {
        String domainKey = details.getUrl().getDomain().getDomainKey();
        int countForKey = keyCount.adjustOrPutValue(domainKey, 1, 1);
        return countForKey <= resultsPerKey;
    }
}

View File

@ -0,0 +1,69 @@
package nu.marginalia.search.svc;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.db.DbDomainQueries;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
import spark.Spark;
import java.sql.SQLException;
/**
 * Handles user requests to suggest a domain for crawling by inserting it
 * into the CRAWL_QUEUE table.
 */
public class SearchAddToCrawlQueueService {
    private final DbDomainQueries domainQueries;
    private final WebsiteUrl websiteUrl;
    private final HikariDataSource dataSource;
    private final Logger logger = LoggerFactory.getLogger(SearchAddToCrawlQueueService.class);

    @Inject
    public SearchAddToCrawlQueueService(DbDomainQueries domainQueries,
                                        WebsiteUrl websiteUrl,
                                        HikariDataSource dataSource) {
        this.domainQueries = domainQueries;
        this.websiteUrl = websiteUrl;
        this.dataSource = dataSource;
    }

    /**
     * Request handler: adds the domain identified by the {@code id} query parameter
     * to the crawl queue, provided the {@code nomisclick} checkbox is "on", and then
     * redirects to the domain's site page.
     * <p>
     * Halts with 400 if {@code id} is missing or not an integer, and with 404 if no
     * domain has that id.
     *
     * @return an empty body; the response is a redirect
     * @throws SQLException if the insert fails
     */
    public Object suggestCrawling(Request request, Response response) throws SQLException {
        logger.info("{}", request.queryParams());

        int id = parseDomainId(request);
        boolean nomisclick = "on".equals(request.queryParams("nomisclick"));

        String domainName = getDomainName(id);

        if (nomisclick) {
            logger.info("Adding {} to crawl queue", domainName);
            addToCrawlQueue(id);
        }
        else {
            logger.info("Nomisclick not set, not adding {} to crawl queue", domainName);
        }

        response.redirect(websiteUrl.withPath("/site/" + domainName));

        return "";
    }

    /** Reads the {@code id} query parameter, halting with 400 when it is absent or malformed. */
    private int parseDomainId(Request request) {
        try {
            // parseInt(null) also raises NumberFormatException, so a missing parameter ends up here too
            return Integer.parseInt(request.queryParams("id"));
        }
        catch (NumberFormatException ex) {
            Spark.halt(400);
            throw ex; // not reached, halt() throws; keeps the compiler satisfied
        }
    }

    /** Inserts the domain with the given id into the crawl queue; existing entries are left alone (INSERT IGNORE). */
    private void addToCrawlQueue(int id) throws SQLException {
        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("""
                     INSERT IGNORE INTO CRAWL_QUEUE(DOMAIN_NAME, SOURCE)
                     SELECT DOMAIN_NAME, "user" FROM EC_DOMAIN WHERE ID=?
                     """)) {
            stmt.setInt(1, id);
            stmt.executeUpdate();
        }
    }

    /** Looks up the domain name for the id, halting with 404 when it does not exist. */
    private String getDomainName(int id) {
        var domain = domainQueries.getDomain(id);
        if (domain.isEmpty()) {
            Spark.halt(404);
        }
        return domain.get().toString();
    }
}

View File

@ -0,0 +1,87 @@
package nu.marginalia.search.svc;
import com.google.inject.Inject;
import nu.marginalia.api.domains.DomainInfoClient;
import nu.marginalia.api.domains.model.SimilarDomain;
import nu.marginalia.browse.DbBrowseDomainsRandom;
import nu.marginalia.browse.model.BrowseResult;
import nu.marginalia.browse.model.BrowseResultSet;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.db.DomainBlacklist;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.search.results.BrowseResultCleaner;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import static java.util.Collections.shuffle;
/**
 * Backs the "browse" views: random domains, and domains related to a given domain.
 */
public class SearchBrowseService {
    private final DbBrowseDomainsRandom randomDomains;
    private final DbDomainQueries domainQueries;
    private final DomainBlacklist blacklist;
    private final DomainInfoClient domainInfoClient;
    private final BrowseResultCleaner browseResultCleaner;

    @Inject
    public SearchBrowseService(DbBrowseDomainsRandom randomDomains,
                               DbDomainQueries domainQueries,
                               DomainBlacklist blacklist,
                               DomainInfoClient domainInfoClient,
                               BrowseResultCleaner browseResultCleaner)
    {
        this.randomDomains = randomDomains;
        this.domainQueries = domainQueries;
        this.blacklist = blacklist;
        this.domainInfoClient = domainInfoClient;
        this.browseResultCleaner = browseResultCleaner;
    }

    /**
     * Fetches up to 25 random domains from the given set and strips the entries
     * rejected by {@link BrowseResultCleaner#shouldRemoveResultPredicateBr()}.
     */
    public BrowseResultSet getRandomEntries(int set) {
        List<BrowseResult> results = randomDomains.getRandomDomains(25, blacklist, set);

        results.removeIf(browseResultCleaner.shouldRemoveResultPredicateBr());

        return new BrowseResultSet(results);
    }

    /**
     * Finds domains related to {@code domainName} that have a screenshot,
     * returned in random order.
     * <p>
     * Similar domains are requested first; if fewer than 25 remain after filtering,
     * linked domains are merged in as well. Each remote call is given 100 ms.
     *
     * @throws ExecutionException   if a remote lookup fails
     * @throws InterruptedException if interrupted while waiting
     * @throws TimeoutException     if a remote lookup exceeds its deadline
     */
    public BrowseResultSet getRelatedEntries(String domainName) throws ExecutionException, InterruptedException, TimeoutException {
        var domain = domainQueries.getDomainId(new EdgeDomain(domainName));

        var neighbors = domainInfoClient.similarDomains(domain, 50)
                .get(100, TimeUnit.MILLISECONDS);
        neighbors.removeIf(sd -> !sd.screenshot());

        // If the results are very few, supplement with linked domains
        if (neighbors.size() < 25) {
            // The set de-duplicates entries that appear in both sources
            Set<SimilarDomain> allNeighbors = new HashSet<>(neighbors);
            allNeighbors.addAll(domainInfoClient
                    .linkedDomains(domain, 50)
                    .get(100, TimeUnit.MILLISECONDS)
            );

            neighbors.clear();
            neighbors.addAll(allNeighbors);
            neighbors.removeIf(sd -> !sd.screenshot());
        }

        List<BrowseResult> results = new ArrayList<>(neighbors.size());
        for (SimilarDomain sd : neighbors) {
            var resultDomain = domainQueries.getDomain(sd.domainId());
            if (resultDomain.isEmpty())
                continue;

            results.add(new BrowseResult(resultDomain.get().toRootUrlHttp(), sd.domainId(), 0, sd.screenshot()));
        }

        // Shuffle the items for a less repetitive experience. This must act on the
        // list that is actually returned; shuffling 'neighbors' at this point would
        // have no visible effect since 'results' has already been built from it.
        shuffle(results);

        return new BrowseResultSet(results, domainName);
    }
}

View File

@ -0,0 +1,69 @@
package nu.marginalia.search.svc;
import com.google.inject.Inject;
import nu.marginalia.renderer.MustacheRenderer;
import nu.marginalia.renderer.RendererFactory;
import nu.marginalia.search.SearchOperator;
import nu.marginalia.search.model.UrlDetails;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
import java.io.IOException;
import java.sql.SQLException;
import java.util.List;
/**
 * Renders the "crosstalk" view, which lists the links between two domains in both directions.
 */
public class SearchCrosstalkService {
    private static final Logger logger = LoggerFactory.getLogger(SearchCrosstalkService.class);
    private final SearchOperator searchOperator;
    private final MustacheRenderer<CrosstalkResult> renderer;

    @Inject
    public SearchCrosstalkService(SearchOperator searchOperator,
                                  RendererFactory rendererFactory) throws IOException
    {
        this.searchOperator = searchOperator;
        this.renderer = rendererFactory.renderer("search/site-info/site-crosstalk");
    }

    /**
     * Request handler. Expects a {@code domains} query parameter holding exactly two
     * comma-separated domain names, and renders the link search results for each direction.
     *
     * @throws IllegalArgumentException if the parameter is missing or does not contain exactly two domains
     */
    public Object handle(Request request, Response response) throws SQLException {
        String domains = request.queryParams("domains");
        String[] parts = StringUtils.split(domains, ',');

        // StringUtils.split yields null for a null input, so a missing parameter
        // must be rejected here rather than dereferenced
        if (parts == null || parts.length != 2) {
            throw new IllegalArgumentException("Expected exactly two domains");
        }

        response.type("text/html");

        for (int i = 0; i < parts.length; i++) {
            parts[i] = parts[i].trim();
        }

        var resAtoB = searchOperator.doLinkSearch(parts[0], parts[1]);
        var resBtoA = searchOperator.doLinkSearch(parts[1], parts[0]);

        var model = new CrosstalkResult(parts[0], parts[1], resAtoB, resBtoA);

        return renderer.render(model);
    }

    /** Template model: the two domains along with the results of the link search in each direction. */
    private record CrosstalkResult(String domainA,
                                   String domainB,
                                   List<UrlDetails> forward,
                                   List<UrlDetails> backward)
    {
        public boolean isFocusDomain() {
            return true; // Hack to get the search result templates behave well
        }

        /** True when both directions produced at least one result. */
        public boolean hasBoth() {
            return !forward.isEmpty() && !backward.isEmpty();
        }
    }
}

View File

@ -0,0 +1,47 @@
package nu.marginalia.search.svc;
import com.google.inject.Inject;
import nu.marginalia.index.api.IndexMqClient;
import nu.marginalia.renderer.MustacheRenderer;
import nu.marginalia.renderer.RendererFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
import java.io.IOException;
import java.util.Map;
/**
 * Renders the error page shown when a search request could not be served.
 */
public class SearchErrorPageService {
    private final IndexMqClient indexMqClient;
    private final Logger logger = LoggerFactory.getLogger(getClass());
    private final MustacheRenderer<Object> renderer;

    @Inject
    public SearchErrorPageService(IndexMqClient indexMqClient,
                                  RendererFactory rendererFactory) throws IOException {
        this.renderer = rendererFactory.renderer("search/error-page-search");
        this.indexMqClient = indexMqClient;
    }

    /** Sets the response body to a rendered "Internal error" page. */
    public void serveError(Request request, Response rsp) {
        final String message = """
                An error occurred when communicating with the search engine index.
                <p>
                This is hopefully a temporary state of affairs. It may be due to
                an upgrade. The index typically takes a about two or three minutes
                to reload from a cold restart. Thanks for your patience.
                """;

        final String page = renderError(request, "Internal error", message);

        rsp.body(page);
    }

    /**
     * Renders the error template with the given title and message, carrying over the
     * request's {@code profile}, {@code js} and {@code query} parameters (empty when absent).
     */
    private String renderError(Request request, String title, String message) {
        Map<String, String> model = Map.of(
                "title", title,
                "message", message,
                "profile", request.queryParamOrDefault("profile", ""),
                "js", request.queryParamOrDefault("js", ""),
                "query", request.queryParamOrDefault("query", "")
        );

        return renderer.render(model);
    }
}

View File

@ -0,0 +1,85 @@
package nu.marginalia.search.svc;
import com.google.inject.Inject;
import com.zaxxer.hikari.HikariDataSource;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;
/**
 * Stores and retrieves user complaints ("flags") about domains.
 * The admin-facing counterpart of this code is DomainComplaintService in control-service.
 */
public class SearchFlagSiteService {
    private final HikariDataSource dataSource;

    /** Substitute used when a stored complaint has a category name that is not in {@link #categories}. */
    private final CategoryItem unknownCategory = new CategoryItem("unknown", "Unknown");

    private final List<CategoryItem> categories = List.of(
            new CategoryItem("spam", "Spam"),
            new CategoryItem("freebooting", "Reposting Stolen Content"),
            new CategoryItem("broken", "Broken Website"),
            new CategoryItem("shock", "Shocking/Offensive"),
            new CategoryItem("blacklist", "Review Blacklisting"),
            new CategoryItem("no-random", "Remove from Random Exploration")
    );

    /** The categories indexed by name. */
    private final Map<String, CategoryItem> categoryItemMap = categories
            .stream()
            .collect(Collectors.toMap(CategoryItem::categoryName, Function.identity()));

    @Inject
    public SearchFlagSiteService(HikariDataSource dataSource) {
        this.dataSource = dataSource;
    }

    /** @return the complaint categories that can be chosen */
    public List<CategoryItem> getCategories() {
        return categories;
    }

    /**
     * Loads the complaints that have been filed against a domain.
     *
     * @param id the domain id
     * @return one model per row in DOMAIN_COMPLAINT for the domain, with the category
     *         name translated into its description
     * @throws SQLException if the query fails
     */
    public List<FlagSiteComplaintModel> getExistingComplaints(int id) throws SQLException {
        try (var conn = dataSource.getConnection();
             var query = conn.prepareStatement("""
                     SELECT CATEGORY, FILE_DATE, REVIEWED, DECISION
                     FROM DOMAIN_COMPLAINT
                     WHERE DOMAIN_ID=?
                     """))
        {
            List<FlagSiteComplaintModel> found = new ArrayList<>();

            query.setInt(1, id);
            ResultSet rows = query.executeQuery();

            while (rows.next()) {
                found.add(toComplaintModel(rows));
            }

            return found;
        }
    }

    /** Maps the current row of the complaint query onto a model object. */
    private FlagSiteComplaintModel toComplaintModel(ResultSet row) throws SQLException {
        CategoryItem category = categoryItemMap.getOrDefault(row.getString(1), unknownCategory);
        String submitTime = row.getString(2);
        boolean reviewed = row.getBoolean(3);
        String decision = row.getString(4);

        return new FlagSiteComplaintModel(category.categoryDesc(), submitTime, reviewed, decision);
    }

    /**
     * Persists a new complaint.
     *
     * @throws SQLException if the insert fails
     */
    public void insertComplaint(FlagSiteFormData formData) throws SQLException {
        try (var conn = dataSource.getConnection();
             var insert = conn.prepareStatement(
                     """
                     INSERT INTO DOMAIN_COMPLAINT(DOMAIN_ID, CATEGORY, DESCRIPTION, SAMPLE) VALUES (?, ?, ?, ?)
                     """)) {
            insert.setInt(1, formData.domainId());
            insert.setString(2, formData.category());
            insert.setString(3, formData.description());
            insert.setString(4, formData.sampleQuery());
            insert.executeUpdate();
        }
    }

    /** A complaint category: its stored name and the description shown in its place. */
    public record CategoryItem(String categoryName, String categoryDesc) {}

    /** A previously filed complaint as presented to the user. */
    public record FlagSiteComplaintModel(String category, String submitTime, boolean isReviewed, String decision) {}

    /** The contents of a submitted complaint form. */
    public record FlagSiteFormData(int domainId, String category, String description, String sampleQuery) {}
}

View File

@ -0,0 +1,117 @@
package nu.marginalia.search.svc;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.renderer.MustacheRenderer;
import nu.marginalia.renderer.RendererFactory;
import nu.marginalia.search.svc.SearchQueryCountService;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
import java.io.IOException;
import java.sql.SQLException;
import java.time.LocalDate;
import java.time.ZoneId;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.util.ArrayList;
import java.util.List;
/** Renders the front page (index), and the RSS feed of news items shown on it */
@Singleton
public class SearchFrontPageService {
    private final MustacheRenderer<IndexModel> template;
    private final HikariDataSource dataSource;
    private final SearchQueryCountService searchVisitorCount;

    private final Logger logger = LoggerFactory.getLogger(getClass());

    @Inject
    public SearchFrontPageService(RendererFactory rendererFactory,
                                  HikariDataSource dataSource,
                                  SearchQueryCountService searchVisitorCount
    ) throws IOException {
        this.template = rendererFactory.renderer("search/index/index");
        this.dataSource = dataSource;
        this.searchVisitorCount = searchVisitorCount;
    }

    /** Renders the front page, marking the response as publicly cacheable for an hour. */
    public String render(Request request, Response response) {
        response.header("Cache-control", "public,max-age=3600");

        return template.render(new IndexModel(
                getNewsItems(),
                searchVisitorCount.getQueriesPerMinute()
        ));
    }

    /**
     * Reads the news items from SEARCH_NEWS_FEED, newest first.
     * A database error is logged and results in whatever was read so far being returned.
     */
    private List<NewsItem> getNewsItems() {
        List<NewsItem> items = new ArrayList<>();

        try (var conn = dataSource.getConnection();
             var stmt = conn.prepareStatement("""
                     SELECT TITLE, LINK, SOURCE, LIST_DATE FROM SEARCH_NEWS_FEED ORDER BY LIST_DATE DESC
                     """)) {

            var rep = stmt.executeQuery();

            while (rep.next()) {
                items.add(new NewsItem(
                        rep.getString(1),
                        rep.getString(2),
                        rep.getString(3),
                        rep.getDate(4).toLocalDate()));
            }
        }
        catch (SQLException ex) {
            logger.warn("Failed to fetch news items", ex);
        }

        return items;
    }

    /**
     * Renders the news items as an RSS 2.0 document and sets the response
     * content type to {@code application/rss+xml}.
     * <p>
     * Item titles, links and sources are XML-escaped, since the values come
     * from the database and may contain characters such as {@code &} or {@code <}
     * that would otherwise make the document malformed.
     */
    public Object renderNewsFeed(Request request, Response response) {
        List<NewsItem> newsItems = getNewsItems();

        // Use a single timestamp so lastBuildDate and pubDate agree
        String now = ZonedDateTime.now().format(DateTimeFormatter.RFC_1123_DATE_TIME);

        StringBuilder sb = new StringBuilder();

        // The channel's <ttl> is part of this header; it must not be emitted a second time
        sb.append("""
                <?xml version="1.0" encoding="UTF-8"?>
                <rss version="2.0">
                <channel>
                <title>Marginalia Search News and Mentions</title>
                <link>https://search.marginalia.nu/</link>
                <description>News and Mentions of Marginalia Search</description>
                <language>en-us</language>
                <ttl>60</ttl>
                """);

        sb.append("<lastBuildDate>").append(now).append("</lastBuildDate>\n");
        sb.append("<pubDate>").append(now).append("</pubDate>\n");

        for (var item : newsItems) {
            sb.append("<item>\n");
            sb.append("<title>").append(escapeXml(item.title())).append("</title>\n");
            sb.append("<link>").append(escapeXml(item.url())).append("</link>\n");
            if (item.source() != null) {
                sb.append("<author>").append(escapeXml(item.source())).append("</author>\n");
            }
            sb.append("<pubDate>").append(item.date().atStartOfDay().atZone(ZoneId.systemDefault()).format(DateTimeFormatter.RFC_1123_DATE_TIME)).append("</pubDate>\n");
            sb.append("</item>\n");
        }

        sb.append("</channel>\n");
        sb.append("</rss>\n");

        response.type("application/rss+xml");

        return sb.toString();
    }

    /**
     * Replaces the five characters with predefined XML entities so that the value
     * is safe to place in element content. A null value yields an empty string.
     */
    private static String escapeXml(String value) {
        if (value == null) {
            return "";
        }

        // '&' must be handled first so the entities introduced below are left intact
        return value
                .replace("&", "&amp;")
                .replace("<", "&lt;")
                .replace(">", "&gt;")
                .replace("\"", "&quot;")
                .replace("'", "&apos;");
    }

    /** Template model for the front page. */
    private record IndexModel(List<NewsItem> news, int searchPerMinute) { }

    /** A single row of the news feed. */
    private record NewsItem(String title, String url, String source, LocalDate date) {}
}

View File

@ -0,0 +1,48 @@
package nu.marginalia.search.svc;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.inject.Singleton;
import java.time.temporal.ChronoUnit;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
/**
 * Keeps per-minute statistics of queries.
 * <p>
 * Queries are tallied in an atomic counter; a daemon thread started by the
 * constructor moves the tally into a published value once a minute and resets it.
 */
@Singleton
public class SearchQueryCountService {
    private final Logger logger = LoggerFactory.getLogger(getClass());

    /** Queries registered since the last roll-over. */
    private final AtomicInteger lastMinuteQueries = new AtomicInteger();

    /** The tally from the most recently completed interval. */
    private volatile int queriesPerMinute;

    public SearchQueryCountService() {
        Thread updater = new Thread(this::updateQueriesPerMinute,
                "SearchVisitorCountService::updateQueriesPerMinute");

        updater.setDaemon(true);
        updater.start();
    }

    /** Retrieve the number of queries performed the minute before this one */
    public int getQueriesPerMinute() {
        return queriesPerMinute;
    }

    /** Update query statistics for presentation */
    public void registerQuery() {
        lastMinuteQueries.incrementAndGet();
    }

    /** Body of the updater thread: publish-and-reset the counter, sleep a minute, repeat until interrupted. */
    private void updateQueriesPerMinute() {
        try {
            while (true) {
                queriesPerMinute = lastMinuteQueries.getAndSet(0);
                TimeUnit.MINUTES.sleep(1);
            }
        }
        catch (InterruptedException e) {
            logger.warn("Query counter thread was interrupted");
        }
    }
}

View File

@ -0,0 +1,62 @@
package nu.marginalia.search.svc;
import com.google.inject.Inject;
import nu.marginalia.WebsiteUrl;
import nu.marginalia.search.command.CommandEvaluator;
import nu.marginalia.search.command.SearchParameters;
import nu.marginalia.search.exceptions.RedirectException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
/**
 * Entry point for search requests: parses the request parameters and hands
 * them to the {@link CommandEvaluator}.
 */
public class SearchQueryService {

    private final WebsiteUrl websiteUrl;
    private final SearchErrorPageService errorPageService;
    private final CommandEvaluator commandEvaluator;

    private final Logger logger = LoggerFactory.getLogger(getClass());

    @Inject
    public SearchQueryService(
            WebsiteUrl websiteUrl,
            SearchErrorPageService errorPageService,
            CommandEvaluator searchCommandEvaulator) {
        this.websiteUrl = websiteUrl;
        this.errorPageService = errorPageService;
        this.commandEvaluator = searchCommandEvaulator;
    }

    /**
     * Request handler for searches.
     *
     * @return the evaluator's result; or an empty string when the request was
     *         redirected or ended in the error page
     */
    public Object pathSearch(Request request, Response response) {
        try {
            SearchParameters parameters = parseParameters(request);
            return commandEvaluator.eval(response, parameters);
        }
        catch (RedirectException redirect) {
            response.redirect(redirect.newUrl);
        }
        catch (Exception ex) {
            logger.error("Error", ex);
            errorPageService.serveError(request, response);
        }

        return "";
    }

    /**
     * Builds the search parameters from the request.
     *
     * @throws RedirectException pointing at the website root when the {@code query}
     *         parameter is missing or blank, or when the parameters cannot be parsed
     */
    private SearchParameters parseParameters(Request request) {
        try {
            final String query = request.queryParams("query");

            if (query != null && !query.isBlank()) {
                return new SearchParameters(query.trim(), request);
            }
        }
        catch (Exception ex) {
            // Bots keep sending bad requests, suppress the error otherwise it will
            // fill up the logs.
        }

        // Reached for a missing/blank query as well as for unparseable parameters
        throw new RedirectException(websiteUrl.url());
    }
}

View File

@ -0,0 +1,416 @@
package nu.marginalia.search.svc;
import com.google.inject.Inject;
import nu.marginalia.api.domains.DomainInfoClient;
import nu.marginalia.api.domains.model.DomainInformation;
import nu.marginalia.api.domains.model.SimilarDomain;
import nu.marginalia.api.feeds.FeedsClient;
import nu.marginalia.api.feeds.RpcFeed;
import nu.marginalia.api.feeds.RpcFeedItem;
import nu.marginalia.api.livecapture.LiveCaptureClient;
import nu.marginalia.db.DbDomainQueries;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.renderer.MustacheRenderer;
import nu.marginalia.renderer.RendererFactory;
import nu.marginalia.screenshot.ScreenshotService;
import nu.marginalia.search.SearchOperator;
import nu.marginalia.search.model.UrlDetails;
import nu.marginalia.search.svc.SearchFlagSiteService.FlagSiteFormData;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import spark.Request;
import spark.Response;
import java.io.IOException;
import java.sql.SQLException;
import java.util.List;
import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.function.Supplier;
public class SearchSiteInfoService {
private static final Logger logger = LoggerFactory.getLogger(SearchSiteInfoService.class);
private final SearchOperator searchOperator;
private final DomainInfoClient domainInfoClient;
private final SearchFlagSiteService flagSiteService;
private final DbDomainQueries domainQueries;
private final MustacheRenderer<Object> renderer;
private final FeedsClient feedsClient;
private final LiveCaptureClient liveCaptureClient;
private final ScreenshotService screenshotService;
@Inject
public SearchSiteInfoService(SearchOperator searchOperator,
DomainInfoClient domainInfoClient,
RendererFactory rendererFactory,
SearchFlagSiteService flagSiteService,
DbDomainQueries domainQueries,
FeedsClient feedsClient,
LiveCaptureClient liveCaptureClient,
ScreenshotService screenshotService) throws IOException
{
this.searchOperator = searchOperator;
this.domainInfoClient = domainInfoClient;
this.flagSiteService = flagSiteService;
this.domainQueries = domainQueries;
this.renderer = rendererFactory.renderer("search/site-info/site-info");
this.feedsClient = feedsClient;
this.liveCaptureClient = liveCaptureClient;
this.screenshotService = screenshotService;
}
public Object handle(Request request, Response response) throws SQLException {
String domainName = request.params("site");
String view = request.queryParamOrDefault("view", "info");
if (null == domainName || domainName.isBlank()) {
return null;
}
var model = switch (view) {
case "links" -> listLinks(domainName);
case "docs" -> listDocs(domainName);
case "info" -> listInfo(domainName);
case "report" -> reportSite(domainName);
default -> listInfo(domainName);
};
return renderer.render(model);
}
public Object handlePost(Request request, Response response) throws SQLException {
String domainName = request.params("site");
String view = request.queryParamOrDefault("view", "info");
if (null == domainName || domainName.isBlank()) {
return null;
}
if (!view.equals("report"))
return null;
final int domainId = domainQueries.getDomainId(new EdgeDomain(domainName));
FlagSiteFormData formData = new FlagSiteFormData(
domainId,
request.queryParams("category"),
request.queryParams("description"),
request.queryParams("sampleQuery")
);
flagSiteService.insertComplaint(formData);
var complaints = flagSiteService.getExistingComplaints(domainId);
var model = new ReportDomain(domainName, domainId, complaints, List.of(), true);
return renderer.render(model);
}
private Object reportSite(String domainName) throws SQLException {
int domainId = domainQueries.getDomainId(new EdgeDomain(domainName));
var existingComplaints = flagSiteService.getExistingComplaints(domainId);
return new ReportDomain(domainName,
domainId,
existingComplaints,
flagSiteService.getCategories(),
false);
}
private Backlinks listLinks(String domainName) {
return new Backlinks(domainName,
domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1),
searchOperator.doBacklinkSearch(domainName));
}
private SiteInfoWithContext listInfo(String domainName) {
final int domainId = domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1);
final Future<DomainInformation> domainInfoFuture;
final Future<List<SimilarDomain>> similarSetFuture;
final Future<List<SimilarDomain>> linkingDomainsFuture;
final CompletableFuture<RpcFeed> feedItemsFuture;
String url = "https://" + domainName + "/";
boolean hasScreenshot = screenshotService.hasScreenshot(domainId);
if (domainId < 0) {
domainInfoFuture = CompletableFuture.failedFuture(new Exception("Unknown Domain ID"));
similarSetFuture = CompletableFuture.failedFuture(new Exception("Unknown Domain ID"));
linkingDomainsFuture = CompletableFuture.failedFuture(new Exception("Unknown Domain ID"));
feedItemsFuture = CompletableFuture.failedFuture(new Exception("Unknown Domain ID"));
}
else if (!domainInfoClient.isAccepting()) {
domainInfoFuture = CompletableFuture.failedFuture(new Exception("Assistant Service Unavailable"));
similarSetFuture = CompletableFuture.failedFuture(new Exception("Assistant Service Unavailable"));
linkingDomainsFuture = CompletableFuture.failedFuture(new Exception("Assistant Service Unavailable"));
feedItemsFuture = CompletableFuture.failedFuture(new Exception("Assistant Service Unavailable"));
}
else {
domainInfoFuture = domainInfoClient.domainInformation(domainId);
similarSetFuture = domainInfoClient.similarDomains(domainId, 25);
linkingDomainsFuture = domainInfoClient.linkedDomains(domainId, 25);
feedItemsFuture = feedsClient.getFeed(domainId);
}
List<UrlDetails> sampleResults = searchOperator.doSiteSearch(domainName, domainId,5);
if (!sampleResults.isEmpty()) {
url = sampleResults.getFirst().url.withPathAndParam("/", null).toString();
}
var result = new SiteInfoWithContext(domainName,
domainId,
url,
hasScreenshot,
waitForFuture(domainInfoFuture, () -> createDummySiteInfo(domainName)),
waitForFuture(similarSetFuture, List::of),
waitForFuture(linkingDomainsFuture, List::of),
waitForFuture(feedItemsFuture.thenApply(FeedItems::new), () -> FeedItems.dummyValue(domainName)),
sampleResults
);
requestMissingScreenshots(result);
return result;
}
/** Request missing screenshots for the given site info */
private void requestMissingScreenshots(SiteInfoWithContext result) {
// Always request the main site screenshot, even if we already have it
// as this will make the live-capture do a staleness check and update
// as needed.
liveCaptureClient.requestScreengrab(result.domainId());
int requests = 1;
// Request screenshots for similar and linking domains only if they are absent
// also throttle the requests to at most 5 per view.
if (result.similar() != null) {
for (var similar : result.similar()) {
if (similar.screenshot()) {
continue;
}
if (++requests > 5) {
break;
}
liveCaptureClient.requestScreengrab(similar.domainId());
}
}
if (result.linking() != null) {
for (var linking : result.linking()) {
if (linking.screenshot()) {
continue;
}
if (++requests > 5) {
break;
}
liveCaptureClient.requestScreengrab(linking.domainId());
}
}
}
private <T> T waitForFuture(Future<T> future, Supplier<T> fallback) {
try {
return future.get(250, TimeUnit.MILLISECONDS);
} catch (Exception e) {
logger.info("Failed to get domain data: {}", e.getMessage());
return fallback.get();
}
}
private DomainInformation createDummySiteInfo(String domainName) {
return DomainInformation.builder()
.domain(new EdgeDomain(domainName))
.suggestForCrawling(true)
.unknownDomain(true)
.build();
}
private Docs listDocs(String domainName) {
int domainId = domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1);
return new Docs(domainName,
domainQueries.tryGetDomainId(new EdgeDomain(domainName)).orElse(-1),
searchOperator.doSiteSearch(domainName, domainId, 100));
}
public record Docs(Map<String, Boolean> view,
String domain,
long domainId,
List<UrlDetails> results) {
public Docs(String domain, long domainId, List<UrlDetails> results) {
this(Map.of("docs", true), domain, domainId, results);
}
public String focusDomain() { return domain; }
public String query() { return "site:" + domain; }
public boolean isKnown() {
return domainId > 0;
}
}
public record Backlinks(Map<String, Boolean> view, String domain, long domainId, List<UrlDetails> results) {
public Backlinks(String domain, long domainId, List<UrlDetails> results) {
this(Map.of("links", true), domain, domainId, results);
}
public String query() { return "links:" + domain; }
public boolean isKnown() {
return domainId > 0;
}
}
/**
 * Model for the site information view of a domain, together with its similar and
 * linking domains, feed and sample documents.
 *
 * @param view              map with the single entry {@code "info" -> true} when built
 *                          through the convenience constructor
 * @param domainState       map with a single entry whose key is the state name computed by
 *                          {@code domainInfoState} and whose value is {@code true}
 * @param domain            the domain name
 * @param domainId          numeric id of the domain; zero or below is reported as unknown
 * @param siteUrl           URL of the site
 * @param hasScreenshot     whether a screenshot exists for the domain
 * @param domainInformation details about the domain; must not be null when the convenience
 *                          constructor is used, since its flags are read there
 * @param similar           similar domains, may be null
 * @param linking           linking domains
 * @param feed              feed of the domain, may be null
 * @param samples           sample documents, may be null
 */
public record SiteInfoWithContext(Map<String, Boolean> view,
                                  Map<String, Boolean> domainState,
                                  String domain,
                                  int domainId,
                                  String siteUrl,
                                  boolean hasScreenshot,
                                  DomainInformation domainInformation,
                                  List<SimilarDomain> similar,
                                  List<SimilarDomain> linking,
                                  FeedItems feed,
                                  List<UrlDetails> samples
) {

    /**
     * Creates the model with {@code view} fixed to {@code "info" -> true} and
     * {@code domainState} derived from the flags of {@code domainInformation}.
     */
    public SiteInfoWithContext(String domain,
                               int domainId,
                               String siteUrl,
                               boolean hasScreenshot,
                               DomainInformation domainInformation,
                               List<SimilarDomain> similar,
                               List<SimilarDomain> linking,
                               FeedItems feed,
                               List<UrlDetails> samples) {
        this(Map.of("info", Boolean.TRUE),
             Map.of(domainInfoState(domainInformation), Boolean.TRUE),
             domain,
             domainId,
             siteUrl,
             hasScreenshot,
             domainInformation,
             similar,
             linking,
             feed,
             samples);
    }

    /**
     * Picks the page layout name.
     *
     * @return {@code "lopsided"} when there are fewer than 25 similar domains, or the feed
     *         has items, or there are sample documents; {@code "balanced"} otherwise
     */
    public String getLayout() {
        // The layout decision lives here rather than in the stylesheet.
        // The checks short-circuit left to right, so later collections are only
        // touched when the earlier checks did not already decide the outcome.
        boolean lopsided = (similar != null && similar.size() < 25)
                || (feed != null && !feed.items().isEmpty())
                || (samples != null && !samples.isEmpty());

        return lopsided ? "lopsided" : "balanced";
    }

    /** Returns the domain prefixed with {@code site:}. */
    public String query() {
        return "site:" + domain;
    }

    /**
     * Maps the flags of a domain to a single state name. The order of the checks matters,
     * since several flags can be set at once and the first match wins.
     */
    private static String domainInfoState(DomainInformation info) {
        if (info.isBlacklisted()) {
            return "blacklisted";
        } else if (!info.isUnknownDomain() && info.isSuggestForCrawling()) {
            return "suggestForCrawling";
        } else if (info.isInCrawlQueue()) {
            return "inCrawlQueue";
        } else if (info.isUnknownDomain()) {
            return "unknownDomain";
        }
        return "indexed";
    }

    /** Returns {@code true} if the domain id is strictly positive. */
    public boolean isKnown() {
        return 0 < domainId;
    }
}
/**
 * A single entry of a feed.
 *
 * @param title       title of the entry
 * @param date        date string of the entry; {@link #pubDay()} treats it as ISO-style
 * @param description description of the entry
 * @param url         URL of the entry
 */
public record FeedItem(String title, String date, String description, String url) {

    /** Copies title, date, description and URL out of the RPC representation. */
    public FeedItem(RpcFeedItem rpcFeedItem) {
        this(rpcFeedItem.getTitle(),
             rpcFeedItem.getDate(),
             rpcFeedItem.getDescription(),
             rpcFeedItem.getUrl());
    }

    /**
     * Extracts the day part of an ISO-style date string.
     *
     * @return the first 10 characters of {@code date}, or {@code date} unchanged if it is
     *         10 characters or shorter
     */
    public String pubDay() {
        return date.length() > 10 ? date.substring(0, 10) : date;
    }

    /**
     * Returns the description with angle brackets replaced by HTML entities.
     * Only {@code <} and {@code >} are replaced; other characters are left untouched.
     */
    public String descriptionSafe() {
        String withoutOpeningBrackets = description.replace("<", "&lt;");
        return withoutOpeningBrackets.replace(">", "&gt;");
    }
}
/**
 * The feed of a domain together with its entries.
 *
 * @param domain  domain the feed belongs to
 * @param feedUrl URL of the feed
 * @param updated string describing when the feed was updated
 * @param items   entries of the feed
 */
public record FeedItems(String domain, String feedUrl, String updated, List<FeedItem> items) {

    /** Returns a placeholder for the given domain with empty strings and no items. */
    public static FeedItems dummyValue(String domain) {
        return new FeedItems(domain, "", "", List.of());
    }

    /** Copies the feed out of its RPC representation, converting every entry. */
    public FeedItems(RpcFeed rpcFeedItems) {
        this(rpcFeedItems.getDomain(),
             rpcFeedItems.getFeedUrl(),
             rpcFeedItems.getUpdated(),
             convertItems(rpcFeedItems));
    }

    /** Converts the RPC entries of a feed into an unmodifiable list of {@link FeedItem}s. */
    private static List<FeedItem> convertItems(RpcFeed rpcFeed) {
        return rpcFeed.getItemsList()
                .stream()
                .map(rpcItem -> new FeedItem(rpcItem))
                .toList();
    }
}
/**
 * Model for the view where a domain can be reported.
 *
 * @param view       map with the single entry {@code "report" -> true} when built through
 *                   the convenience constructor
 * @param domain     the domain name
 * @param domainId   numeric id of the domain; zero or below is reported as unknown
 * @param complaints complaints to list for the domain
 * @param category   complaint categories to list
 * @param submitted  value of the submitted flag (presumably set once a report has been
 *                   sent; confirm against the caller)
 */
public record ReportDomain(Map<String, Boolean> view,
                           String domain,
                           int domainId,
                           List<SearchFlagSiteService.FlagSiteComplaintModel> complaints,
                           List<SearchFlagSiteService.CategoryItem> category,
                           boolean submitted) {

    /** Creates the model with {@code view} fixed to {@code "report" -> true}. */
    public ReportDomain(String domain,
                        int domainId,
                        List<SearchFlagSiteService.FlagSiteComplaintModel> complaints,
                        List<SearchFlagSiteService.CategoryItem> category,
                        boolean submitted) {
        this(Map.of("report", Boolean.TRUE), domain, domainId, complaints, category, submitted);
    }

    /** Returns the domain prefixed with {@code site:}. */
    public String query() {
        return "site:" + domain;
    }

    /** Returns {@code true} if the domain id is strictly positive. */
    public boolean isKnown() {
        return 0 < domainId;
    }
}
}

View File

@ -0,0 +1,73 @@
package nu.marginalia.search.svc;
import nu.marginalia.api.math.MathClient;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.CheckForNull;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import java.util.Optional;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import java.util.function.Predicate;
import java.util.regex.Pattern;
/**
 * Recognises search queries that look like unit conversions or arithmetic expressions
 * and hands them to the math service through {@link MathClient}.
 */
@Singleton
public class SearchUnitConversionService {
    private final Logger logger = LoggerFactory.getLogger(getClass());

    // Matches "<expression> <unit> in <unit>".
    // Group 1 is the expression, group 3 the source unit and group 4 the target unit.
    private final Pattern conversionPattern = Pattern.compile("((\\d+|\\s+|[.()\\-^+%*/]|log[^a-z]|log2[^a-z]|sqrt[^a-z]|log10|cos[^a-z]|sin[^a-z]|tan[^a-z]|log2|pi[^a-z]|e[^a-z]|2pi[^a-z])+)\\s*([a-zA-Z][a-zA-Z^.0-9]*\\s?[a-zA-Z^.0-9]*)\\s+in\\s+([a-zA-Z^.0-9]+\\s?[a-zA-Z^.0-9]*)");

    // Accepts strings made up entirely of digits, whitespace, operators and the listed
    // function and constant names
    private final Predicate<String> evalPredicate = Pattern.compile("(\\d+|\\s+|[.()\\-^+%*/]|log|log2|sqrt|log10|cos|sin|tan|pi|e|2pi)+").asMatchPredicate();

    private final MathClient mathClient;

    @Inject
    public SearchUnitConversionService(MathClient mathClient) {
        this.mathClient = mathClient;
    }

    /**
     * Attempts to interpret the query as a unit conversion of the form
     * {@code <expression> <unit> in <unit>}.
     *
     * @param query the raw search query
     * @return the conversion result, or empty if the query is not a conversion, the math
     *         service failed, the wait was interrupted, or no answer arrived within
     *         250 milliseconds
     */
    public Optional<String> tryConversion(String query) {
        var matcher = conversionPattern.matcher(query);
        if (!matcher.matches())
            return Optional.empty();

        String value = matcher.group(1);
        String from = matcher.group(3);
        String to = matcher.group(4);

        logger.info("{} -> '{}' '{}' '{}'", query, value, from, to);

        try {
            var resultFuture = mathClient.unitConversion(value, from, to);

            return Optional.of(
                    resultFuture.get(250, TimeUnit.MILLISECONDS)
            );
        } catch (ExecutionException e) {
            logger.error("Error in unit conversion", e);
        } catch (InterruptedException e) {
            // Catching InterruptedException clears the interrupt flag; restore it so that
            // code further up the call stack can still observe the interruption
            Thread.currentThread().interrupt();
            logger.error("Interrupted while waiting for unit conversion", e);
        } catch (TimeoutException ignored) {
            // The conversion is best-effort; a slow answer is treated as no answer
        }

        return Optional.empty();
    }

    /**
     * Attempts to interpret the query as an arithmetic expression.
     *
     * @param query the raw search query
     * @return a future for the evaluated expression, or {@code null} if the query is not
     *         accepted by the expression pattern, or if it consists of nothing but digits
     *         once lower-cased and trimmed (this includes the empty string)
     */
    public @CheckForNull Future<String> tryEval(String query) {
        if (!evalPredicate.test(query)) {
            return null;
        }

        var expr = query.toLowerCase().trim();

        // A bare number is not worth evaluating
        if (expr.chars().allMatch(Character::isDigit)) {
            return null;
        }

        logger.info("eval({})", expr);

        return mathClient.evalMath(expr);
    }
}

View File

@ -0,0 +1,3 @@
# Search Service
This is the old search service, which serves search traffic with the old GUI.

View File

@ -0,0 +1,14 @@
81.170.128.52
193.183.0.162
193.183.0.163
193.183.0.164
193.183.0.165
193.183.0.166
193.183.0.167
193.183.0.168
193.183.0.169
193.183.0.170
193.183.0.171
193.183.0.172
193.183.0.173
193.183.0.174

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.2 KiB

View File

@ -0,0 +1,13 @@
// Mark the document as JS-capable by setting data-has-js="true" on the html tag, so the
// stylesheet can apply rules that assume JS is available. This is a progressive
// enhancement; the page still works without JS.
document.documentElement.setAttribute('data-has-js', 'true');

// Hitting enter in the search box would otherwise open the filter menu. As a workaround,
// submit the search form ourselves and suppress the default handling of the key press.
document.getElementById('query').addEventListener('keydown', (event) => {
    if (event.key !== "Enter") {
        return;
    }

    document.getElementById('search-form').submit();
    event.preventDefault();
});

View File

@ -0,0 +1,91 @@
// Hides the filters menu by setting its display style to 'none'.
function hideMenu() {
    const filters = document.getElementById('filters');
    filters.style.display = 'none';
}
// Shows the filters menu, creates its close button on first use, and scrolls to the top
// of the page.
function showMenu() {
    document.getElementById('filters').style.display = 'block';

    // Defer creation of the close button until the menu is opened. This is needed because the script for creating
    // the filter button is run early to avoid layout shifts.
    if (document.getElementById('menu-close') === null) {
        registerCloseButton();
    }

    // scroll to the top of the page so the user can see the filters
    window.scrollTo({
        top: 0,
        left: 0,
        behavior: "instant",
    });
}
// Creates the button that closes the filters menu and appends it to the first h2 inside
// the filters element.
const registerCloseButton = () => {
    // Add a button to close the filters for mobile; we do this in js to not pollute the DOM for text-only browsers
    const closeButton = document.createElement('button');
    closeButton.setAttribute('id', 'menu-close');
    closeButton.setAttribute('title', 'Close the menu');
    // aria-controls takes the id of the controlled element, not a CSS selector
    closeButton.setAttribute('aria-controls', 'filters');
    closeButton.innerHTML = 'X';
    closeButton.onclick = (event) => {
        hideMenu();
        event.stopPropagation();
        // Returning false from the handler suppresses the default action of the click
        return false;
    }
    document.getElementById('filters').getElementsByTagName('h2')[0].append(closeButton);
}
// Add a button to open the filters for mobile; we do this in js to not pollute the DOM for text-only browsers.
// The button is appended to the first h1 inside the search box.
const filtersButton = document.createElement('button');
filtersButton.setAttribute('id', 'mcfeast');
// aria-controls takes the id of the controlled element, not a CSS selector
filtersButton.setAttribute('aria-controls', 'filters');
filtersButton.innerHTML = '&Xi;';
filtersButton.setAttribute('title', 'Open the filters menu');
filtersButton.onclick = (event) => {
    showMenu();
    event.stopPropagation();
    // Returning false from the handler suppresses the default action of the click
    return false;
}
document.getElementById('search-box').getElementsByTagName('h1')[0].append(filtersButton);
// Swipe affordances, only set up when the primary pointer is coarse (e.g. a touch screen)
if (window.matchMedia('(pointer: coarse)').matches) {
    // A horizontal swipe must travel further than swipeThreshold, while moving no more
    // than maxVerticalDistance vertically, to count. Both are in screenX/screenY units.
    const swipeThreshold = 100;
    const maxVerticalDistance = 75;

    // Screen position where the current touch started
    let startX = 0;
    let startY = 0;

    document.addEventListener('touchstart', (event) => {
        const touch = event.changedTouches[0];
        startX = touch.screenX;
        startY = touch.screenY;
    });

    // Swiping right closes the filters, swiping left opens them
    document.addEventListener('touchend', (event) => {
        const touch = event.changedTouches[0];
        const deltaX = touch.screenX - startX;
        const deltaY = touch.screenY - startY;

        // Too much vertical movement; treat the gesture as a scroll and leave it alone
        if (Math.abs(deltaY) > maxVerticalDistance) {
            return;
        }

        if (deltaX > swipeThreshold) {
            hideMenu();
            event.stopPropagation();
        } else if (-deltaX > swipeThreshold) {
            showMenu();
            event.stopPropagation();
        }
    });

    // Add a floating panel to the bottom of the page that tells the user about the swipe gesture.
    // NOTE(review): a negative blur radius is not valid in box-shadow, so that declaration
    // is presumably ignored by browsers -- confirm the intended shadow.
    const hintPanel = document.createElement('div');
    hintPanel.setAttribute('style', 'position: fixed; bottom: 0; left: 0; right: 0; backdrop-filter: blur(10px); padding: 0.25em; text-align: center; display: block; border-top: 1px solid #ccc; box-shadow: 0 0 -5px #eee;');
    hintPanel.innerHTML = '&larr; swipe left to open filters &larr;';
    document.body.appendChild(hintPanel);
}

View File

@ -0,0 +1,15 @@
<OpenSearchDescription xmlns="http://a9.com/-/spec/opensearch/1.1/"
xmlns:moz="http://www.mozilla.org/2006/browser/search/">
<!-- Thanks everyone who has told me about this :) -->
<!-- By the way, check out https://api.marginalia.nu/ if you wish to automate this,
if you try to use the endpoint below you'll probably run into trouble with cloudflare :-/
-->
<ShortName>Marginalia</ShortName>
<Description>Search Marginalia</Description>
<InputEncoding>UTF-8</InputEncoding>
<Image width="16" height="16" type="image/x-icon">https://search.marginalia.nu/favicon.ico</Image>
<Url type="text/html" method="get"
template="https://search.marginalia.nu/search?query={searchTerms}&amp;ref=opensearch"/>
<moz:SearchForm>https://search.marginalia.nu/</moz:SearchForm>
</OpenSearchDescription>

View File

@ -0,0 +1,8 @@
User-agent: *
Disallow: /browse/
Disallow: /search/
Disallow: /search
Disallow: /wiki/
Disallow: /explore/
Disallow: /site/
Disallow: /links/

View File

@ -0,0 +1,17 @@
<?xml version="1.0"?>
<!-- CC0 -->
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<svg version="1.1" id="Capa_1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"
viewBox="0 0 455.731 455.731" xml:space="preserve">
<g>
<rect x="0" y="0" style="fill:#F78422;" width="455.731" height="455.731"/>
<g>
<path style="fill:#FFFFFF;" d="M296.208,159.16C234.445,97.397,152.266,63.382,64.81,63.382v64.348
c70.268,0,136.288,27.321,185.898,76.931c49.609,49.61,76.931,115.63,76.931,185.898h64.348
C391.986,303.103,357.971,220.923,296.208,159.16z"/>
<path style="fill:#FFFFFF;" d="M64.143,172.273v64.348c84.881,0,153.938,69.056,153.938,153.939h64.348
C282.429,270.196,184.507,172.273,64.143,172.273z"/>
<circle style="fill:#FFFFFF;" cx="109.833" cy="346.26" r="46.088"/>
</g>
</g>
</svg>

After

Width:  |  Height:  |  Size: 891 B

View File

@ -0,0 +1,831 @@
:root {
color-scheme: light;
--clr-bg-page: hsl(60, 42%, 95%); // $nicotine-light
--clr-bg-ui: hsl(0, 0%, 100%);
--clr-text-ui: #000; // $fg-dark
--clr-bg-theme: hsl(200, 28%, 34%); // $highlight-light
--clr-text-theme: #fff; // $fg-light
--clr-bg-highlight: hsl(0, 0%, 93%); // $highlight-light2
--clr-text-highlight: #111111;
--clr-bg-accent: hsl(63, 19%, 61%); // $nicotine-dark
--clr-border-accent: hsl(63, 19%, 35%);
--clr-border: #aaa; // $border-color2
--clr-shadow: var(--clr-border);
--clr-link: #0066cc;
--clr-link-visited: #531a89;
--clr-heading-link-visited: #fcc; // $visited
--font-family: sans-serif;
--font-size: 14px;
--font-family-heading: serif; // $heading-fonts
}
@mixin dark-theme-mixin {
color-scheme: dark;
--clr-bg-page: hsl(0, 0%, 6%);
--clr-bg-ui: hsl(0, 0%, 18%);
--clr-text-ui: #ddd;
--clr-bg-theme: hsl(0, 0%, 2%);
--clr-text-theme: var(--clr-text-ui);
--clr-bg-highlight: hsl(0, 0%, 11%);
--clr-text-highlight: #fff;
--clr-bg-accent: hsl(200, 32%, 28%);
--clr-border-accent: hsl(200, 8%, 12%);
--clr-border: hsl(0, 0%, 30%);
--clr-shadow: #000;
--clr-link: #8a8aff;
--clr-link-visited: #ffadff;
--clr-heading-link-visited: var(--clr-link-visited);
}
:root[data-theme='dark'] {
@include dark-theme-mixin;
}
// Makes theme match the user's OS preference when JS is disabled
@media (prefers-color-scheme: dark) {
:root:not([data-has-js="true"]) {
@include dark-theme-mixin;
}
}
* {
box-sizing: border-box;
}
a {
color: var(--clr-link);
}
a:visited {
color: var(--clr-link-visited);
}
input, textarea, select {
color: inherit;
}
h1 a, h2 a {
color: var(--clr-text-theme);
}
h1 a:visited, h2 a:visited {
color: var(--clr-heading-link-visited);
}
progress {
width: 10ch;
}
body {
background-color: var(--clr-bg-page);
color: var(--clr-text-ui);
font-family: var(--font-family);
font-size: var(--font-size);
line-height: 1.6;
margin-left: auto;
margin-right: auto;
max-width: 120ch;
padding: 0;
}
#frontpage {
display: grid;
grid-template-columns: 1fr auto;
grid-template-rows: auto 1fr;
grid-gap: 1ch;
align-items: start;
justify-content: start;
margin-top: 1ch;
margin-bottom: 1ch;
// named grid areas
grid-template-areas:
"frontpage-about frontpage-news"
"frontpage-tips frontpage-news";
@media (max-device-width: 624px) {
grid-template-columns: 1fr;
grid-template-rows: auto auto auto;
grid-gap: 1ch;
align-items: start;
justify-content: start;
margin-top: 1ch;
margin-bottom: 1ch;
// named grid areas
grid-template-areas:
"frontpage-about"
"frontpage-tips"
"frontpage-news";
* { max-width: unset !important; min-width: unset !important; }
}
#frontpage-news {
grid-area: frontpage-news;
max-width: 40ch;
@extend .dialog;
}
#frontpage-about {
grid-area: frontpage-about;
min-width: 40ch;
@extend .dialog;
}
#frontpage-tips {
grid-area: frontpage-tips;
min-width: 40ch;
@extend .dialog;
}
}
#siteinfo-nav {
display: block;
width: 100%;
@extend .dialog;
padding: 0.25ch !important;
margin-top: 1.5ch;
ul {
list-style: none;
padding: 0;
margin: 1ch;
li {
display: inline;
padding: 1ch;
background-color: var(--clr-bg-highlight);
a {
text-decoration: none;
display: inline-block;
color: var(--clr-text-highlight);
}
}
li.current {
background-color: var(--clr-bg-theme);
a {
color: var(--clr-text-theme);
}
}
}
}
.dialog {
border: 1px solid var(--clr-border);
box-shadow: 0 0 1ch var(--clr-shadow);
background-color: var(--clr-bg-ui);
padding: 1ch;
h2 {
margin: 0;
font-family: sans-serif;
font-weight: normal;
padding: 0.5ch;
font-size: 12pt;
background-color: var(--clr-bg-theme);
color: var(--clr-text-theme);
}
}
header {
background-color: var(--clr-bg-accent);
border: 1px solid var(--clr-border-accent);
color: var(--clr-text-ui);
box-shadow: 0 0 0.5ch var(--clr-shadow);
margin-bottom: 1ch;
display: flex;
align-items: center;
justify-content: space-between;
nav {
a {
text-decoration: none;
color: var(--clr-text-ui);
padding: .5ch;
display: inline-block;
}
a:visited {
color: var(--clr-text-ui);
}
a.extra {
background: #ccc linear-gradient(45deg,
hsl(0, 100%, 70%) 0%,
hsl(120, 100%, 70%) 50%,
hsl(240, 100%, 70%) 100%);
color: black;
text-shadow: 0 0 0.5ch #fff;
}
a:hover, a:focus {
background: var(--clr-bg-theme);
color: var(--clr-text-theme);
}
}
}
#theme {
padding: .5ch;
display: none;
[data-has-js='true'] & {
display: block;
}
}
#complaint {
@extend .dialog;
max-width: 60ch;
margin-left: auto;
margin-right: auto;
margin-top: 2ch;
textarea {
width: 100%;
height: 10ch;
}
}
// Layout of the site info page: a wrapping row of dialog boxes.
#siteinfo {
    margin-top: 1ch;

    display: flex;
    gap: 1ch;
    flex-grow: 0.5;
    flex-shrink: 0.5;
    // The former "flex-basis: 10ch 10ch" was removed: flex-basis accepts a single value,
    // so the declaration was invalid.
    flex-direction: row;
    flex-wrap: wrap;
    align-content: stretch;
    align-items: stretch;
    justify-content: stretch;

    #index-info, #link-info {
        width: 32ch;
        @extend .dialog;
    }
    #screenshot {
        @extend .dialog;
    }
    #screenshot img {
        width: 30ch;
        height: 22.5ch;
    }
}
.infobox {
h2 {
@extend .heading;
}
background-color: var(--clr-bg-ui);
padding: 1ch;
margin: 1ch;
border: 1px solid var(--clr-border);
box-shadow: 0 0 1ch var(--clr-shadow);
}
section.cards {
display: flex;
flex-direction: row;
flex-wrap: wrap;
padding-top: 1ch;
gap: 2ch;
justify-content: flex-start;
.card {
background-color: var(--clr-bg-ui);
border-left: 1px solid #ecb;
border-top: 1px solid #ecb;
box-shadow: var(--clr-shadow) 0 0 5px;
h2 {
@extend .heading;
word-break: break-word;
}
h2 a {
display: block !important;
color: inherit;
text-decoration: none;
}
a:focus img {
filter: sepia(100%);
box-shadow: #444 0px 0px 20px;
}
a:focus:not(.nofocus) {
background-color: black;
color: white;
}
.description {
padding-left: 1ch;
padding-right: 1ch;
overflow: auto;
-webkit-hyphens: auto;
-moz-hyphens: auto;
-ms-hyphens: auto;
hyphens: auto;
}
img {
width: 28ch;
height: auto;
}
.info {
padding-left: 1ch;
padding-right: 1ch;
line-height: 1.6;
}
[data-theme='dark'] & {
border: 1px solid var(--clr-border);
}
}
}
.positions {
box-shadow: 0 0 2px var(--clr-shadow);
backdrop-filter: brightness(90%);
color: var(--clr-text-highlight);
padding: 2px;
margin-right: -1ch;
margin-left: 1ch;
}
footer {
clear: both;
padding: 2ch;
margin: 16ch 0 0 0;
font-size: 12pt;
display: flex;
flex-direction: row;
flex-wrap: wrap;
justify-content: flex-start;
h1 {
font-weight: normal;
border-bottom: 4px solid var(--clr-bg-theme);
}
h2 {
font-size: 14pt;
font-weight: normal;
border-bottom: 2px solid var(--clr-bg-theme);
width: 80%;
}
section {
line-height: 1.5;
flex-basis: 40ch;
flex-grow: 1.1;
background-color: var(--clr-bg-ui);
border-left: 1px solid var(--clr-border);
box-shadow: -1px -1px 5px var(--clr-shadow);
padding-left: 1ch;
padding-right: 1ch;
margin-left: 1ch;
padding-bottom: 1ch;
margin-bottom: 1ch;
}
}
#mcfeast, #menu-close {
display: none;
}
.shadowbox {
box-shadow: 0 0 1ch var(--clr-shadow);
border: 1px solid var(--clr-border);
}
.heading {
margin: 0;
padding: 0.5ch;
background-color: var(--clr-bg-theme);
border-bottom: 1px solid var(--clr-border);
font-family: var(--font-family-heading);
font-weight: normal;
color: var(--clr-text-theme);
font-size: 12pt;
word-break: break-word;
}
.sidebar-narrow {
display: grid;
grid-template-columns: auto max-content;
grid-gap: 1ch;
align-items: start;
}
#crosstalk-view {
display: grid;
grid-template-columns: 1fr 1fr;
grid-template-rows: auto 1fr;
grid-gap: 1ch;
align-content: start;
justify-content: start;
align-items: start;
}
#similar-view {
display: grid;
grid-template-columns: 1fr 1fr;
grid-template-rows: auto 1fr;
grid-gap: 1ch;
align-content: start;
justify-content: start;
align-items: start;
table {
th {
text-align: left;
}
}
.screenshot {
width: 100%;
height: auto;
}
}
#similar-view[data-layout="lopsided"] {
#similar-info {
@extend .dialog;
grid-column: 1;
grid-row: 1 / span 2;
}
#similar-domains {
@extend .dialog;
grid-column: 2;
grid-row: 1;
}
#similar-links {
@extend .dialog;
grid-row: 2;
grid-column: 2;
}
}
#similar-view[data-layout="balanced"] {
#similar-info {
@extend .dialog;
}
#similar-domains {
grid-row: span 2;
@extend .dialog;
}
#similar-links {
@extend .dialog;
}
}
@media (max-device-width: 900px) {
#similar-view, #crosstalk-view {
display: block;
* {
margin-bottom: 1ch;
}
}
}
@media (max-device-width: 840px) {
section.cards {
display: block;
.card {
margin-bottom: 2ch;
img {
width: 100% !important;
height: auto;
}
}
}
}
#search-box {
@extend .shadowbox;
padding: 0.5ch;
background-color: var(--clr-bg-ui);
display: grid;
grid-template-columns: max-content 0 auto max-content;
grid-gap: 0.5ch;
grid-auto-rows: minmax(1ch, auto);
width: 100%;
h1 {
margin: 0;
padding: 0.5ch;
font-size: 14pt;
word-break: keep-all;
background-color: var(--clr-bg-theme);
color: var(--clr-text-theme);
font-family: var(--font-family-heading);
font-weight: normal;
text-align: center;
display: flex;
justify-content: space-between;
}
#suggestions-anchor {
margin: -0.5ch; // We need this anchor for the typeahead suggestions, but we don't want it to affect the layout
padding: 0;
}
input[type="text"] {
font-family: monospace;
font-size: 12pt;
padding: 0.5ch;
border: 1px solid var(--clr-border);
background-color: inherit;
}
input[type="submit"] {
font-size: 12pt;
border: 1px solid var(--clr-border);
background-color: var(--clr-bg-ui);
cursor: pointer;
}
// white suggestions look fine in dark mode
.suggestions {
background-color: #fff;
padding: .5ch;
margin-top: 5.5ch;
margin-left: 1ch;
position: absolute;
display: inline-block;
width: 300px;
border-left: 1px solid #ccc;
border-top: 1px solid #ccc;
box-shadow: 5px 5px 5px var(--clr-shadow);
z-index: 10;
a {
display: block;
color: #000;
font-size: 12pt;
font-family: 'fixedsys', monospace, serif;
text-decoration: none;
outline: none;
}
a:focus {
display: block;
background-color: #000;
color: #eee;
}
}
}
.filter-toggle-on {
a:before {
content: '';
margin-right: 1.5ch;
}
}
.filter-toggle-off {
a:before {
content: '';
margin-right: 1.5ch;
}
}
#filters {
@extend .shadowbox;
margin-top: 1ch;
background-color: var(--clr-bg-ui);
h2 {
@extend .heading;
background-color: var(--clr-bg-theme);
}
h3 {
@extend .heading;
background-color: var(--clr-bg-highlight);
color: var(--clr-text-highlight);
font-family: sans-serif;
border-bottom: 1px solid #000;
}
hr {
border-top: 0.5px solid var(--clr-border);
border-bottom: none;
}
ul {
list-style-type: none;
padding-left: 0;
li {
padding: 1ch;
a {
color: inherit;
text-decoration: none;
}
a:hover, a:focus {
border-bottom: 1px solid var(--clr-bg-theme);
}
}
li.current {
border-left: 4px solid var(--clr-bg-theme);
background-color: var(--clr-bg-highlight);
a {
margin-left: -4px;
}
}
}
}
.search-result {
@extend .shadowbox;
margin: 1ch 0 2ch 0;
.url {
background-color: var(--clr-bg-theme);
padding-left: 0.5ch;
a {
word-break: break-all;
font-family: monospace;
font-size: 8pt;
color: var(--clr-text-theme);
text-shadow: 0 0 1ch #000; // guarantee decent contrast across background colors
}
a:visited {
color: var(--clr-heading-link-visited);
}
}
h2 {
a {
word-break: break-all;
color: var(--clr-text-ui);
text-decoration: none;
}
font-size: 12pt;
@extend .heading;
background-color:var(--clr-bg-highlight);
}
.description {
background-color: var(--clr-bg-ui);
word-break: break-word;
padding: 1ch;
margin: 0;
}
ul.additional-results {
background-color: var(--clr-bg-ui);
padding: 1ch;
list-style: none;
margin: 0;
a {
color: inherit;
}
}
}
.search-result[data-ms-rank="1"] { .url, h2 { filter: grayscale(0%); } }
.search-result[data-ms-rank="2"] { .url, h2 { filter: grayscale(5%); } }
.search-result[data-ms-rank="3"] { .url, h2 { filter: grayscale(15%); } }
.search-result[data-ms-rank="4"] { .url, h2 { filter: grayscale(20%); } }
.search-result[data-ms-rank="5"] { .url, h2 { filter: grayscale(30%); } }
.search-result[data-ms-rank="10"] { .url, h2 { filter: grayscale(60%); } }
.utils {
display: flex;
font-size: 10pt;
padding: 1ch;
background-color: var(--clr-bg-highlight);
> * {
margin-right: 1ch;
margin-left: 1ch;
}
.meta {
flex-grow: 2;
text-align: right;
}
.meta > * {
padding-left: 4px;
}
a {
color: var(--clr-text-highlight);
}
}
@media (max-device-width: 624px) {
[data-has-js="true"] body { // This property is set via js so we can selectively enable these changes only if JS is enabled;
// This is desirable since mobile navigation is JS-driven. If JS is disabled, having a squished
// GUI is better than having no working UI.
margin: 0 !important;
padding: 0 0 0 0 !important;
max-width: 100%;
#suggestions-anchor { display: none; } // suggestions are not useful on mobile
.sidebar-narrow {
display: block; // fix for bizarre chrome rendering issue
}
#mcfeast {
display: inline;
float: right;
width: 2rem;
font-size: 1rem;
}
#menu-close {
float: right;
display: inline;
}
#filters {
display: none;
position: absolute;
top: 0;
left: 0;
width: 100%;
margin: 0;
padding: 0;
z-index: 100;
}
.sidebar-narrow {
grid-template-columns: auto;
}
#search-box {
grid-template-columns: auto;
}
#filters {
margin-top: 0;
}
.search-result {
margin-left: 0;
margin-right: 0;
}
}
}
.page-link {
padding-top: 0.25ch;
padding-bottom: 0.25ch;
padding-left: 0.5ch;
padding-right: 0.5ch;
margin-right: 0.5ch;
font-size: 12pt;
border: 1px solid var(--clr-border);
background-color: var(--clr-bg-highlight);
color: var(--clr-text-ui) !important;
text-decoration: none;
}
.page-link.active {
border: 1px solid var(--clr-text-ui);
background-color: var(--clr-bg-ui);
}
// The search results page is very confusing on text-based browsers, so we add a hr to separate the search results. This is
// hidden on modern browsers via CSS.
hr.w3m-helper { display: none; }
// This is a screenreader-only class that hides content from visual browsers, but allows screenreaders and
// text-based browsers to access it.
.screenreader-only {
position:absolute;
left:-10000px;
top:auto;
width:1px;
height:1px;
overflow:hidden;
}

View File

@ -0,0 +1,57 @@
// Resolves the theme to use.
// Returns { value, system }: value is 'dark' or 'light'; system is false when the value
// was read from localStorage and true when it was derived from the OS colour scheme.
function getTheme() {
    const stored = window.localStorage.getItem('theme');
    const hasExplicitChoice = stored === 'dark' || stored === 'light';

    // a valid theme in localStorage takes precedence over the OS preference
    if (hasExplicitChoice) {
        return { value: stored, system: false };
    }

    // otherwise follow the OS theme, defaulting to light
    const prefersDark = window.matchMedia('(prefers-color-scheme: dark)').matches;
    return { value: prefersDark ? 'dark' : 'light', system: true };
}
// Stores the chosen theme and applies it to the document.
// 'dark' and 'light' are persisted in localStorage; any other value clears the stored
// choice. Afterwards the theme resolved by getTheme() is written to data-theme on the
// html tag.
function setTheme(value) {
    const storage = window.localStorage;
    const isExplicitChoice = value === 'dark' || value === 'light';

    if (isExplicitChoice) {
        storage.setItem('theme', value);
    } else {
        storage.removeItem('theme');
    }

    const resolved = getTheme();
    document.documentElement.setAttribute('data-theme', resolved.value);
}
// Applies the resolved theme to the document, syncs the theme selector with it, and keeps
// the theme up to date when either the selector or the OS colour scheme changes.
function initializeTheme() {
    const themeSelect = document.getElementById('theme-select');

    const theme = getTheme();

    document.documentElement.setAttribute('data-theme', theme.value);

    // system is selected by default in the themeSwitcher so ignore it here
    if (!theme.system) {
        themeSelect.value = theme.value;
    }

    themeSelect.addEventListener('change', e => {
        setTheme(e.target.value);
    });

    const mql = window.matchMedia('(prefers-color-scheme: dark)');

    // if someone changes their theme at the OS level we need to update
    // their theme immediately if they're using their OS theme
    mql.addEventListener('change', e => {
        if (themeSelect.value !== 'system') return;

        // Only update the document here. Going through setTheme('dark'/'light') would
        // store the value in localStorage and thereby turn the "system" choice into an
        // explicit one on the next page load.
        document.documentElement.setAttribute('data-theme', e.matches ? 'dark' : 'light');
    });
}

initializeTheme();

View File

@ -0,0 +1,112 @@
// Wires up typeahead suggestions for the search box. Input is debounced by 250 ms, after
// which suggestions are fetched from /suggest/ and shown as a list of links that can be
// navigated with the arrow keys, picked with enter or a click, and dismissed with escape.
function setupTypeahead() {
    const query = document.getElementById('query');
    query.setAttribute('autocomplete', 'off');
    const queryBox = document.getElementById('suggestions-anchor');
    let timer = null;

    // Removes the suggestions box, if there is one
    function removeSuggestions() {
        const suggestions = document.getElementById('suggestions');
        if (suggestions != null) {
            suggestions.remove();
        }
    }

    // Copies the picked suggestion into the search box and closes the suggestions box
    function suggestionClickHandler(e) {
        query.value = e.target.text;
        query.focus();
        removeSuggestions();
        e.preventDefault();
    }

    // Keyboard navigation within the suggestions box
    function suggestionKeydownHandler(e) {
        if (e.key === "ArrowDown") {
            if (e.target.nextElementSibling != null) {
                e.target.nextElementSibling.focus();
            }
            e.preventDefault();
        }
        else if (e.key === "ArrowUp") {
            if (e.target.previousElementSibling != null) {
                e.target.previousElementSibling.focus();
            }
            else {
                // moving up from the first suggestion returns to the search box
                query.focus();
            }
            e.preventDefault();
        }
        else if (e.key === "Escape") {
            removeSuggestions();
            query.focus();
            e.preventDefault();
        }
    }

    // Replaces the suggestions box with one listing the given items; an empty list just
    // removes the box
    function showSuggestions(items) {
        removeSuggestions();

        if (items.length === 0) return;

        const suggestions = document.createElement('div');
        suggestions.setAttribute('id', 'suggestions');
        suggestions.setAttribute('class', 'suggestions');

        for (const text of items) {
            const item = document.createElement('a');
            item.textContent = text;
            item.setAttribute('href', '#');

            item.addEventListener('click', suggestionClickHandler);
            item.addEventListener('keydown', suggestionKeydownHandler);
            item.addEventListener('keypress', e => {
                if (e.key === "Enter") {
                    suggestionClickHandler(e);
                }
            });
            suggestions.appendChild(item);
        }
        queryBox.prepend(suggestions);
    }

    // Fetches suggestions for the current contents of the search box, once no further
    // call has been made for 250 ms
    function fetchSuggestions(e) {
        if (timer != null) {
            clearTimeout(timer);
        }
        timer = setTimeout(() => {
            const req = new XMLHttpRequest();

            req.onload = () => {
                let items;
                try {
                    items = JSON.parse(req.responseText);
                } catch (err) {
                    // The response was not JSON (e.g. an error page); keep whatever is shown
                    return;
                }
                showSuggestions(items);
            };

            req.open("GET", "/suggest/?partial="+encodeURIComponent(query.value));
            req.send();
        }, 250);
    }

    query.addEventListener("input", fetchSuggestions);

    // Clicking the search box dismisses the suggestions
    query.addEventListener("click", e => {
        removeSuggestions();
    });

    query.addEventListener("keydown", e => {
        if (e.key === "ArrowDown") {
            const suggestions = document.getElementById('suggestions');
            if (suggestions != null) {
                suggestions.childNodes[0].focus();
            }
            else {
                fetchSuggestions(e);
            }
            e.preventDefault();
        }
        else if (e.key === "Escape") {
            removeSuggestions();
            query.focus();
            e.preventDefault();
        }
    });
}

// Typeahead is only set up when the primary pointer is not coarse
if (!window.matchMedia("(pointer: coarse)").matches) {
    setupTypeahead();
}

View File

@ -0,0 +1,12 @@
<section class="card browse-result">
<h2 title="{{url.domain}}">{{displayDomain}}</h2>
<a href="{{url.proto}}://{{url.domain}}/">
<img src="/screenshot/{{domainId}}" title="{{displayDomain}} screenshot" alt="{{displayDomain}} screenshot" loading="lazy" width="400" height="300" />
</a>
<div class="utils">
<a href="/site/{{url.domain}}">Info</a>
<a href="/explore/{{url.domain}}">Similar Domains</a>
</div>
</section>

View File

@ -0,0 +1,34 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="UTF-8">
<title>Marginalia Search - {{query}}</title>
<link rel="stylesheet" href="/serp.css" />
<link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Marginalia">
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<meta name="robots" content="noindex" />
</head>
<body>
{{>search/parts/search-header}}
{{>search/parts/search-form}}
<span id="content-start"></span>
<div class="infobox">
{{#if focusDomain}}
Showing domains similar to <tt>{{focusDomain}}</tt>.
{{/if}}
{{#unless focusDomain}}
This list of domains is random. <a href="https://search.marginalia.nu/explore/random">Refresh</a> to get
new domains, or click <b>Similar Domains</b> to
take the helm.
{{/unless}}
</div>
<section class="cards">
{{#each results}}{{>search/browse-result}}{{/each}}
</section>
{{>search/parts/search-footer}}
</body>

View File

@ -0,0 +1,23 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="UTF-8">
<title>Marginalia Search - {{query}}</title>
<link rel="stylesheet" href="/serp.css" />
<link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Marginalia">
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<meta name="robots" content="noindex" />
</head>
<body>
{{>search/parts/search-header}}
{{>search/parts/search-form}}
<span id="content-start"></span>
<div class="infobox">
{{query}} = {{result}}
</div>
{{>search/parts/search-footer}}
</body>

View File

@ -0,0 +1,40 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="UTF-8">
<title>Marginalia Search - {{query}}</title>
<link rel="stylesheet" href="/serp.css" />
<link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Marginalia">
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<meta name="robots" content="noindex" />
</head>
<body>
{{>search/parts/search-header}}
{{>search/parts/search-form}}
<span id="content-start"></span>
<div class="infobox">
{{#unless entries}}
No definitions were found for that word
{{/unless}}
{{#if entries}}
<ul>
{{#each entries}}
<li>{{word}}, {{type}}: {{definition}}<br></li>
{{/each}}
</ul>
{{/if}}
</div>
{{#if entries}}
<div class="infobox">
<h2>Legal</h2>
This data is derived from <a href="https://en.wiktionary.org/">wiktionary</a>,
available under GFDL and CC BY-SA 3.0. <a href="https://dumps.wikimedia.org/legal.html">More Information</a>.
</div>
{{/if}}
{{>search/parts/search-footer}}
</body>

View File

@ -0,0 +1,24 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="UTF-8">
<title>Marginalia Search - {{title}}</title>
<link rel="stylesheet" href="/serp.css" />
<link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Marginalia">
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<meta name="robots" content="noindex" />
</head>
<body>
{{>search/parts/search-header}}
{{>search/parts/search-form}}
<span id="content-start"></span>
<div class="infobox">
<h2> {{ title }} </h2>
<div class="info"> {{{message}}} </div>
</div>
{{>search/parts/search-footer}}
</body>

View File

@ -0,0 +1,20 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
    {{!-- Error page shown in place of search results; reloads itself every 5 seconds.
          Expects indexState in the template context. --}}
    <meta charset="UTF-8">
    <title>Error</title>
    <link rel="stylesheet" href="/serp.css">
    <meta http-equiv="refresh" content="5">
</head>
<body>
<div class="infobox">
    <h1>Error</h1>
    <p>Oops! It appears the index server is <span class="headline">{{indexState}}</span>.</p>
    <p>The server was probably restarted to bring online some changes. Restarting the index typically takes
        a few minutes, during which searches can't be served. </p>
    <p>In the event of a longer outage, the <a rel="nofollow" href="https://twitter.com/MarginaliaNu">@marginalianu</a> feed
        on Twitter may have details, otherwise you can always send me an email at <tt>kontakt@marginalia.nu</tt>.</p>
    <p>This page will attempt to refresh automatically every few seconds.</p>
</div>
</body>
</html>

View File

@ -0,0 +1,22 @@
<section id="frontpage-about">
<h2>About</h2>
<div class="info">
<p>This is an independent DIY search engine that focuses on non-commercial content, and attempts to
show you sites you perhaps weren't aware of in favor of the sort of sites you probably already knew
existed. </p>
<p>
The software for this search engine is all custom-built, and all crawling and indexing is
done in-house. The project is open source. Feel free to poke about in the <a
href="https://git.marginalia.nu/">source code</a> or contribute
to the development!
</p>
<p>
The search engine is currently serving about <tt>{{searchPerMinute}}</tt> queries/minute.
</p>
<p>Consider <a href="https://memex.marginalia.nu/projects/edge/supporting.gmi">supporting the
project</a>!</p>
</div>
<div class="utils">
<a href="https://memex.marginalia.nu/projects/edge/about.gmi">Read More</a>
</div>
</section>

View File

@ -0,0 +1,17 @@
{{#if news}}
<section id="frontpage-news">
<h2>Publicity, Discussion and Events</h2>
<div class="info">
<dl>
{{#each news}}
<dt><a href="{{url}}" rel="nofollow">{{title}}</a></dt>
<dd>{{date}} {{source}} </dd>
{{/each}}
</dl>
</div>
<div class="utils">
<a href="/news.xml">📡 RSS Feed</a>
</div>
</section>
{{/if}}

View File

@ -0,0 +1,14 @@
<section id="frontpage-tips">
<h2>Public Beta Available</h2>
<div class="info">
<p>
A redesigned version of the search engine UI is available for beta testing.
Feel free to give it a spin, feedback is welcome!
The old one will also remain available if you hate it,
or have compatibility issues.
</p>
<p>
<a href="https://test.marginalia.nu/">Try it out!</a>
</p>
</div>
</section>

View File

@ -0,0 +1,21 @@
<section id="frontpage-tips">
<h2>Tips</h2>
<div class="info">
<p>
This search engine isn't particularly well equipped to answer queries
posed like questions, instead try to imagine some text that might appear
in the website you are looking for, and search for that.</p>
<p>
Where this search engine really shines is finding small, old and obscure websites about some
given topic, perhaps
<a href="/search?query=commander+keen&profile=yolo&js=default">old video games</a>,
<a href="/search?query=voynich+&profile=yolo&js=default">a mystery</a>,
<a href="/search?query=augustine+confessions&profile=yolo&js=default">theology</a>,
<a href="/search?query=Hermes+Trismegistus&profile=yolo&js=default">the occult</a>,
<a href="/search?query=knitting&profile=yolo&js=default">knitting</a>,
<a href="/search?query=scc+graph+algorithm&profile=yolo&js=default">computer science</a>,
or <a href="/search?query=salvador+dali&profile=yolo&js=default">art</a>.
</p>
</div>
</section>

View File

@ -0,0 +1,31 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="UTF-8">
<title>Marginalia Search</title>
<link rel="stylesheet" href="/serp.css" />
<link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Marginalia">
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<meta property="og:description" content="search.marginalia.nu is a small independent do-it-yourself search engine for surprising but content-rich websites that never ask you to accept cookies or subscribe to newsletters. The goal is to bring you the sort of grass fed, free range HTML your grandma used to write. " />
<meta property="og:locale" content="en_US" />
<meta property="og:site_name" content="search.marginalia.nu" />
<meta property="og:type" content="website" />
<meta property="og:url" content="https://search.marginalia.nu/" />
</head>
<body>
<!-- Hi there, fellow human being :-) -->
{{>search/parts/search-header}}
{{>search/parts/search-form}}
<section id="frontpage">
{{>search/index/index-news}}
{{>search/index/index-about}}
{{>search/index/index-redesign}}
</section>
{{>search/parts/search-footer}}
</body>

View File

@ -0,0 +1,46 @@
<h2>Filters</h2>
<ul>
{{#with removeJsOption}}
<li title="Exclude results with javascript"
{{#if set}}aria-checked="true" class="current"{{/if}}
{{#unless set}}aria-checked="false"{{/unless}}
role="checkbox">
<a href="{{url}}">{{name}}</a>
</li>
{{/with}}
{{#with reduceAdtechOption}}
<li title="Exclude results with tracking or likely affiliate links"
{{#if set}}aria-checked="true" class="current"{{/if}}
{{#unless set}}aria-checked="false"{{/unless}}
role="checkbox">
<a href="{{url}}">{{name}}</a>
</li>
{{/with}}
{{#with showRecentOption}}
<li title="Prefer recent results"
{{#if set}}aria-checked="true" class="current"{{/if}}
{{#unless set}}aria-checked="false"{{/unless}}
role="checkbox">
<a href="{{url}}">{{name}}</a>
</li>
{{/with}}
{{#with searchTitleOption}}
<li title="Require title match"
{{#if set}}aria-checked="true" class="current"{{/if}}
{{#unless set}}aria-checked="false"{{/unless}}
role="checkbox">
<a href="{{url}}">{{name}}</a>
</li>
{{/with}}
</ul>
<h3>Domains</h3>
<ul>
{{#each filterGroups}}
{{#each .}}
<li {{#if current}}aria-selected="true" class="current"{{/if}}><a href="{{url}}">{{displayName}}</a></li>
{{/each}}
<hr>
{{/each}}
</ul>
<!-- load this ASAP to avoid flicker -->
<script src="/menu.js"></script>

View File

@ -0,0 +1,124 @@
<footer class="onlyscreen">
<section id="tips-syntax">
<h1>Syntax</h1>
This is a keyword-based search engine. When entering multiple search terms, the search engine will
attempt to match them against documents where the terms occur in close proximity.<p>
Search terms can be excluded with a hyphen.<p>
While the search engine at present does not allow full text search, quotes can be used to
specifically search for names or terms in the title. Using quotes will also cause the search engine
to be as literal as possible in interpreting the query.<p>
Parentheses can be used to add terms to the query without giving weight to the terms when ranking
the search results.<p>
<h2>Samples</h2>
<dl class="query-samples">
<dt>soup -chicken</dt>
<dd>Look for keywords that contain <sample>soup</sample>, but not
<sample>chicken</sample>.</dd>
<dt>"keyboard"</dt>
<dd>Look for pages containing the exact word
<sample>keyboard</sample>, not <sample>keyboards</sample> or the like.</dd>
<dt>"steve mcqueen"</dt>
<dd>Look for pages containing the exact words <sample>steve mcqueen</sample>
in that order, with no words in between.</dd>
<dt>apology (plato)</dt>
<dd>Look for pages containing <sample>apology</sample> and <sample>plato</sample>, but only rank them
based on their relevance to <sample>apology</sample></dd>
</dl>
</section>
<section id="tips-keywords">
<h1>Special Keywords</h1>
Several special keywords are supported by the search engine.
<p>
<table>
<thead>
<tr><th>Keyword</th><th>Meaning</th></tr>
</thead>
<tbody>
<tr><td>site:<em>example.com</em></td><td>Display site information about <em>example.com</em></td></tr>
<tr><td>site:<em>example.com</em> <em>keyword</em></td><td>Search <em>example.com</em> for <em>keyword</em></td></tr>
<tr><td>browse:<em>example.com</em></td><td>Show similar websites to <em>example.com</em></td></tr>
<tr><td>ip:<em>127.0.0.1</em></td><td>Search documents hosted at <em>127.0.0.1</em></td></tr>
<tr><td>links:<em>example.com</em></td><td>Search documents linking to <em>example.com</em></td></tr>
<tr><td>tld:<em>edu</em> <em>keyword</em></td><td>Search documents with the top level domain <em>edu</em>.</td></tr>
<tr><td>?tld:<em>edu</em> <em>keyword</em></td><td>Prefer but do not require results with the top level domain <em>edu</em>.
This syntax is also possible for links:..., ip:... and site:...</td></tr>
<tr><td>q&gt;5</td><td>The amount of javascript and modern features is at least 5 (on a scale 0 to 25)</td></tr>
<tr><td>q&lt;5</td><td>The amount of javascript and modern features is at most 5 (on a scale 0 to 25)</td></tr>
<tr><td>year&gt;2005</td><td>(beta) The document was ostensibly published in or after 2005</td></tr>
<tr><td>year=2005</td><td>(beta) The document was ostensibly published in 2005</td></tr>
<tr><td>year&lt;2005</td><td>(beta) The document was ostensibly published in or before 2005</td></tr>
<tr><td>rank&gt;50</td><td>The ranking of the website is at least 50 in a span of 1 - 255</td></tr>
<tr><td>rank&lt;50</td><td>The ranking of the website is at most 50 in a span of 1 - 255</td></tr>
<tr><td>count&gt;10</td><td> The search term must appear in at least 10 results from the domain</td></tr>
<tr><td>count&lt;10</td><td> The search term must appear in at most 10 results from the domain</td></tr>
<tr><td>format:html5</td><td>Filter documents using the HTML5 standard. This is typically modern websites.</td></tr>
<tr><td>format:xhtml</td><td>Filter documents using the XHTML standard</td></tr>
<tr><td>format:html123</td><td>Filter documents using the HTML standards 1, 2, and 3. This is typically very old websites. </td></tr>
<tr><td>generator:wordpress</td><td>Filter documents with the specified generator, in this case wordpress</td></tr>
<tr><td>file:zip</td><td>Filter documents containing a link to a zip file (most file-endings work)</td></tr>
<tr><td>file:audio</td><td>Filter documents containing a link to an audio file</td></tr>
<tr><td>file:video</td><td>Filter documents containing a link to a video file</td></tr>
<tr><td>file:archive</td><td>Filter documents containing a link to a compressed archive</td></tr>
<tr><td>file:document</td><td>Filter documents containing a link to a document</td></tr>
<tr><td>-special:media</td><td>Filter out documents with audio or video tags</td></tr>
<tr><td>-special:scripts</td><td>Filter out documents with javascript</td></tr>
<tr><td>-special:affiliate</td><td>Filter out documents with likely Amazon affiliate links</td></tr>
<tr><td>-special:tracking</td><td>Filter out documents with analytics or tracking code</td></tr>
<tr><td>-special:cookies</td><td>Filter out documents with cookies</td></tr>
</tbody>
</table>
</section>
<section>
<h1>Results Legend</h1>
<p>
The estimated relevance of the search result is indicated using the color saturation
of the color of the search result, as well as the order the results are presented.
</p>
<p>
Information about the position of the match is indicated using a dot matrix
in the bottom bar of each search result. Each dot represents four sentences,
and the dots are presented in an order of top-to-bottom, left-to-right.
<br><br><span class="meta positions">⣿⠃⠀⠀</span> &nbsp; &mdash; The terms occur heavily toward the beginning of the document.
<br><br><span class="meta positions">⠠⠀⡄⠁</span> &nbsp; &mdash; The terms occur sparsely throughout the document.
<br><br><span class="meta positions">⠀⠁⠀⠀</span> &nbsp; &mdash; The terms occur only in a single sentence.
</p>
<p> Potential problems with the document are presented with a warning triangle, e.g. ⚠ 3.
Desktop users can mouse-over this to get a detailed breakdown.</p>
</section>
<section id="legal">
<h1>Policies</h1>
This website complies with the GDPR by <em>not collecting any personal
information</em>, and with the EU Cookie Directive by <em>not using
cookies</em>. <a href="https://memex.marginalia.nu/projects/edge/privacy.gmi">More Information</a>.
<h1> Contact </h1>
Reach me at <tt><a href="mailto:kontakt@marginalia.nu">kontakt@marginalia.nu</a></tt>,
<tt><a href="https://twitter.com/MarginaliaNu">@MarginaliaNu</a></tt> on twitter.
<h1> Open Source </h1>
The search engine is open source with an AGPL license. The sources can be perused at
<tt><a href="https://git.marginalia.nu/">https://git.marginalia.nu/</a></tt>.
<h1>Data Sources</h1>
IP geolocation is sourced from the IP2Location LITE data available from
<a rel="external noopener nofollow" href="https://lite.ip2location.com/">https://lite.ip2location.com/</a>
under
<a rel="external noopener nofollow" href="https://creativecommons.org/licenses/by-sa/4.0/">CC-BY-SA&nbsp;4.0</a>.
</section>
</footer>
<script src="/tts.js"></script>

View File

@ -0,0 +1,18 @@
<form action="/search" method="get" id="search-form">
<div id="search-box">
<h1>
Search The Internet
</h1>
<div id="suggestions-anchor"></div>
<input {{#unless query}}autofocus{{/unless}} type="text" id="query" name="query" placeholder="Search..." value="{{query}}">
<input type="hidden" name="js" value="{{js}}">
<input type="hidden" name="adtech" value="{{adtech}}">
<input type="hidden" name="searchTitle" value="{{searchTitle}}">
<input type="hidden" name="profile" value="{{profile}}">
<input type="hidden" name="recent" value="{{recent}}">
<input type="submit" form="search-form" title="Execute Search" value="Search" autocomplete="off">
</div>
</form>
<!-- load the first stage mobile customizations script early to avoid flicker -->
<script src="/main.js"></script>

View File

@ -0,0 +1,21 @@
<a name="top"></a>
<header>
<nav>
<a href="#" class="screenreader-only" onClick="">Skip to content</a>
<a href="https://www.marginalia.nu/">Marginalia</a>
<a href="https://memex.marginalia.nu/projects/edge/about.gmi">About</a>
<a href="https://memex.marginalia.nu/projects/edge/supporting.gmi">Donate</a>
<a class="extra" href="https://search.marginalia.nu/explore/random">Random</a>
</nav>
<div id="theme">
<label for="theme-select" class="screenreader-only">Color Theme</label>
<select id="theme-select">
<option value="system" selected>System</option>
<option value="light">Light</option>
<option value="dark">Dark</option>
</select>
</div>
</header>
<!-- load this ASAP to avoid color theme flicker -->
<script src="/theme.js"></script>

View File

@ -0,0 +1,32 @@
<section data-ms-rank="{{first.matchRank}}" class="card search-result" >
{{#with first}}
<div class="url"><a rel="nofollow external" href="{{url}}">{{url}}</a></div>
<h2> <a tabindex="-1" class="title" rel="nofollow external" href="{{url}}">{{title}}</a> </h2>
<p class="description">{{description}}</p>
{{/with}}
<div class="utils">
Also from {{first.url.domain}}
</div>
<ul class="additional-results">
{{#each rest}}
<li><a href="{{url}}">{{title}}</a></li>
{{/each}}
</ul>
{{#with first}}
<div class="utils">
<a href="/site/{{url.domain}}" title="Domain Information">Info</a>
<a href="/site-search/{{url.domain}}/{{query}}?profile={{profile}}" title="More results from this domain">{{resultsFromSameDomain}}+</a>
<div class="meta">
{{#each problems}}
<span class="problem" title="{{description}}">{{name}}</span>
{{/each}}
<span aria-hidden="true" class="meta positions"
title="Positions where keywords were found within the document">{{positions}}</span>
</div>
</div>
{{/with}}
</section>
<hr class="w3m-helper" />

View File

@ -0,0 +1,22 @@
<!-- {{termScore}} -->
<section data-ms-rank="{{matchRank}}" class="card search-result" >
<div class="url"><a rel="nofollow external" href="{{url}}">{{url}}</a></div>
<h2> <a tabindex="-1" class="title" rel="nofollow external" href="{{url}}">{{title}}</a> </h2>
<p class="description">{{description}}</p>
<div class="utils">
{{#unless focusDomain}}
<a href="/site/{{url.domain}}" title="Domain Information">Info</a>
{{#if hasMoreResults}}<a href="/site-search/{{url.domain}}/{{query}}?profile={{profile}}" title="More results from this domain">{{resultsFromSameDomain}}+</a>{{/if}}{{/unless}}
<div class="meta">
{{#each problems}}
<span class="problem" title="{{description}}">{{name}}</span>
{{/each}}
<span aria-hidden="true" class="meta positions"
title="Positions where keywords were found within the document">{{positions}}</span>
<div class="screenreader-only">Terms appear in {{positionsCount}} positions</div>
</div>
</div>
</section>
<hr class="w3m-helper" />

View File

@ -0,0 +1,75 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="UTF-8">
<title>Marginalia Search - {{query}}</title>
<link rel="stylesheet" href="/serp.css" />
<link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Marginalia">
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<meta name="robots" content="noindex" />
</head>
<body data-filter="{{filters.currentFilter}}">
{{#if newFilter}} <div class="screenreader-only" role="status">Search Filters Updated</div> {{/if}}
<!-- Hi there, fellow human being :-) -->
{{>search/parts/search-header}}
{{>search/parts/search-form}}
<span id="content-start"></span>
<section class="sidebar-narrow">
<section id="results" class="sb-left">
{{#if focusDomain}}
<div class="infobox">
Showing search results from <a href="/site/{{focusDomain}}">{{focusDomain}}</a>.
</div>
{{/if}}
{{#unless results}}
<div class="infobox">
No search results found. If you believe this is an error, consider either
<a href="https://github.com/MarginaliaSearch/MarginaliaSearch/issues">submitting an issue on GitHub</a>,
or sending an email to <a href="mailto:kontakt@marginalia.nu">kontakt@marginalia.nu</a> describing
the problem.
</div>
{{/unless}}
{{#unless focusDomain}}
<div class="infobox screenreader-only">
Showing {{resultCount}} search results.
</div>
{{/unless}}
{{#each results}}
{{#if hasMultiple}}
{{>search/parts/search-result-rest}}
{{else}}
{{#with first}}
{{>search/parts/search-result}}
{{/with}}
{{/if}}
{{/each}}
{{#if multipage}}
<nav aria-label="pagination">
{{#each resultPages}}
<a {{#unless current}}href="{{{href}}}"{{/unless}} class="page-link {{#if current}}active{{/if}}">{{number}}</a>
{{/each}}
</nav>
{{/if}}
</section>
{{#with filters}}
<section id="filters" class="sb-right">
{{>search/parts/search-filters}}
</section>
{{/with}}
</section>
{{>search/parts/search-footer}}
</body>
</html>

View File

@ -0,0 +1,40 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="UTF-8">
<title>Marginalia Search - {{domainA}} and {{domainB}}</title>
<link rel="stylesheet" href="/serp.css" />
<link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Marginalia">
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<meta name="robots" content="noindex" />
</head>
<body>
{{>search/parts/search-header}}
{{>search/parts/search-form}}
<span id="content-start"></span>
<div class="infobox">
Showing results containing links between <a href="/site/{{domainA}}">{{domainA}}</a> and <a href="/site/{{domainB}}">{{domainB}}</a>.
</div>
{{#each tests}}{{.}}{{/each}}
<div {{#if hasBoth}}id="crosstalk-view"{{/if}}>
<div>
{{#each forward}}
{{>search/parts/search-result}}
{{/each}}
</div>
<div>
{{#each backward}}
{{>search/parts/search-result}}
{{/each}}
</div>
</div>
{{>search/parts/search-footer}}
</body>

View File

@ -0,0 +1,22 @@
{{#if feed.items}}
{{#with feed}}
<h2><a title="Atom/RSS feed" target="external" href="{{feedUrl}}"><img width="16" height="16" src="/rss.svg"></a> Feed</h2>
<dl>
{{#each items}}
<dt><a href="{{url}}" rel="external noopener ugc">{{title}}</a></dt>
<dd><date>{{pubDay}}</date><br>{{{descriptionSafe}}}</dd>
{{/each}}
</dl>
{{/with}}
{{/if}}
{{#unless feed.items}}{{#if samples}}
<h2>Sample</h2>
<dl>
{{#each samples}}
<dt><a href="{{url}}" rel="external noopener ugc">{{title}}</a></dt>
<dd>{{description}}</dd>
{{/each}}
</dl>
{{/if}}{{/unless}}

View File

@ -0,0 +1,8 @@
<p>This website is <em>blacklisted</em>. This excludes it from crawling and indexing.</p>
<p>This is usually because of some form of misbehavior on the webmaster's end.
Either annoying search engine spam, or tasteless, bad faith content.</p>
<p>Occasionally this is done hastily and in error. If you would like the decision
reviewed, you may use the <a href="?view=report">report form</a> to file an appeal.
</p>

View File

@ -0,0 +1,13 @@
<fieldset>
<legend>Index</legend>
State: {{state}}<br/>
Domain ID: {{domainId}} <br/>
Node Affinity: {{nodeAffinity}} <br/>
Pages Known: {{pagesKnown}} <br/>
Pages Crawled: {{pagesFetched}} <br/>
Pages Indexed: {{pagesIndexed}} <br/>
<p></p>
IP: {{ip}} {{#if ipCountry}}<span title="{{ipCountry}}">{{getIpFlag}}</span>{{/if}}<br/>
<span title="Autonomous System">AS</span>: {{#if asn}}<a href="/search?query=as:{{asn}}&profile=corpo">{{asn}}</a> {{asnOrg}} {{asnCountry}}{{/if}} <br/>
</fieldset>
<br/>

View File

@ -0,0 +1,12 @@
<form method="POST" action="/site/suggest/">
<fieldset>
<legend>Crawling</legend>
This website is not queued for crawling. If you would like it to be crawled,
use the checkbox and button below.<p/>
<input type="hidden" name="id" value="{{domainId}}" />
<input type="checkbox" id="nomisclick" name="nomisclick" /> <label for="nomisclick"> This is not a mis-click </label>
<br/>
<br/>
<input type="submit" value="Add {{domain}} to queue" />
</fieldset>
</form>

View File

@ -0,0 +1,9 @@
<fieldset>
<legend>Crawling</legend>
This website is not known to the search engine.
To submit the website for crawling, follow <a
rel="noopener noreferrer"
target="_blank"
href="https://github.com/MarginaliaSearch/submit-site-to-marginalia-search">these instructions</a>.
</fieldset>

View File

@ -0,0 +1,23 @@
<h2>Indexing Information</h2>
{{#if domainState.blacklisted}}
{{>search/site-info/site-info-index-blacklisted}}
{{/if}}
{{#if domainState.unknownDomain}}
{{>search/site-info/site-info-index-unknown}}
{{/if}}
{{#if domainState.inCrawlQueue}}
<p>
This website is in the queue for crawling.
It may take up to a month before it is indexed.
</p>
{{/if}}
{{#if domainState.suggestForCrawling}}
{{>search/site-info/site-info-index-suggest}}
{{/if}}
{{#if domainState.indexed}}
{{>search/site-info/site-info-index-indexed}}
{{/if}}

View File

@ -0,0 +1,7 @@
<h2>Links</h2>
<fieldset>
<legend>Link Graph</legend>
Ranking: {{ranking}}%<br/>
Incoming Links: {{incomingLinks}} <br/>
Outbound Links: {{outboundLinks}} <br/>
</fieldset>

View File

@ -0,0 +1,60 @@
<section id="complaint">
{{#if submitted}}
<h2>Your complaint against {{domain}} has been submitted</h2>
<p>The review process is manual and may take a while. If urgent action is necessary,
reach me at kontakt@marginalia.nu!
</p>
{{/if}}
{{#unless submitted}}
<h2>Flag {{domain}} for review</h2>
<p>
Note, this is not intended to police acceptable thoughts or ideas.
<p>
That said, offensive content in obvious bad faith is not tolerated, especially when designed
to crop up when you didn't go looking for it. How and where it is said is more
important than what is said.
<p>
This form can also be used to appeal unfairly blacklisted sites.
<p>
<form method="POST">
<fieldset>
<legend>Flag for Review</legend>
<label for="category">Category</label><br>
<select name="category" id="category">
{{#each category}} <option value="{{categoryName}}">{{categoryDesc}}</option> {{/each}}
</select>
<br>
<br>
<label for="description">Description</label><br>
<textarea type="text" name="description" id="description" rows=4></textarea><br>
<br>
<label for="samplequery">(Optional) Search Query </label><br>
<input type="text" name="samplequery" id="samplequery" maxlength="255" /><br>
<br>
<br/>
<input type="submit" value="File complaint" />
</fieldset>
</form>
<p>
Communicating through forms and tables is a bit impersonal,
you may also reach a human being through email at <tt>kontakt@marginalia.nu</tt>.
{{/unless}}
{{#if complaints}}
<hr>
<h2> Complaints against {{domain}} </h2>
<table border width=100%>
<tr><th>Category</th><th>Submitted</th><th>Reviewed</th></tr>
{{#each complaints}}
<tr>
<td>{{category}}</td>
<td>{{submitTime}}</td>
<td>{{#if reviewed}}&check;{{/if}}</td>
</tr>
{{/each}}
</table>
{{/if}}
</section>

View File

@ -0,0 +1,124 @@
<div class="infobox">
A <a href="/explore/{{domain}}">visual exploration</a> mode is also available.
</div>
<div id="similar-view" data-layout="{{layout}}">
<div id="similar-info">
<h2><span title="External Link">&#x1F30E;</span>&nbsp;<a rel="external noopener" href="{{siteUrl}}">{{domain}}</a></h2>
{{#if hasScreenshot}}
<a rel="external noopener" href="{{siteUrl}}">
<img class="screenshot" width="300" height="225" src="/screenshot/{{domainId}}" alt="Screenshot of {{domain}}" />
</a>
{{/if}}
{{#unless hasScreenshot}}
<p>Screenshot not yet available.</p>
{{/unless}}
{{#with domainInformation}}
{{> search/site-info/site-info-feed}}
{{> search/site-info/site-info-index}}
{{> search/site-info/site-info-links}}
{{/with}}
</div>
{{#if linking}}
<div id="similar-links">
<h2>Linking Domains</h2>
<table class="similarity-table">
<tr>
<th colspan="3">Meta</th>
<th>Rank</th>
<th>Domain</th>
<th>Similarity</th>
</tr>
{{#each linking}}
<tr>
<td>
{{#if indexed}}
{{#if active}}
<span title="Indexed">&#128064;</span>
{{/if}}
{{#unless active}}
<span title="Problem">&#128293;</span>
{{/unless}}
{{/if}}
</td>
<td>
{{#if screenshot}}&#x1f4f7;{{/if}}
</td>
<td>
{{#if linkType.isLinked}}
<span title="{{linkType.description}}"><a href="/crosstalk/?domains={{domain}},{{url.domain}}">{{{linkType}}}</a></span>
{{/if}}
</td>
<td>
<span title="{{rank}}%">{{{rankSymbols}}}</span>
</td>
<td>
<a href="/site/{{url.domain}}?view=similar" rel="external noopener nofollow">{{url.domain}}</a></td>
<td>
<progress value="{{relatedness}}" max="100.0">{{relatedness}}</progress><br>
</td>
</tr>
{{/each}}
</table>
</div>
{{/if}}
{{#if similar}}
<div id="similar-domains">
<h2>Similar Domains</h2>
<table class="similarity-table">
<tr>
<th colspan="3">Meta</th>
<th>Rank</th>
<th>Domain</th>
<th>Similarity</th>
</tr>
{{#each similar}}
<tr>
<td>
{{#if indexed}}
{{#if active}}
<span title="Indexed">&#128064;</span>
{{/if}}
{{#unless active}}
<span title="Problem">&#128293;</span>
{{/unless}}
{{/if}}
</td>
<td>
{{#if screenshot}}&#x1f4f7;{{/if}}
</td>
<td>
{{#if linkType.isLinked}}
<span title="{{linkType.description}}"><a href="/crosstalk/?domains={{domain}},{{url.domain}}">{{{linkType}}}</a></span>
{{/if}}
</td>
<td>
<span title="{{rank}}%">{{{rankSymbols}}}</span>
</td>
<td>
<a href="/site/{{url.domain}}?view=similar" rel="external noopener nofollow">{{url.domain}}</a></td>
<td>
<progress value="{{relatedness}}" max="100.0">{{relatedness}}</progress><br>
</td>
</tr>
{{/each}}
</table>
<p><b>Note</b>: Because two domains are considered similar does not always mean they're in
cahoots. Similarity is a measure of how often they appear in the same contexts,
which may be an association like peas and carrots, but some pairings are also defined by their
contrasting opposition, like Sparta and Athens.</p>
</div>
{{/if}}
</div>

View File

@ -0,0 +1,58 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="UTF-8">
<title>Marginalia Search - {{domain}}</title>
<link rel="stylesheet" href="/serp.css" />
<link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Marginalia">
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<meta name="robots" content="noindex" />
</head>
<body>
{{>search/parts/search-header}}
{{>search/parts/search-form}}
<span id="content-start"></span>
{{#with view}}
<nav id="siteinfo-nav">
<h2>{{domain}}</h2>
<ul>
<li {{#if info}}class="current"{{/if}}><a href="?view=info">Info</a></li>
<li {{#if docs}}class="current"{{/if}}>{{#if known}}<a href="?view=docs">Docs</a>{{/if}}{{#unless known}}<a class="link-unavailable" title="This domain is not known by the search engine">Docs</a>{{/unless}}</li>
<li {{#if links}}class="current"{{/if}}><a href="?view=links">Backlinks</a></li>
<li {{#if report}}class="current"{{/if}}>{{#if known}}<a href="?view=report">Report</a>{{/if}}{{#unless known}}<a class="link-unavailable" title="This domain is not known by the search engine">Report</a>{{/unless}}</li>
</ul>
</nav>
{{/with}}
{{#if view.links}}
<div class="infobox">
Showing search results with links to {{domain}}.
</div>
{{#each results}}{{>search/parts/search-result}}{{/each}}
{{/if}}
{{#if view.docs}}
<div class="infobox">
Showing documents found in {{domain}}.
</div>
{{#each results}}{{>search/parts/search-result}}{{/each}}
{{/if}}
{{#if view.report}}
{{>search/site-info/site-info-report}}
{{/if}}
{{#if view.info}}
{{>search/site-info/site-info-summary}}
{{/if}}
{{>search/parts/search-footer}}
</body>

View File

@ -0,0 +1,52 @@
package nu.marginalia.search.command.commands;
import nu.marginalia.search.command.SearchParameters;
import nu.marginalia.search.exceptions.RedirectException;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
 * Unit tests for {@link BangCommand}: the redirect raised by {@code process}
 * and the behaviour of {@code matchBangPattern} for a bang placed at the
 * start, end, or middle of the query.
 */
class BangCommandTest {
    private final BangCommand bangCommand = new BangCommand();

    /** A query carrying the "!g" bang must redirect to a Google search for the remaining terms. */
    @Test
    public void testG() {
        // assertThrows replaces the try / fail / catch idiom and hands back the
        // exception so the redirect target can be inspected.
        var redirect = Assertions.assertThrows(RedirectException.class,
                () -> bangCommand.process(null,
                        new SearchParameters(" !g test",
                                null, null, null, null, null, false, 1)),
                "Should have thrown RedirectException");

        assertEquals("https://www.google.com/search?q=test", redirect.newUrl);
    }

    /** Bang at the start of the query. */
    @Test
    public void testMatchPattern() {
        var match = bangCommand.matchBangPattern("!g test", "!g");

        assertTrue(match.isPresent());
        // JUnit's contract is assertEquals(expected, actual)
        assertEquals("test", match.get());
    }

    /** Bang at the end of the query. */
    @Test
    public void testMatchPattern2() {
        var match = bangCommand.matchBangPattern("test !g", "!g");

        assertTrue(match.isPresent());
        assertEquals("test", match.get());
    }

    /** Bang in the middle of the query; the surrounding terms are joined by a single space. */
    @Test
    public void testMatchPattern3() {
        var match = bangCommand.matchBangPattern("hello !g world", "!g");

        assertTrue(match.isPresent());
        assertEquals("hello world", match.get());
    }
}

View File

@ -0,0 +1,359 @@
package nu.marginalia.search.paperdoll;
import com.google.gson.Gson;
import com.google.inject.AbstractModule;
import com.google.inject.Guice;
import com.zaxxer.hikari.HikariConfig;
import com.zaxxer.hikari.HikariDataSource;
import nu.marginalia.api.domains.DomainInfoClient;
import nu.marginalia.api.domains.model.DomainInformation;
import nu.marginalia.api.domains.model.SimilarDomain;
import nu.marginalia.api.searchquery.QueryClient;
import nu.marginalia.api.searchquery.model.query.QueryResponse;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.screenshot.ScreenshotService;
import nu.marginalia.search.SearchModule;
import nu.marginalia.search.SearchService;
import nu.marginalia.service.ServiceId;
import nu.marginalia.service.discovery.ServiceRegistryIf;
import nu.marginalia.service.discovery.property.ServiceEndpoint;
import nu.marginalia.service.module.ServiceConfigurationModule;
import nu.marginalia.test.TestMigrationLoader;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Tag;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
import org.testcontainers.containers.MariaDBContainer;
import org.testcontainers.junit.jupiter.Container;
import org.testcontainers.junit.jupiter.Testcontainers;
import spark.Spark;
import java.net.URISyntaxException;
import java.sql.SQLException;
import java.util.*;
import java.util.concurrent.CompletableFuture;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.anyInt;
import static org.mockito.Mockito.when;
/** This class is a special test class that sets up a search service
 * and registers some search results, without actually starting the rest
 * of the environment.  This is used to test the search service in isolation
 * when working on the frontend.
 * <p>
 * It's not actually a test, but it's in the test directory because it's
 * using test related classes.
 * <p>
 * When using gradle, run ./gradlew paperDoll --info to run this test,
 * the system will wait for you to kill the process to stop the test,
 * and the UI is available at port 9999.
 * <p>
 * Both {@link #setup()} and {@link #run()} return immediately unless the
 * {@code runPaperDoll} system property is set to {@code true}.
 * <p>
 * The class doubles as the Guice module that replaces the service registry,
 * the data source and the downstream clients with canned stand-ins; see
 * {@link #configure()}.
 */
@Testcontainers
@Tag("paperdoll")
public class SearchServicePaperDoll extends AbstractModule {

    /** System property that has to be {@code true} for the harness to do anything. */
    private static final String ENABLE_PROPERTY = "runPaperDoll";

    @Container
    static MariaDBContainer<?> mariaDBContainer = new MariaDBContainer<>("mariadb")
            .withDatabaseName("WMSA_prod")
            .withUsername("wmsa")
            .withPassword("wmsa")
            .withNetworkAliases("mariadb");

    private static HikariDataSource dataSource;

    // These lists are handed out by reference (to the canned query response and
    // to the mocked client futures respectively), so entries added after the
    // mocks are wired up are still visible through them.
    private static final List<DecoratedSearchResultItem> results = new ArrayList<>();
    private static final List<SimilarDomain> dummyLinks = new ArrayList<>();

    private static QueryResponse searchResponse;
    private static final Gson gson = GsonFactory.get();

    /**
     * Appends a canned search result to the shared result list.
     *
     * @param url         address of the result; its {@code hashCode()} is also used for the id arguments
     * @param title       title passed on to the result item
     * @param description description passed on to the result item
     * @param features    HTML features, encoded with {@link HtmlFeature#encode}
     * @param quality     quality value passed on to the result item
     * @param score       score, passed both to the raw and to the decorated result item
     * @param positions   positions value passed on to the result item
     * @throws RuntimeException wrapping the underlying failure if the result can not be constructed
     */
    void registerSearchResult(
            String url,
            String title,
            String description,
            Collection<HtmlFeature> features,
            double quality,
            double score,
            long positions)
    {
        try {
            results.add(new DecoratedSearchResultItem(
                    new SearchResultItem(url.hashCode(), 2, 3, score, 0),
                    new EdgeUrl(url),
                    title,
                    description,
                    quality,
                    "HTML5",
                    HtmlFeature.encode(features),
                    null,
                    url.hashCode(),
                    400,
                    positions,
                    score,
                    4,
                    null)
            );
        }
        catch (Exception e) {
            // Keep the cause and the offending input; a bare RuntimeException is undebuggable
            throw new RuntimeException("Failed to register search result for " + url, e);
        }
    }

    /**
     * Prepares the database and the canned query response.
     * <p>
     * Connects to the MariaDB container, runs the migrations, seeds a few
     * news items, domains and random-domain entries, and builds the
     * {@link QueryResponse} that the mocked query client returns.
     */
    @BeforeAll
    public static void setup() throws URISyntaxException {
        if (!Boolean.getBoolean(ENABLE_PROPERTY)) {
            return;
        }

        HikariConfig config = new HikariConfig();
        config.setJdbcUrl(mariaDBContainer.getJdbcUrl());
        config.setUsername("wmsa");
        config.setPassword("wmsa");
        dataSource = new HikariDataSource(config);

        TestMigrationLoader.flywayMigration(dataSource);

        System.setProperty("service-name", "search");
        System.setProperty("search.websiteUrl", "http://localhost:9999/");

        try (var conn = dataSource.getConnection();
             var newsStmt = conn.prepareStatement("""
                     INSERT INTO SEARCH_NEWS_FEED(TITLE, LINK, SOURCE, LIST_DATE)
                     VALUES (?, ?, ?, ?)
                     """);
             var domainStmt = conn.prepareStatement("""
                     INSERT INTO EC_DOMAIN(ID, DOMAIN_NAME, DOMAIN_TOP, NODE_AFFINITY)
                     VALUES (?, ?, ?, ?)
                     """);
             var randomStmt = conn.prepareStatement("""
                     INSERT INTO EC_RANDOM_DOMAINS(DOMAIN_ID, DOMAIN_SET)
                     VALUES (?, ?)
                     """)
        ) {
            insertNews(newsStmt, "Lex Luthor elected president", "https://www.example.com/foo", "Daily Planet");
            insertNews(newsStmt, "Besieged Alesian onlookers confused as Caesar builds a wall around his wall around the city walls", "https://www.example2.com/bar", "The Gaulish Observer");
            insertNews(newsStmt, "Marginalia acquires Google", "https://www.example3.com/baz", "The Dependent");

            insertDomain(domainStmt, 1, "www.example.com", "example.com", 1);
            insertDomain(domainStmt, 2, "www.example2.com", "example2.com", 2);
            insertDomain(domainStmt, 3, "www.example3.com", "example3.com", 3);

            insertRandomDomain(randomStmt, 1, 0);
            insertRandomDomain(randomStmt, 2, 0);
            insertRandomDomain(randomStmt, 3, 0);
        } catch (SQLException e) {
            // Best effort: a seeding failure is reported, but the harness still
            // comes up so the rest of the UI can be worked on.
            e.printStackTrace();
        }

        searchResponse = new QueryResponse(
                new SearchSpecification(new SearchQuery(), List.of(), "", "test",
                        SpecificationLimit.none(),
                        SpecificationLimit.none(),
                        SpecificationLimit.none(),
                        SpecificationLimit.none(),
                        new QueryLimits(10, 20, 3, 4),
                        QueryStrategy.AUTO,
                        ResultRankingParameters.sensibleDefaults()
                ),
                results,
                List.of(),
                List.of(),
                1,
                1,
                null
        );
    }

    /** Inserts one row into SEARCH_NEWS_FEED, dated with the current time. */
    private static void insertNews(java.sql.PreparedStatement stmt, String title, String link, String source) throws SQLException {
        stmt.setString(1, title);
        stmt.setString(2, link);
        stmt.setString(3, source);
        stmt.setDate(4, new java.sql.Date(System.currentTimeMillis()));
        stmt.execute();
    }

    /** Inserts one row into EC_DOMAIN. */
    private static void insertDomain(java.sql.PreparedStatement stmt, int id, String domainName, String domainTop, int nodeAffinity) throws SQLException {
        stmt.setInt(1, id);
        stmt.setString(2, domainName);
        stmt.setString(3, domainTop);
        stmt.setInt(4, nodeAffinity);
        stmt.execute();
    }

    /** Inserts one row into EC_RANDOM_DOMAINS. */
    private static void insertRandomDomain(java.sql.PreparedStatement stmt, int domainId, int domainSet) throws SQLException {
        stmt.setInt(1, domainId);
        stmt.setInt(2, domainSet);
        stmt.execute();
    }

    /**
     * Starts the search service with the mocked dependencies, registers
     * placeholder routes and canned data, and then blocks until the process
     * is killed or the thread is interrupted.
     */
    @Test
    public void run() throws Exception {
        if (!Boolean.getBoolean(ENABLE_PROPERTY)) {
            return;
        }

        var injector = Guice.createInjector(
                new ServiceConfigurationModule(ServiceId.Search),
                new SearchModule(),
                this);

        injector.getInstance(SearchService.class);

        List<String> suggestions = List.of("foo", "bar", "baz");

        Spark.get("/suggest/", (rq, rsp) -> {
            rsp.type("application/json");
            return gson.toJson(suggestions);
        });

        // Every screenshot request is answered with the same placeholder image
        Spark.get("/screenshot/*", (rq, rsp) -> {
            rsp.type("image/svg+xml");
            return """
                    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
                    <svg
                    xmlns="http://www.w3.org/2000/svg"
                    width="640px"
                    height="480px"
                    viewBox="0 0 640 480"
                    version="1.1">
                    <g>
                    <rect
                    style="fill:#808080"
                    id="rect288"
                    width="595.41992"
                    height="430.01825"
                    x="23.034981"
                    y="27.850344" />
                    <text
                    xml:space="preserve"
                    style="font-size:100px;fill:#909090;font-family:sans-serif;"
                    x="20"
                    y="120">Placeholder</text>
                    <text
                    xml:space="preserve"
                    style="font-size:32px;fill:#000000;font-family:monospace;"
                    x="320" y="240" dominant-baseline="middle" text-anchor="middle">Lorem Ipsum As F</text>
                    </g>
                    </svg>
                    """;
        });

        registerSearchResult("https://www.example.com/foo", "Foo", "Lorem ipsum dolor sit amet", Set.of(), 0.5, 0.5, ~0L);
        registerSearchResult("https://www.example2.com/bar", "Bar", "Some text goes here", Set.of(), 0.5, 0.5, 1L);
        registerSearchResult("https://www.example3.com/baz", "All HTML Features", "This one's got every feature", EnumSet.allOf(HtmlFeature.class), 0.5, 0.5, 1L);

        dummyLinks.add(new SimilarDomain(
                new EdgeUrl("https://www.example.com/foo"),
                1,
                0.5,
                0.5,
                true,
                true,
                true,
                SimilarDomain.LinkType.FOWARD
        ));
        dummyLinks.add(new SimilarDomain(
                new EdgeUrl("https://www.example2.com/foo"),
                2,
                0.5,
                1,
                false,
                false,
                true,
                SimilarDomain.LinkType.BACKWARD
        ));
        dummyLinks.add(new SimilarDomain(
                new EdgeUrl("https://www.example3.com/foo"),
                3,
                0,
                0.5,
                false,
                false,
                false,
                SimilarDomain.LinkType.BIDIRECTIONAL
        ));

        // Park this thread for good. Waiting on ourselves never completes, and
        // unlike an empty spin loop it does not keep a CPU core busy.
        Thread.currentThread().join();
    }

    /**
     * Guice bindings for the harness: a mocked service registry that reports
     * localhost:9999 as the endpoint, the container-backed data source, and
     * mocked query, domain info and screenshot services returning canned data.
     */
    @Override
    public void configure() {
        try {
            var serviceRegistry = Mockito.mock(ServiceRegistryIf.class);
            when(serviceRegistry.registerService(any(), any(), any())).thenReturn(new ServiceEndpoint("localhost", 9999));
            bind(ServiceRegistryIf.class).toInstance(serviceRegistry);

            bind(HikariDataSource.class).toInstance(dataSource);

            var qsMock = Mockito.mock(QueryClient.class);
            when(qsMock.search(any())).thenReturn(searchResponse);
            bind(QueryClient.class).toInstance(qsMock);

            var asMock = Mockito.mock(DomainInfoClient.class);
            when(asMock.isAccepting()).thenReturn(true);
            when(asMock.linkedDomains(anyInt(), anyInt())).thenReturn(CompletableFuture.completedFuture(dummyLinks));
            when(asMock.similarDomains(anyInt(), anyInt())).thenReturn(CompletableFuture.completedFuture(dummyLinks));
            when(asMock.domainInformation(anyInt())).thenReturn(CompletableFuture.completedFuture(
                    new DomainInformation(new EdgeDomain("www.example.com"),
                            false,
                            123,
                            123,
                            123,
                            123,
                            123,
                            1,
                            0.5,
                            false,
                            false,
                            false,
                            "127.0.0.1",
                            1,
                            "ACME",
                            "CA",
                            "CA",
                            "Exemplary")
            ));
            bind(DomainInfoClient.class).toInstance(asMock);

            var sss = Mockito.mock(ScreenshotService.class);
            when(sss.hasScreenshot(anyInt())).thenReturn(true);
            bind(ScreenshotService.class).toInstance(sss);
        }
        catch (Exception e) {
            throw new RuntimeException(e);
        }
    }
}

View File

@ -0,0 +1,37 @@
package nu.marginalia.util;
import nu.marginalia.LanguageModels;
import nu.marginalia.WmsaHome;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Optional;
/**
 * Test helper for locating the language model files on disk.
 * <p>
 * The model directory is taken from the {@code LANGUAGE_MODELS_HOME}
 * environment variable when it is set, and otherwise falls back to the
 * {@code model} directory under the WMSA home path.
 */
public class TestLanguageModels {
    /** Fallback location used when LANGUAGE_MODELS_HOME is not set. */
    private static final Path LANGUAGE_MODELS_DEFAULT = WmsaHome.getHomePath().resolve("model");

    /**
     * Resolves the directory holding the language models.
     *
     * @return the model directory
     * @throws IllegalStateException if the resolved path is not an existing directory
     */
    public static Path getLanguageModelsPath() {
        final String configuredHome = System.getenv("LANGUAGE_MODELS_HOME");

        final Path modelsDir;
        if (configuredHome == null) {
            modelsDir = LANGUAGE_MODELS_DEFAULT;
        }
        else {
            modelsDir = Path.of(configuredHome);
        }

        if (Files.isDirectory(modelsDir)) {
            return modelsDir;
        }

        throw new IllegalStateException("Could not find $LANGUAGE_MODELS_HOME, see doc/language-models.md");
    }

    /**
     * Builds a {@link LanguageModels} whose paths all point into the
     * directory returned by {@link #getLanguageModelsPath()}.
     *
     * @throws IllegalStateException if the model directory can not be found
     */
    public static LanguageModels getLanguageModels() {
        final Path base = getLanguageModelsPath();

        final Path tfreqNewAlgo3 = base.resolve("tfreq-new-algo3.bin");
        final Path opennlpSentence = base.resolve("opennlp-sentence.bin");
        final Path englishRdr = base.resolve("English.RDR");
        final Path englishDict = base.resolve("English.DICT");
        final Path lid176 = base.resolve("lid.176.ftz");
        final Path segments = base.resolve("segments.bin");

        return new LanguageModels(
                tfreqNewAlgo3,
                opennlpSentence,
                englishRdr,
                englishDict,
                lid176,
                segments
        );
    }
}

View File

@ -17,8 +17,6 @@ import java.util.List;
public class SearchService extends JoobyService {
private final WebsiteUrl websiteUrl;
private final StaticResources staticResources;
private final SearchSiteSubscriptionService siteSubscriptionService;
private static final Logger logger = LoggerFactory.getLogger(SearchService.class);
@ -48,7 +46,7 @@ public class SearchService extends JoobyService {
throws Exception {
super(params,
ServicePartition.any(),
List.of(),
List.of(), // No GRPC services
List.of(new SearchFrontPageService_(frontPageService),
new SearchQueryService_(searchQueryService),
new SearchSiteInfoService_(siteInfoService),
@ -57,9 +55,6 @@ public class SearchService extends JoobyService {
new SearchBrowseService_(searchBrowseService)
));
this.websiteUrl = websiteUrl;
this.staticResources = staticResources;
this.siteSubscriptionService = siteSubscriptionService;
}
@ -69,82 +64,6 @@ public class SearchService extends JoobyService {
jooby.get("/export-opml", siteSubscriptionService::exportOpml);
}
//
// SearchServiceMetrics.get("/search", searchQueryService::pathSearch);
// SearchServiceMetrics.get("/", frontPageService::render);
// SearchServiceMetrics.get("/news.xml", frontPageService::renderNewsFeed);
//
// SearchServiceMetrics.post("/site/suggest/", addToCrawlQueueService::suggestCrawling);
//
// SearchServiceMetrics.get("/site-search/:site/*", this::siteSearchRedir);
//
// SearchServiceMetrics.get("/site", siteInfoService::handleOverview);
// SearchServiceMetrics.get("/site/:site", siteInfoService::handle);
// SearchServiceMetrics.post("/site/:site", siteInfoService::handlePost);
//
// SearchServiceMetrics.get("/explore", searchBrowseService::handleBrowseRandom);
// SearchServiceMetrics.get("/explore/:site", searchBrowseService::handleBrowseSite);
//
// SearchServiceMetrics.get("/crosstalk/", crosstalkService::handle);
//
// SearchServiceMetrics.get("/:resource", this::serveStatic);
// Spark.exception(Exception.class, (e,p,q) -> {
// logger.error("Error during processing", e);
// wmsa_search_service_error_count.labels(p.pathInfo(), p.requestMethod()).inc();
// errorPageService.serveError(p, q);
// });
//
// // Add compression
// Spark.after((rq, rs) -> {
// rs.header("Content-Encoding", "gzip");
// });
//
// Spark.awaitInitialization();
//
//
// /** Wraps a route with a timer and a counter */
// private static class SearchServiceMetrics implements Route {
// private final Route delegatedRoute;
//
// static void get(String path, Route route) {
// Spark.get(path, new SearchServiceMetrics(route));
// }
// static void post(String path, Route route) {
// Spark.post(path, new SearchServiceMetrics(route));
// }
//
// private SearchServiceMetrics(Route delegatedRoute) {
// this.delegatedRoute = delegatedRoute;
// }
//
// @Override
// public Object handle(Request request, Response response) throws Exception {
// return wmsa_search_service_request_time
// .labels(request.matchedPath(), request.requestMethod())
// .time(() -> delegatedRoute.handle(request, response));
// }
// }
//
// private Object serveStatic(Request request, Response response) {
// String resource = request.params("resource");
// staticResources.serveStatic("search", resource, request, response);
// return "";
// }
//
// private Object siteSearchRedir(Request request, Response response) {
// final String site = request.params("site");
// final String searchTerms;
//
// if (request.splat().length == 0) searchTerms = "";
// else searchTerms = request.splat()[0];
//
// final String query = URLEncoder.encode(String.format("%s site:%s", searchTerms, site), StandardCharsets.UTF_8).trim();
// final String profile = request.queryParamOrDefault("profile", "yolo");
//
// response.redirect(websiteUrl.withPath("search?query="+query+"&profile="+profile));
//
// return "";
// }
}

View File

@ -8,6 +8,7 @@ include 'code:services-core:executor-service'
include 'code:services-core:single-service-runner'
include 'code:services-application:search-service'
include 'code:services-application:search-service-legacy'
include 'code:services-application:api-service'
include 'code:services-application:dating-service'
include 'code:services-application:explorer-service'