Get suggestions working again

This commit is contained in:
Viktor Lofgren 2023-03-22 15:11:22 +01:00
parent 7c58ddce81
commit 964014860a
18 changed files with 29 additions and 247 deletions

View File

@ -14,6 +14,7 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.Duration;
import java.util.Optional;
import java.util.Properties;
@ -90,6 +91,9 @@ public class DatabaseModule extends AbstractModule {
config.setMaximumPoolSize(100);
config.setMinimumIdle(10);
config.setMaxLifetime(Duration.ofMinutes(9).toMillis());
return new HikariDataSource(config);
}
finally {

View File

@ -1,13 +0,0 @@
package nu.marginalia.service.module;
import com.google.inject.name.Named;
import javax.inject.Inject;
/**
 * Publishes the injected service name as a JVM system property.
 *
 * <p>Constructing an instance sets the {@code service-name} system property to the
 * string bound under the {@code service-name} name.
 */
public class LoggerConfiguration {

    /** Key used both for the named injection binding and for the system property. */
    private static final String SERVICE_NAME_KEY = "service-name";

    /**
     * @param serviceName value injected for the {@code service-name} binding
     */
    @Inject
    public LoggerConfiguration(@Named(SERVICE_NAME_KEY) final String serviceName) {
        System.setProperty(SERVICE_NAME_KEY, serviceName);
    }
}

View File

@ -1,21 +0,0 @@
package nu.marginalia.service.module;
import com.google.inject.name.Named;
import javax.inject.Inject;
import javax.inject.Provider;
/**
 * Supplies a port number derived from the injected {@code service-port} value.
 *
 * <p>The value handed out by {@link #get()} is the service port plus 1000.
 */
public class MetricsPortProvider implements Provider<Integer> {

    /** Distance between the service port and the port returned by {@link #get()}. */
    private static final int PORT_OFFSET = 1000;

    private final Integer servicePort;

    /**
     * @param servicePort value injected for the {@code service-port} binding
     */
    @Inject
    public MetricsPortProvider(@Named("service-port") final Integer servicePort) {
        this.servicePort = servicePort;
    }

    /**
     * @return the service port shifted upwards by 1000
     */
    @Override
    public Integer get() {
        final int shiftedPort = servicePort + PORT_OFFSET;
        return shiftedPort;
    }
}

View File

@ -10,7 +10,7 @@ import static com.google.inject.name.Names.named;
public class AssistantModule extends AbstractModule {
public void configure() {
bind(Path.class).annotatedWith(named("suggestions-file")).toInstance(WmsaHome.getHomePath().resolve("suggestions.txt"));
bind(Path.class).annotatedWith(named("suggestions-file")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions.txt"));
bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
}

View File

@ -137,6 +137,10 @@ public class Suggestions {
public Stream<String> getSuggestionsForKeyword(int count, String prefix) {
var start = suggestionsTrie.select(prefix);
if (start == null) {
return Stream.empty();
}
if (!start.getKey().startsWith(prefix)) {
return Stream.empty();
}

View File

@ -1,23 +0,0 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Marginalia Search - About</title>
<link rel="stylesheet" href="https://www.marginalia.nu/style.css" />
<link rel="stylesheet" href="https://search.marginalia.nu/style.css" />
<meta name="viewport" content="width=device-width, initial-scale=1.0">
</head>
<body>
<header>
<nav>
<a href="/">Search</a>
<a href="/about.html">About</a>
<a class="marginalia-link" href="https://www.marginalia.nu/">Marginalia(🇸🇪)</a>
</nav>
</header>
<article>
<p>
This page has been moved to <a href="https://memex.marginalia.nu/projects/edge/about.gmi">the memex</a>.
</p>
</article>
</body>

View File

@ -1,23 +0,0 @@
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/html">
<head>
<meta charset="UTF-8">
<title>Marginalia Search - Change Log</title>
<link rel="stylesheet" href="https://www.marginalia.nu/style.css" />
<link rel="stylesheet" href="https://search.marginalia.nu/style.css" />
<meta name="viewport" content="width=device-width, initial-scale=1.0">
</head>
<body>
<header>
<nav>
<a href="/">Search</a>
<a href="/about.html">About</a>
<a class="marginalia-link" href="https://www.marginalia.nu/">Marginalia(🇸🇪)</a>
</nav>
</header>
<article>
<p>
This page has been moved to <a href="https://memex.marginalia.nu/projects/edge/changelog.gmi">the memex</a>.
</p>
</article>
</body>

View File

@ -1,23 +0,0 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Error</title>
<link rel="stylesheet" href="https://www.marginalia.nu/style.css" />
<link rel="stylesheet" href="https://search.marginalia.nu/style.css" />
<meta name="viewport" content="width=device-width, initial-scale=1.0">
</head>
<body>
<header>
<nav>
<a href="/">Search</a>
<a href="/about.html">About</a>
</nav>
</header>
<article>
<h1>An error has occurred!</h1>
<p>
Something went wrong while processing your query. Please try again later.
</p>
</article>
</body>

View File

@ -1,29 +0,0 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Marginalia Search - Known Issues</title>
<link rel="stylesheet" href="https://www.marginalia.nu/style.css" />
<meta name="viewport" content="width=device-width, initial-scale=1.0">
</head>
<body>
<header>
<nav>
<a href="/">Search</a>
<a href="/about.html">About</a>
<a class="marginalia-link" href="https://www.marginalia.nu/">Marginalia(🇸🇪)</a>
</nav>
</header>
<article>
<h1>Known Issues</h1>
<ul>
<li>Non-Latin text becomes horribly garbled in the summary and title description.</li>
</ul>
<h2>Mitigated Issues</h2>
<ul>
<li><s>Non-latin characters are stripped from search results (<q>Ålö AB</q> becomes <q>l AB</q>)</s></li>
<li><s>The page doesn't look good on mobile</s></li>
<li><s>Still a few link farms getting good results</s></li>
</ul>
</article>
</body>

View File

@ -1,10 +0,0 @@
<html><head>
<title>Marginalia Search - Maintenance Notification</title>
</head>
<body>
<h1>
Down For Maintenance!
</h1>
<p>The search engine is currently down for maintenance.</p>
<a href="https://search.marginalia.nu/">To The Start Page</a>
</body></html>

View File

@ -1,25 +0,0 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Marginalia Search - Notes on Designing a Search Engine</title>
<link rel="preload" as="font" href="https://www.marginalia.nu/fonts/Inconsolata-Medium.ttf" />
<link rel="preload" as="font" href="https://www.marginalia.nu/fonts/Roboto-Medium.ttf" />
<link rel="stylesheet" href="https://www.marginalia.nu/style.css" />
<link rel="stylesheet" href="https://search.marginalia.nu/style.css" />
<meta name="viewport" content="width=device-width, initial-scale=1.0">
</head>
<body>
<header>
<nav>
<a href="/">Search</a>
<a href="/about.html">About</a>
<a class="marginalia-link" href="https://www.marginalia.nu/">Marginalia(🇸🇪)</a>
</nav>
</header>
<article>
<p>
This page has been moved to <a href="https://memex.marginalia.nu/projects/edge/design-notes.gmi">the memex</a>.
</p>
</article>
</body>

View File

@ -72,7 +72,7 @@ if(!window.matchMedia("(pointer: coarse)").matches) {
document.getElementsByClassName('input')[0].appendChild(suggestions);
}
req.open("GET", "https://api.marginalia.nu/suggest/?partial="+encodeURIComponent(query.value));
req.open("GET", "/suggest/?partial="+encodeURIComponent(query.value));
req.send();
}, 250);
}

View File

@ -1,76 +0,0 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Marginalia Search - About: Easy Read Wikipedia</title>
<link rel="stylesheet" href="https://www.marginalia.nu/style.css" />
<link rel="stylesheet" href="https://search.marginalia.nu/style.css" />
<meta name="viewport" content="width=device-width, initial-scale=1.0">
</head>
<body>
<header>
<nav>
<a href="/">Search</a>
<a href="/about.html">About</a>
</nav>
</header>
<article>
<h1>About: High Readability Wikipedia</h1>
<section>
<p>
This is a wikipedia client that strips away most links and almost all visual clutter
to provide a more book-like reading experience with fewer distractions.
</p>
<p>
This is primarily a helpful utility for a search engine focusing on similarly text-oriented
websites.
</p>
<p>
You are welcome to use it for general article reading as well. This may be useful
if you are on a low bandwidth connection, since the download size is typically reduced
from megabytes to dozens of kilobytes.
</p>
<p>
What's taken away is all the design elements that your brain would have to filter out
to read the text of the article. It seems as though overburdening this mental process
causes the reader to start scanning the text instead of reading it, which is experienced
as an inability to pay focus.
</p>
<p>
The cleaning process is not perfect and will occasionally produce strange results,
but significant problems should be relatively rare.
</p>
<a href="https://search.marginalia.nu/about.html">About the Search Engine</a>
<h2>Limitations</h2>
<p>This is a "stale" copy of wikipedia, based on an archived copy from January 2021. On the
other hand, we used to abide printed encyclopedias that didn't update at all. </p>
<p>
Be aware that the cleaning strips away a lot of information, including most references,
footnotes, quality warnings, and so forth. Refer to the original wikipedia article for
that information.
</p>
</section>
<h1>Legal</h1>
<section>
The Wikipedia text is available under the Creative Commons Attribution-ShareAlike 3.0 license,
and so is the wikipedia text forwarded to you through this service.
</section>
<section>
<h2>Further reading</h2>
<dt>Blom et al. 2017 - Comprehension and navigation of networked hypertexts</dt>
<dd><a class="teknisk" href="https://onlinelibrary.wiley.com/doi/pdf/10.1111/jcal.12243">https://onlinelibrary.wiley.com/doi/pdf/10.1111/jcal.12243</a></dd>
</section>
<h1>Have something to say?</h1>
<section>
<p>Send me an e-mail at <a href="mailto:kontakt@marginalia.nu"
class="teknisk">kontakt@marginalia.nu</a>.
</p>
<p>
Don't hesitate to let me know if the website is somehow being a nuisance,
it should respect robots.txt and reduce outgoing requests, but the format
isn't super-standardized, so occasionally it doesn't understand every directive.
</p>
</section>
</article>
</body>

View File

@ -8,6 +8,7 @@
<link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="Marginalia">
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<meta name="robots" content="noindex" />
</head>
<body>

View File

@ -103,4 +103,5 @@
<tt><a href="https://git.marginalia.nu/">https://git.marginalia.nu/</a></tt>.
</section>
</footer>
</footer>
<script src="tts.js" rel="javascript"></script>

View File

@ -15,6 +15,8 @@ import nu.marginalia.service.server.RateLimiter;
import nu.marginalia.service.server.Service;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.slf4j.Marker;
import org.slf4j.MarkerFactory;
import spark.Request;
import spark.Response;
import spark.Spark;
@ -30,6 +32,9 @@ public class ApiService extends Service {
private final ConcurrentHashMap<String, ApiLicense> licenseCache = new ConcurrentHashMap<>();
private final ConcurrentHashMap<ApiLicense, RateLimiter> rateLimiters = new ConcurrentHashMap<>();
// Marker for filtering out sensitive content from the persistent logs
private final Marker queryMarker = MarkerFactory.getMarker("QUERY");
@Inject
public ApiService(@Named("service-host") String ip,
@Named("service-port") Integer port,
@ -78,7 +83,7 @@ public class ApiService extends Service {
int count = Integer.parseInt(request.queryParamOrDefault("count", "20"));
int index = Integer.parseInt(request.queryParamOrDefault("index", "3"));
logger.info("{} Search {}", license.key, args[0]);
logger.info(queryMarker, "{} Search {}", license.key, args[0]);
return searchClient.query(Context.fromRequest(request), args[0], count, index)
.blockingFirst().withLicense(license.getLicense());

View File

@ -29,6 +29,10 @@ server {
location /site/ {
rewrite ^/site/(.*)$ /search?query=site:$1&profile=yolo;
}
# Requests under /suggest/ are proxied to the assistant service instead of the search service.
location /suggest/ {
# $request_uri is the original request URI (including the query string); it is appended after /public
proxy_pass http://assistant-service:5025/public$request_uri;
# No access log entries are written for this location
access_log off;
}
location / {
proxy_pass http://search-service:5023/public/;
}

View File

@ -26,6 +26,12 @@ download_model model/tfreq-new-algo3.bin https://downloads.marginalia.nu/model/t
download_model data/IP2LOCATION-LITE-DB1.CSV.ZIP https://download.ip2location.com/lite/IP2LOCATION-LITE-DB1.CSV.ZIP
unzip -qn -d data data/IP2LOCATION-LITE-DB1.CSV.ZIP
download_model data/adblock.txt https://downloads.marginalia.nu/data/adblock.txt
# Fetch and unpack the suggestions data only when the unpacked file is not already present
if [ ! -f data/suggestions.txt ]; then
download_model data/suggestions.txt.gz https://downloads.marginalia.nu/data/suggestions.txt.gz
# gunzip replaces the .gz archive with data/suggestions.txt
gunzip data/suggestions.txt.gz
fi
if [ ! -d conf ]; then
cp -r template/conf .
fi