From 964014860ab0d5dbb77d45cf7bfed119bd2d008d Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 22 Mar 2023 15:11:22 +0100 Subject: [PATCH] Get suggestions working again --- .../service/module/DatabaseModule.java | 4 + .../service/module/LoggerConfiguration.java | 13 ---- .../service/module/MetricsPortProvider.java | 21 ----- .../marginalia/assistant/AssistantModule.java | 2 +- .../assistant/suggest/Suggestions.java | 4 + .../main/resources/static/search/about.html | 23 ------ .../resources/static/search/changelog.html | 23 ------ .../main/resources/static/search/error.html | 23 ------ .../resources/static/search/known-issues.html | 29 ------- .../resources/static/search/maintenance.html | 10 --- .../main/resources/static/search/notes.html | 25 ------ .../src/main/resources/static/search/tts.js | 2 +- .../resources/static/search/wiki-clean.html | 76 ------------------- .../templates/search/browse-results.hdb | 1 + .../templates/search/parts/search-footer.hdb | 3 +- .../java/nu/marginalia/api/ApiService.java | 7 +- run/nginx-site.conf | 4 + run/setup.sh | 6 ++ 18 files changed, 29 insertions(+), 247 deletions(-) delete mode 100644 code/common/service/src/main/java/nu/marginalia/service/module/LoggerConfiguration.java delete mode 100644 code/common/service/src/main/java/nu/marginalia/service/module/MetricsPortProvider.java delete mode 100644 code/services-core/search-service/src/main/resources/static/search/about.html delete mode 100644 code/services-core/search-service/src/main/resources/static/search/changelog.html delete mode 100644 code/services-core/search-service/src/main/resources/static/search/error.html delete mode 100644 code/services-core/search-service/src/main/resources/static/search/known-issues.html delete mode 100644 code/services-core/search-service/src/main/resources/static/search/maintenance.html delete mode 100644 code/services-core/search-service/src/main/resources/static/search/notes.html delete mode 100644 
code/services-core/search-service/src/main/resources/static/search/wiki-clean.html diff --git a/code/common/service/src/main/java/nu/marginalia/service/module/DatabaseModule.java b/code/common/service/src/main/java/nu/marginalia/service/module/DatabaseModule.java index d189a708..ce88599a 100644 --- a/code/common/service/src/main/java/nu/marginalia/service/module/DatabaseModule.java +++ b/code/common/service/src/main/java/nu/marginalia/service/module/DatabaseModule.java @@ -14,6 +14,7 @@ import java.io.FileInputStream; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.time.Duration; import java.util.Optional; import java.util.Properties; @@ -90,6 +91,9 @@ public class DatabaseModule extends AbstractModule { config.setMaximumPoolSize(100); config.setMinimumIdle(10); + + config.setMaxLifetime(Duration.ofMinutes(9).toMillis()); + return new HikariDataSource(config); } finally { diff --git a/code/common/service/src/main/java/nu/marginalia/service/module/LoggerConfiguration.java b/code/common/service/src/main/java/nu/marginalia/service/module/LoggerConfiguration.java deleted file mode 100644 index 8f651da4..00000000 --- a/code/common/service/src/main/java/nu/marginalia/service/module/LoggerConfiguration.java +++ /dev/null @@ -1,13 +0,0 @@ -package nu.marginalia.service.module; - -import com.google.inject.name.Named; - -import javax.inject.Inject; - -public class LoggerConfiguration { - @Inject - public LoggerConfiguration(@Named("service-name") String serviceName) { - System.setProperty("service-name", serviceName); - } - -} diff --git a/code/common/service/src/main/java/nu/marginalia/service/module/MetricsPortProvider.java b/code/common/service/src/main/java/nu/marginalia/service/module/MetricsPortProvider.java deleted file mode 100644 index 1f75fa9f..00000000 --- a/code/common/service/src/main/java/nu/marginalia/service/module/MetricsPortProvider.java +++ /dev/null @@ -1,21 +0,0 @@ -package nu.marginalia.service.module; - 
-import com.google.inject.name.Named; - -import javax.inject.Inject; -import javax.inject.Provider; - -public class MetricsPortProvider implements Provider { - private final Integer servicePort; - - @Inject - public MetricsPortProvider(@Named("service-port") Integer servicePort) { - this.servicePort = servicePort; - } - - @Override - public Integer get() { - return servicePort+1000; - } - -} diff --git a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantModule.java b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantModule.java index 0670b103..1f540fc4 100644 --- a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantModule.java +++ b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/AssistantModule.java @@ -10,7 +10,7 @@ import static com.google.inject.name.Names.named; public class AssistantModule extends AbstractModule { public void configure() { - bind(Path.class).annotatedWith(named("suggestions-file")).toInstance(WmsaHome.getHomePath().resolve("suggestions.txt")); + bind(Path.class).annotatedWith(named("suggestions-file")).toInstance(WmsaHome.getHomePath().resolve("data/suggestions.txt")); bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels()); } diff --git a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/suggest/Suggestions.java b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/suggest/Suggestions.java index 2fd62091..0ea63f08 100644 --- a/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/suggest/Suggestions.java +++ b/code/services-core/assistant-service/src/main/java/nu/marginalia/assistant/suggest/Suggestions.java @@ -137,6 +137,10 @@ public class Suggestions { public Stream getSuggestionsForKeyword(int count, String prefix) { var start = suggestionsTrie.select(prefix); + if (start == null) { + return Stream.empty(); + } + if 
(!start.getKey().startsWith(prefix)) { return Stream.empty(); } diff --git a/code/services-core/search-service/src/main/resources/static/search/about.html b/code/services-core/search-service/src/main/resources/static/search/about.html deleted file mode 100644 index dcc0b95e..00000000 --- a/code/services-core/search-service/src/main/resources/static/search/about.html +++ /dev/null @@ -1,23 +0,0 @@ - - - - - Marginalia Search - About - - - - - -
- -
-
-

- This page has been moved to the memex. -

-
- \ No newline at end of file diff --git a/code/services-core/search-service/src/main/resources/static/search/changelog.html b/code/services-core/search-service/src/main/resources/static/search/changelog.html deleted file mode 100644 index 37d3d6b7..00000000 --- a/code/services-core/search-service/src/main/resources/static/search/changelog.html +++ /dev/null @@ -1,23 +0,0 @@ - - - - - Marginalia Search - Change Log - - - - - -
- -
-
-

- This page has been moved to the memex. -

-
- \ No newline at end of file diff --git a/code/services-core/search-service/src/main/resources/static/search/error.html b/code/services-core/search-service/src/main/resources/static/search/error.html deleted file mode 100644 index 7192bf07..00000000 --- a/code/services-core/search-service/src/main/resources/static/search/error.html +++ /dev/null @@ -1,23 +0,0 @@ - - - - - Error - - - - - -
- -
-
-

An error has occurred!

-

- Something went wrong while processing your query. Please try again later. -

-
- \ No newline at end of file diff --git a/code/services-core/search-service/src/main/resources/static/search/known-issues.html b/code/services-core/search-service/src/main/resources/static/search/known-issues.html deleted file mode 100644 index e14588bc..00000000 --- a/code/services-core/search-service/src/main/resources/static/search/known-issues.html +++ /dev/null @@ -1,29 +0,0 @@ - - - - - Marginalia Search - Known Issues - - - - -
- -
-
-

Known Issues

-
    -
  • Non-Latin text becomes horribly garbled in the summary and title description.
  • -
-

Mitigated Issues

-
    -
  • Non-latin characters are stripped from search results (Ålö AB becomes l AB)
  • -
  • The page doesn't look good on mobile
  • -
  • Still a few link farms getting good results
  • -
-
- \ No newline at end of file diff --git a/code/services-core/search-service/src/main/resources/static/search/maintenance.html b/code/services-core/search-service/src/main/resources/static/search/maintenance.html deleted file mode 100644 index c8fdc227..00000000 --- a/code/services-core/search-service/src/main/resources/static/search/maintenance.html +++ /dev/null @@ -1,10 +0,0 @@ - - Marginalia Search - Maintenance Notification - - -

- Down For Maintenance! -

-

The search engine is currently down for maintenance.

- To The Start Page - \ No newline at end of file diff --git a/code/services-core/search-service/src/main/resources/static/search/notes.html b/code/services-core/search-service/src/main/resources/static/search/notes.html deleted file mode 100644 index c1c5f600..00000000 --- a/code/services-core/search-service/src/main/resources/static/search/notes.html +++ /dev/null @@ -1,25 +0,0 @@ - - - - - Marginalia Search - Notes on Designing a Search Engine - - - - - - - -
- -
-
-

- This page has been moved to the memex. -

-
- \ No newline at end of file diff --git a/code/services-core/search-service/src/main/resources/static/search/tts.js b/code/services-core/search-service/src/main/resources/static/search/tts.js index 586ae10e..2d07a38c 100644 --- a/code/services-core/search-service/src/main/resources/static/search/tts.js +++ b/code/services-core/search-service/src/main/resources/static/search/tts.js @@ -72,7 +72,7 @@ if(!window.matchMedia("(pointer: coarse)").matches) { document.getElementsByClassName('input')[0].appendChild(suggestions); } - req.open("GET", "https://api.marginalia.nu/suggest/?partial="+encodeURIComponent(query.value)); + req.open("GET", "/suggest/?partial="+encodeURIComponent(query.value)); req.send(); }, 250); } diff --git a/code/services-core/search-service/src/main/resources/static/search/wiki-clean.html b/code/services-core/search-service/src/main/resources/static/search/wiki-clean.html deleted file mode 100644 index a90b70b4..00000000 --- a/code/services-core/search-service/src/main/resources/static/search/wiki-clean.html +++ /dev/null @@ -1,76 +0,0 @@ - - - - - Marginalia Search - About: Easy Read Wikipedia - - - - - -
- -
-
-

About: High Readability Wikipedia

-
-

- This is a wikipedia client that strips away most links and almost all visual clutter - to provide a more book-like reading experience with fewer distractions. -

-

- This is primarily a helpful utility for a search engine focusing on similarly text-oriented - websites. -

-

- You are welcome to use it for general article reading as well. This may be useful - if you are on a low bandwidth connection, since the download size is typically reduced - from megabytes to dozens of kilobytes. -

-

- What's taken away is all the design elements that your brain would have to filter out - to read the text of the article. It seems as though overburdening this mental process - causes the reader to start scanning the text instead of reading it, which is experienced - as an inability to focus. -

-

- The cleaning process is not perfect and will occasionally produce strange results, - but significant problems should be relatively rare. -

- About the Search Engine - -

Limitations

-

This is a "stale" copy of wikipedia, based on an archived copy from January 2021. On the - other hand, we used to abide printed encyclopedias that didn't update at all.

-

- Be aware that the cleaning strips away a lot of information, including most references, - footnotes, quality warnings, and so forth. Refer to the original wikipedia article for - that information. -

-
-

Legal

-
- The Wikipedia text is available under the Creative Commons Attribution-ShareAlike 3.0 license, - and so is the wikipedia text forwarded to you through this service. -
-
-

Further reading

-
Blom et al. 2017 - Comprehension and navigation of networked hypertexts
-
https://onlinelibrary.wiley.com/doi/pdf/10.1111/jcal.12243
-
-

Have something to say?

-
-

Send me an e-mail at kontakt@marginalia.nu. -

-

- Don't hesitate to let me know if the website is somehow being a nuisance, - it should respect robots.txt and reduce outgoing requests, but the format - isn't super-standardized, so occasionally it doesn't understand every directive. -

-
-
- \ No newline at end of file diff --git a/code/services-core/search-service/src/main/resources/templates/search/browse-results.hdb b/code/services-core/search-service/src/main/resources/templates/search/browse-results.hdb index 82df7343..147fcebf 100644 --- a/code/services-core/search-service/src/main/resources/templates/search/browse-results.hdb +++ b/code/services-core/search-service/src/main/resources/templates/search/browse-results.hdb @@ -8,6 +8,7 @@ + diff --git a/code/services-core/search-service/src/main/resources/templates/search/parts/search-footer.hdb b/code/services-core/search-service/src/main/resources/templates/search/parts/search-footer.hdb index 619f0c88..ae6bdfe4 100644 --- a/code/services-core/search-service/src/main/resources/templates/search/parts/search-footer.hdb +++ b/code/services-core/search-service/src/main/resources/templates/search/parts/search-footer.hdb @@ -103,4 +103,5 @@ https://git.marginalia.nu/. - \ No newline at end of file + + \ No newline at end of file diff --git a/code/services-satellite/api-service/src/main/java/nu/marginalia/api/ApiService.java b/code/services-satellite/api-service/src/main/java/nu/marginalia/api/ApiService.java index e69c64a3..47453ce4 100644 --- a/code/services-satellite/api-service/src/main/java/nu/marginalia/api/ApiService.java +++ b/code/services-satellite/api-service/src/main/java/nu/marginalia/api/ApiService.java @@ -15,6 +15,8 @@ import nu.marginalia.service.server.RateLimiter; import nu.marginalia.service.server.Service; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.slf4j.Marker; +import org.slf4j.MarkerFactory; import spark.Request; import spark.Response; import spark.Spark; @@ -30,6 +32,9 @@ public class ApiService extends Service { private final ConcurrentHashMap licenseCache = new ConcurrentHashMap<>(); private final ConcurrentHashMap rateLimiters = new ConcurrentHashMap<>(); + // Marker for filtering out sensitive content from the persistent logs + private final Marker 
queryMarker = MarkerFactory.getMarker("QUERY"); + @Inject public ApiService(@Named("service-host") String ip, @Named("service-port") Integer port, @@ -78,7 +83,7 @@ public class ApiService extends Service { int count = Integer.parseInt(request.queryParamOrDefault("count", "20")); int index = Integer.parseInt(request.queryParamOrDefault("index", "3")); - logger.info("{} Search {}", license.key, args[0]); + logger.info(queryMarker, "{} Search {}", license.key, args[0]); return searchClient.query(Context.fromRequest(request), args[0], count, index) .blockingFirst().withLicense(license.getLicense()); diff --git a/run/nginx-site.conf b/run/nginx-site.conf index 45535ac0..327287b1 100644 --- a/run/nginx-site.conf +++ b/run/nginx-site.conf @@ -29,6 +29,10 @@ server { location /site/ { rewrite ^/site/(.*)$ /search?query=site:$1&profile=yolo; } + location /suggest/ { + proxy_pass http://assistant-service:5025/public$request_uri; + access_log off; + } location / { proxy_pass http://search-service:5023/public/; } diff --git a/run/setup.sh b/run/setup.sh index 38c5426b..d414892d 100755 --- a/run/setup.sh +++ b/run/setup.sh @@ -26,6 +26,12 @@ download_model model/tfreq-new-algo3.bin https://downloads.marginalia.nu/model/t download_model data/IP2LOCATION-LITE-DB1.CSV.ZIP https://download.ip2location.com/lite/IP2LOCATION-LITE-DB1.CSV.ZIP unzip -qn -d data data/IP2LOCATION-LITE-DB1.CSV.ZIP +download_model data/adblock.txt https://downloads.marginalia.nu/data/adblock.txt +if [ ! -f data/suggestions.txt ]; then + download_model data/suggestions.txt.gz https://downloads.marginalia.nu/data/suggestions.txt.gz + gunzip data/suggestions.txt.gz +fi + if [ ! -d conf ]; then cp -r template/conf . fi