From 5c2f2d558f6d0130dde971471d35d0cd7b5d6331 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 31 May 2022 14:38:13 +0200 Subject: [PATCH] Update index.html for search engine (#25) Co-authored-by: vlofgren Reviewed-on: https://git.marginalia.nu/marginalia/marginalia.nu/pulls/25 --- README.md | 18 ++++---- .../configuration/server/RateLimiter.java | 2 +- .../wmsa/edge/index/EdgeIndexService.java | 42 ++++--------------- .../src/main/resources/static/edge/index.html | 17 ++++---- 4 files changed, 30 insertions(+), 49 deletions(-) diff --git a/README.md b/README.md index 2fa76c4c..cfe88bc9 100644 --- a/README.md +++ b/README.md @@ -3,14 +3,18 @@ This is the source code for marginalia.nu, including the [search engine](https://search.marginalia.nu), the [MEMEX/gemini server](https://memex.marginalia.nu), the and the [encyclopedia service](https://encyclopedia.marginalia.nu). -The canonical git server for this project is [https://git.marginalia.nu](https://git.marginalia.nu), -it is fine to mirror it on other hosts, but if you have issues or questions -that is where you want to go. +The aim of the project is to develop new and alternative discovery methods for the Internet. +It's an experimental workshop as much as it is a public service, the overarching goal is to +elevate the more human, non-commercial sides of the Internet. -As it stands now, the project is a bit of a mess as it wasn't developed -with the intention of going open source, a lot of tests and so on make -assumptions about the directory structure, much configuration is hard coded -and so on. Please stand by. A lot of the mess is fairly superficial. +The canonical git server for this project is [https://git.marginalia.nu](https://git.marginalia.nu). +It is fine to mirror it on other hosts, but if you have issues or questions +git.marginalia.nu is where you want to go. + +As it stands now, the project is still being set up and is a bit of a mess as +it wasn't developed with the intention of going open source, a lot of tests +and so on make assumptions about the directory structure, much configuration +is hard coded and so on. Please stand by. A lot of the mess is fairly superficial. ## Contributing diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/server/RateLimiter.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/server/RateLimiter.java index 4dc4c8da..06a6131a 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/server/RateLimiter.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/server/RateLimiter.java @@ -66,6 +66,6 @@ public class RateLimiter { private Bucket createBucket() { var refill = Refill.greedy(1, Duration.ofSeconds(refillRate)); var bw = Bandwidth.classic(capacity, refill); - return Bucket4j.builder().addLimit(bw).build(); + return Bucket.builder().addLimit(bw).build(); } } diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java index 81d57139..a04a4c83 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java +++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/index/EdgeIndexService.java @@ -331,8 +331,6 @@ public class EdgeIndexService extends Service { final Map> results = new HashMap<>(); final DomainResultCountFilter localFilter = new DomainResultCountFilter(specs.limitByDomain); - boolean debug = sq.searchTermsExclude.contains("special:debug"); - for (int i : specBuckets) { int foundResultsCount = results.values().stream().mapToInt(List::size).sum(); @@ -341,37 +339,15 @@ public class EdgeIndexService extends Service { List resultsForBucket = new ArrayList<>(specs.limitByBucket); - if (debug) { - getQuery(i, budget, sq.block, lv -> localFilter.filterRawValue(i, lv), searchTerms) - .peek(l -> logger.info("Considering {}", Long.toHexString(l))) - .mapToObj(id -> new EdgeSearchResultItem(i, sq.termSize(), id)) - .filter(ri -> { - if (seenResults.contains(ri.url.getId())) { - logger.info("Seen before: {}", Integer.toHexString(ri.url.getId())); - return false; - } - else if (!localFilter.test(i, domainCountFilter, ri)) { - logger.info("DCF: {} - {}:{}", ri.blockId, Integer.toHexString(ri.domain.getId()), Integer.toHexString(ri.url.getId())); - return false; - } - return true; - }) - .limit(specs.limitTotal * 3L) - .distinct() - .limit(Math.min(specs.limitByBucket - - results.values().stream().mapToInt(Collection::size).sum(), limit - foundResultsCount)) - .forEach(resultsForBucket::add); - } - else { - getQuery(i, budget, sq.block, lv -> localFilter.filterRawValue(i, lv), searchTerms) - .mapToObj(id -> new EdgeSearchResultItem(i, sq.termSize(), id)) - .filter(ri -> !seenResults.contains(ri.url.getId()) && localFilter.test(i, domainCountFilter, ri)) - .limit(specs.limitTotal * 3L) - .distinct() - .limit(Math.min(specs.limitByBucket - - results.values().stream().mapToInt(Collection::size).sum(), limit - foundResultsCount)) - .forEach(resultsForBucket::add); - } + getQuery(i, budget, sq.block, lv -> localFilter.filterRawValue(i, lv), searchTerms) + .mapToObj(id -> new EdgeSearchResultItem(i, sq.termSize(), id)) + .filter(ri -> !seenResults.contains(ri.url.getId()) && localFilter.test(i, domainCountFilter, ri)) + .limit(specs.limitTotal * 3L) + .distinct() + .limit(Math.min(specs.limitByBucket + - results.values().stream().mapToInt(Collection::size).sum(), limit - foundResultsCount)) + .forEach(resultsForBucket::add); + for (var result : resultsForBucket) { seenResults.add(result.url.getId()); diff --git a/marginalia_nu/src/main/resources/static/edge/index.html b/marginalia_nu/src/main/resources/static/edge/index.html index 13044a6c..166e67b8 100644 --- a/marginalia_nu/src/main/resources/static/edge/index.html +++ b/marginalia_nu/src/main/resources/static/edge/index.html @@ -61,8 +61,12 @@ existed.

The software for this search engine is all custom-built, and all crawling and indexing is - done in-house. + done in-house. The project is open source. Feel free to poke about in the source code or contribute + to the development!

+

Consider supporting the + project!

Read More @@ -98,11 +102,6 @@

Updates

-

☛ The web design of the search engine has been completely overhauled. For the most part, this should - result in even smaller page loads, and better accessibility and easier navigation, but it may still - be a bit rough in some browsers, if you do find any bugs or accessibility problems, please let me - know. You can reach me at kontakt@marginalia.nu. -

☛ The Random Mode has been overhauled, and is quite entertaining. I encourage you to give it a spin.

☛ A simple public API is now available.

@@ -116,6 +115,8 @@

Publicity, Discussion and Events

+
Marginalia Goes Open Source
+
Hacker News, 2022-05-28
You Should Check Out the Indie Web 🎞️
YouTube, You've Got Kat, 2022-03-15
@@ -137,10 +138,10 @@
Clive Thompson OneZero, 2021-09-16
- Hacker News Discussion + A search engine that favors text-heavy sites and punishes modern web design
- 2021-09-16 + Hacker News, 2021-09-16