From 1b776b114eaf0737afa64a6e2716fc2377516e82 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 4 Mar 2023 14:00:46 +0100 Subject: [PATCH] Restructuring the git repo --- .../util/BrailleBlockPunchCards.java | 4 +- .../marginalia/client/ContextScrambler.java | 81 +++++++++++++++++++ .../index/reverse/ReverseIndexConverter.java | 2 +- libraries/array/readme.md | 26 ++++++ .../java/nu/marginalia/array/LongArray.java | 3 +- .../marginalia/array/PagingIntArrayTest.java | 17 ++++ run/nginx-site.conf | 37 +++++++++ services-core/index-service/readme.md | 10 +++ .../index/results/IndexResultValuator.java | 2 - services-core/search-service/readme.md | 16 ++++ 10 files changed, 192 insertions(+), 6 deletions(-) rename {marginalia_nu => common/model}/src/main/java/nu/marginalia/util/BrailleBlockPunchCards.java (93%) create mode 100644 common/service-client/src/main/java/nu/marginalia/client/ContextScrambler.java create mode 100644 libraries/array/readme.md create mode 100644 run/nginx-site.conf create mode 100644 services-core/index-service/readme.md create mode 100644 services-core/search-service/readme.md diff --git a/marginalia_nu/src/main/java/nu/marginalia/util/BrailleBlockPunchCards.java b/common/model/src/main/java/nu/marginalia/util/BrailleBlockPunchCards.java similarity index 93% rename from marginalia_nu/src/main/java/nu/marginalia/util/BrailleBlockPunchCards.java rename to common/model/src/main/java/nu/marginalia/util/BrailleBlockPunchCards.java index ebf139d0..5877ae03 100644 --- a/marginalia_nu/src/main/java/nu/marginalia/util/BrailleBlockPunchCards.java +++ b/common/model/src/main/java/nu/marginalia/util/BrailleBlockPunchCards.java @@ -4,7 +4,7 @@ public class BrailleBlockPunchCards { private static final char brailleBlockBase = '\u2800'; - public static String printBits(int val, int bits) { + public static String printBits(long val, int bits) { StringBuilder builder = new StringBuilder(); for (int b = 0; b < bits; b+=8, val>>>=8) { @@ -48,7 +48,7 @@ 
public class BrailleBlockPunchCards { * Thanks for coming to my TED talk. */ - private static char bin2brail(int v) { + private static char bin2brail(long v) { return (char)((v & 0x87) | ((v & 0x70) >> 1) | ((v & 0x08) << 3)); } } diff --git a/common/service-client/src/main/java/nu/marginalia/client/ContextScrambler.java b/common/service-client/src/main/java/nu/marginalia/client/ContextScrambler.java new file mode 100644 index 00000000..b1faf393 --- /dev/null +++ b/common/service-client/src/main/java/nu/marginalia/client/ContextScrambler.java @@ -0,0 +1,81 @@ +package nu.marginalia.client; + +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; +import io.reactivex.rxjava3.schedulers.Schedulers; + +import java.util.Arrays; +import java.util.Objects; +import java.util.Random; +import java.util.concurrent.TimeUnit; + +/** Anonymizes connection identifiers by hashing them together with a salt that is replaced every 300-900 seconds. All state is static; the rotation is started by the static initializer. */ public class ContextScrambler { + private static final Random random; + private static final HashFunction hf = Hashing.sha512(); + /** Current salt, always 12 bytes; replaced wholesale by updateSalt(). */ private static volatile byte[] seed = new byte[12]; + + /* Fill the initial salt and start the rotation schedule. */ static { + random = new java.security.SecureRandom(); /* CSPRNG: java.util.Random output is predictable and must not feed a salt */ + int gr = random.nextInt(10000, 20000); + for (int i = 0; i < gr; i++) { + random.nextLong(); + } + random.nextBytes(seed); + + updateSalt(); + } + + /** Anonymize the string by running it through a hash function + * together with a salt that is rotated at random intervals. + *

+ * This is probably not cryptographically secure, but should at least + * be fairly annoying to reverse-engineer. Only the part of connectionInfo before the first '-' is hashed. + */ + public static String anonymize(String connectionInfo) { + final byte[] salt = seed; /* read the volatile salt once so every index below refers to the same array */ + final byte[] hashData = Arrays.copyOf(salt, salt.length+4); + final int hashi = Objects.hash(connectionInfo.split("-", 2)[0]); + + /* append the 32-bit hash after the salt, least significant byte first */ + hashData[salt.length] = (byte)(hashi & 0xFF); + hashData[salt.length+1] = (byte)(hashi>>>8 & 0xFF); + hashData[salt.length+2] = (byte)(hashi>>>16 & 0xFF); + hashData[salt.length+3] = (byte)(hashi>>>24 & 0xFF); + + return String.format("#%x", hf.hashBytes(hashData).asInt()); + } + + /** Generate a humongous salt with as many moving parts as possible, + * as creating a rainbow table of all IP-addresses is fairly easy. The result is 12 bytes: a random int, a hash of System.nanoTime() and a hash of the previous salt, 4 bytes each. + */ + private static byte[] generateSalt() { + byte[] oldHash = seed; + + int hash1 = random.nextInt(); + int hash2 = hf.hashLong(System.nanoTime()).asInt(); + int hash3 = hf.hashBytes(oldHash).asInt(); + + return new byte[]{ + (byte) (hash1 & 0xFF), + (byte) (hash1 >>> 8 & 0xFF), + (byte) (hash1 >>> 16 & 0xFF), + (byte) (hash1 >>> 24 & 0xFF), + (byte) (hash2 & 0xFF), + (byte) (hash2 >>> 8 & 0xFF), + (byte) (hash2 >>> 16 & 0xFF), + (byte) (hash2 >>> 24 & 0xFF), + (byte) (hash3 & 0xFF), + (byte) (hash3 >>> 8 & 0xFF), + (byte) (hash3 >>> 16 & 0xFF), + (byte) (hash3 >>> 24 & 0xFF) + }; + } + + /** Install a fresh salt, then re-schedule this method on the RxJava computation scheduler after a random 300-900 second delay. */ private static void updateSalt() { + seed = generateSalt(); + + int delay = (int) (1000 * (300 + 600*Math.random())); + Schedulers.computation().scheduleDirect(ContextScrambler::updateSalt, delay, TimeUnit.MILLISECONDS); + } + +} diff --git a/index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexConverter.java b/index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexConverter.java index 0a9005fe..bb245a6d 100644 --- a/index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexConverter.java +++ b/index/index-reverse/src/main/java/nu/marginalia/index/reverse/ReverseIndexConverter.java 
@@ -93,7 +93,7 @@ public class ReverseIndexConverter { // Sort each segment of the intermediate file { - LongArray intermediateDocs = LongArray.mmapForWriting(intermediateUrlsFile); + LongArray intermediateDocs = LongArray.mmapForModifying(intermediateUrlsFile); wordsOffsets.foldIO(0, 0, wordsFileSize, (s, e) -> { intermediateDocs.sortLargeSpanN(sortingContext, ReverseIndexParameters.ENTRY_SIZE, s, e); return e; diff --git a/libraries/array/readme.md b/libraries/array/readme.md new file mode 100644 index 00000000..4a34132d --- /dev/null +++ b/libraries/array/readme.md @@ -0,0 +1,26 @@ +# Array Library + +The array library offers easy allocation of large memory mapped files with less +performance overhead than the traditional `buffers[i].get(j)`-style constructions +java often leads to due to its ByteBuffer size limitation. + +It's a very C++-style library that does unidiomatic things with interface default +functions to get diamond inheritance. + +# Quick demo: +``` +var array = + LongArray.mmapForWriting(Path.of("/tmp/test"), 1<<16); + +array.transformEach(50, 1000, (pos, val) -> Long.hashCode(pos)); +array.quickSort(50, 1000); +if (array.binarySearch(array.get(100), 50, 1000) >= 0) { + System.out.println("Nevermind, I found it!"); +} + +array.range(50, 1000).fill(0, 950, 1); +array.forEach(0, 100, (pos, val) -> { + System.out.println(pos + ":" + val); +}); + +``` \ No newline at end of file diff --git a/libraries/array/src/main/java/nu/marginalia/array/LongArray.java b/libraries/array/src/main/java/nu/marginalia/array/LongArray.java index 82a43bb1..8907ff80 100644 --- a/libraries/array/src/main/java/nu/marginalia/array/LongArray.java +++ b/libraries/array/src/main/java/nu/marginalia/array/LongArray.java @@ -41,7 +41,8 @@ public interface LongArray extends LongArrayBase, LongArrayTransformations, Long return PagingLongArray.mapFileReadOnly(DEFAULT_PARTITIONING_SCHEME, path); } - static LongArray mmapForWriting(Path path) throws IOException { + /** Map an existing 
file for writing */ + static LongArray mmapForModifying(Path path) throws IOException { return PagingLongArray.mapFileReadWrite(DEFAULT_PARTITIONING_SCHEME, path); } diff --git a/libraries/array/src/test/java/nu/marginalia/array/PagingIntArrayTest.java b/libraries/array/src/test/java/nu/marginalia/array/PagingIntArrayTest.java index 6b86d8e5..bcb1f367 100644 --- a/libraries/array/src/test/java/nu/marginalia/array/PagingIntArrayTest.java +++ b/libraries/array/src/test/java/nu/marginalia/array/PagingIntArrayTest.java @@ -1,5 +1,6 @@ package nu.marginalia.array; +import nu.marginalia.array.algo.SortingContext; import nu.marginalia.array.page.PagingIntArray; import nu.marginalia.array.page.PagingLongArray; import nu.marginalia.array.scheme.SequentialPartitioningScheme; @@ -29,6 +30,22 @@ class PagingIntArrayTest { TestUtil.clearTempDir(tempDir); } + /** Runs the same statements as the quick demo in libraries/array/readme.md: maps a LongArray, then calls transformEach, quickSort, binarySearch, range+fill and forEach on it, printing to stdout. NOTE(review): makes no assertions and maps the fixed path /tmp/test rather than tempDir - confirm this is intended. */ @Test + public void demo() throws IOException { + var array = + LongArray.mmapForWriting(Path.of("/tmp/test"), 1<<16); + + array.transformEach(50, 1000, (pos, val) -> Long.hashCode(pos)); + array.quickSort(50, 1000); + if (array.binarySearch(array.get(100), 50, 1000) >= 0) { + System.out.println("Nevermind, I found it!"); + } + array.range(50, 1000).fill(0, 950, 1); + array.forEach(0, 100, (pos, val) -> { + System.out.println(pos + ":" + val); + }); + + } @Test public void testReadLoad() throws IOException { SequentialPartitioningScheme partitioningScheme = new SequentialPartitioningScheme(7); diff --git a/run/nginx-site.conf b/run/nginx-site.conf new file mode 100644 index 00000000..3b339af5 --- /dev/null +++ b/run/nginx-site.conf @@ -0,0 +1,37 @@ +server { + listen 80; + listen [::]:80; + server_name nginx; + + proxy_set_header X-Context $remote_addr-$connection; + proxy_set_header X-Extern-Url $scheme://$host$request_uri; + proxy_set_header X-Extern-Domain $scheme://$host; + proxy_set_header X-User-Agent $http_user_agent; + + proxy_set_header X-Public "1"; + + rewrite ^/shuffle/$ /search?query=browse:random&profile=yolo; 
+ rewrite ^/explore/(.*)$ /search?query=browse:$1&profile=yolo; + rewrite ^/links/(.*)$ /search?query=links:$1&profile=corpo; + + + location /screenshot { + proxy_pass http://assistant-service:5025/public$uri; + } + location /site-search { + proxy_pass http://search-service:5023/public/site-search; + } + location /site/suggest { + proxy_pass http://search-service:5023/public/site/suggest; + } + location /site/flag-site { + proxy_pass http://search-service:5023/public/site/flag-site; + } + location /site/ { + rewrite ^/site/(.*)$ /search?query=site:$1&profile=yolo; + } + location / { + proxy_pass http://search-service:5023/public/; + } + +} diff --git a/services-core/index-service/readme.md b/services-core/index-service/readme.md new file mode 100644 index 00000000..cf42bb53 --- /dev/null +++ b/services-core/index-service/readme.md @@ -0,0 +1,10 @@ +# Index Service + +The index service knows which document contains which keywords. + +## Central Classes + +* [IndexService](src/main/java/nu/marginalia/index/IndexService.java) is the REST entry point that the internal API talks to. +* [IndexQueryService](src/main/java/nu/marginalia/index/svc/IndexQueryService.java) executes queries. +* [SearchIndex](src/main/java/nu/marginalia/index/index/SearchIndex.java) owns the state of the index and helps with building a query strategy from parameters. +* [IndexResultValuator](src/main/java/nu/marginalia/index/results/IndexResultValuator.java) determines the best results. 
\ No newline at end of file diff --git a/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java b/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java index a6eb43bc..7962c9ab 100644 --- a/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java +++ b/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexResultValuator.java @@ -19,7 +19,6 @@ import java.util.OptionalInt; public class IndexResultValuator { private final IndexMetadataService metadataService; - private final SearchTermsService searchTermsSvc; private final List> searchTermVariants; private final IndexQueryParams queryParams; private final int[] termIdsAll; @@ -34,7 +33,6 @@ public class IndexResultValuator { TLongList results, List subqueries, IndexQueryParams queryParams) { - this.searchTermsSvc = searchTermsSvc; this.searchTermVariants = subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList(); this.queryParams = queryParams; this.metadataService = metadataService; diff --git a/services-core/search-service/readme.md b/services-core/search-service/readme.md new file mode 100644 index 00000000..fdd5974e --- /dev/null +++ b/services-core/search-service/readme.md @@ -0,0 +1,16 @@ +# Search Service + +This service handles search traffic and is the service +you're most directly interacting with when visiting +[search.marginalia.nu](https://search.marginalia.nu). + +## Central classes + +* [SearchService](src/main/java/nu/marginalia/search/SearchService.java) receives REST requests and delegates to the +appropriate services. + +* [CommandEvaluator](src/main/java/nu/marginalia/search/command/CommandEvaluator.java) interprets a search query and acts +upon it, dealing with special operations like `browse:` or `site:`. 
+ +* [SearchOperator](src/main/java/nu/marginalia/search/SearchOperator.java) parses a search query, passes it to the index service, and +then decorates the search results so that they can be rendered.