From 8ae1f080956ee72a3d5f8a4a0971376ea242d3d5 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 12 Mar 2024 13:12:50 +0100 Subject: [PATCH 01/47] (WIP) Implement first take of new query segmentation algorithm --- code/functions/search-query/build.gradle | 5 + .../segmentation/BasicSentenceExtractor.java | 16 ++ .../searchquery/segmentation/HasherGroup.java | 61 +++++++ .../segmentation/NgramExporterMain.java | 46 +++++ .../segmentation/NgramExtractorMain.java | 113 ++++++++++++ .../segmentation/NgramLexicon.java | 165 ++++++++++++++++++ .../segmentation/HasherGroupTest.java | 33 ++++ .../segmentation/NgramLexiconTest.java | 53 ++++++ 8 files changed, 492 insertions(+) create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/BasicSentenceExtractor.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/HasherGroup.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExporterMain.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java create mode 100644 code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/HasherGroupTest.java create mode 100644 code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/NgramLexiconTest.java diff --git a/code/functions/search-query/build.gradle b/code/functions/search-query/build.gradle index 86cafefa..76c520fb 100644 --- a/code/functions/search-query/build.gradle +++ b/code/functions/search-query/build.gradle @@ -26,6 +26,9 @@ dependencies { implementation project(':code:libraries:term-frequency-dict') implementation project(':third-party:porterstemmer') + implementation project(':third-party:openzim') + implementation project(':third-party:commons-codec') + implementation project(':code:libraries:language-processing') implementation project(':code:libraries:term-frequency-dict') implementation project(':code:features-convert:keyword-extraction') @@ -36,6 +39,8 @@ dependencies { implementation libs.bundles.grpc implementation libs.notnull implementation libs.guice + implementation libs.jsoup + implementation libs.commons.lang3 implementation libs.trove implementation libs.fastutil implementation libs.bundles.gson diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/BasicSentenceExtractor.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/BasicSentenceExtractor.java new file mode 100644 index 00000000..e65c243d --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/BasicSentenceExtractor.java @@ -0,0 +1,16 @@ +package nu.marginalia.functions.searchquery.segmentation; + +import ca.rmen.porterstemmer.PorterStemmer; +import org.apache.commons.lang3.StringUtils; + +public class BasicSentenceExtractor { + + private static PorterStemmer porterStemmer = new PorterStemmer(); + public static String[] getStemmedParts(String sentence) { + String[] parts = StringUtils.split(sentence, ' '); + for (int i = 0; i < parts.length; i++) { + parts[i] = porterStemmer.stemWord(parts[i]); + } + return parts; + } +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/HasherGroup.java 
b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/HasherGroup.java new file mode 100644 index 00000000..60bbb4dd --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/HasherGroup.java @@ -0,0 +1,61 @@ +package nu.marginalia.functions.searchquery.segmentation; + +import nu.marginalia.hash.MurmurHash3_128; + +/** A group of hash functions that can be used to hash a sequence of strings, + * that also has an inverse operation that can be used to remove a previously applied + * string from the sequence. */ +sealed interface HasherGroup { + /** Apply a hash to the accumulator */ + long apply(long acc, long add); + + /** Remove a hash that was added n operations ago from the accumulator, add a new one */ + long replace(long acc, long add, long rem, int n); + + /** Create a new hasher group that preserves the order of appleid hash functions */ + static HasherGroup ordered() { + return new OrderedHasher(); + } + + /** Create a new hasher group that does not preserve the order of applied hash functions */ + static HasherGroup unordered() { + return new UnorderedHasher(); + } + + /** Bake the words in the sentence into a hash successively using the group's apply function */ + default long rollingHash(String[] parts) { + long code = 0; + for (String part : parts) { + code = apply(code, hash(part)); + } + return code; + } + + MurmurHash3_128 hash = new MurmurHash3_128(); + /** Calculate the hash of a string */ + static long hash(String term) { + return hash.hashNearlyASCII(term); + } + + final class UnorderedHasher implements HasherGroup { + + public long apply(long acc, long add) { + return acc ^ add; + } + + public long replace(long acc, long add, long rem, int n) { + return acc ^ rem ^ add; + } + } + + final class OrderedHasher implements HasherGroup { + + public long apply(long acc, long add) { + return Long.rotateLeft(acc, 1) ^ add; + } + + public long replace(long acc, long add, long rem, int n) { + return Long.rotateLeft(acc, 1) ^ add ^ Long.rotateLeft(rem, n); + } + } +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExporterMain.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExporterMain.java new file mode 100644 index 00000000..087345f6 --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExporterMain.java @@ -0,0 +1,46 @@ +package nu.marginalia.functions.searchquery.segmentation; + +import nu.marginalia.WmsaHome; +import nu.marginalia.language.sentence.SentenceExtractor; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.Scanner; + +public class NgramExporterMain { + + public static void main(String... 
args) throws IOException { + trial(); + } + + static void trial() throws IOException { + SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); + + NgramLexicon lexicon = new NgramLexicon(); + lexicon.loadCounts(Path.of("/home/vlofgren/ngram-counts.bin")); + + System.out.println("Loaded!"); + + var scanner = new Scanner(System.in); + for (;;) { + System.out.println("Enter a sentence: "); + String line = scanner.nextLine(); + System.out.println("."); + if (line == null) + break; + + String[] terms = BasicSentenceExtractor.getStemmedParts(line); + System.out.println("."); + + for (int i = 2; i< 8; i++) { + lexicon.findSegments(i, terms).forEach(p -> { + System.out.println(STR."\{Arrays.toString(p.project(terms))}: \{p.count()}"); + }); + } + + } + } + + +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java new file mode 100644 index 00000000..0339b2c1 --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java @@ -0,0 +1,113 @@ +package nu.marginalia.functions.searchquery.segmentation; + +import it.unimi.dsi.fastutil.longs.*; +import nu.marginalia.hash.MurmurHash3_128; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.openzim.ZIMTypes.ZIMFile; +import org.openzim.ZIMTypes.ZIMReader; + +import java.io.IOException; +import java.io.PrintWriter; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.Executors; + +public class NgramExtractorMain { + static MurmurHash3_128 hash = new MurmurHash3_128(); + + public static void main(String... 
args) { + } + + private static List getNgramTerms(Document document) { + List terms = new ArrayList<>(); + + document.select("a[href]").forEach(e -> { + var href = e.attr("href"); + if (href.contains(":")) + return; + if (href.contains("/")) + return; + + var text = e.text().toLowerCase(); + if (!text.contains(" ")) + return; + + terms.add(text); + }); + + return terms; + } + + public static void dumpNgramsList( + Path zimFile, + Path ngramFile + ) throws IOException, InterruptedException { + ZIMReader reader = new ZIMReader(new ZIMFile(zimFile.toString())); + + PrintWriter printWriter = new PrintWriter(Files.newOutputStream(ngramFile, + StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE)); + + LongOpenHashSet known = new LongOpenHashSet(); + + try (var executor = Executors.newWorkStealingPool()) { + reader.forEachArticles((title, body) -> { + executor.submit(() -> { + var terms = getNgramTerms(Jsoup.parse(body)); + synchronized (known) { + for (String term : terms) { + if (known.add(hash.hashNearlyASCII(term))) { + printWriter.println(term); + } + } + } + }); + + }, p -> true); + } + printWriter.close(); + } + + public static void dumpCounts(Path zimInputFile, + Path countsOutputFile) throws IOException, InterruptedException + { + ZIMReader reader = new ZIMReader(new ZIMFile(zimInputFile.toString())); + + NgramLexicon lexicon = new NgramLexicon(); + + var orderedHasher = HasherGroup.ordered(); + var unorderedHasher = HasherGroup.unordered(); + + try (var executor = Executors.newWorkStealingPool()) { + reader.forEachArticles((title, body) -> { + executor.submit(() -> { + LongArrayList orderedHashes = new LongArrayList(); + LongArrayList unorderedHashes = new LongArrayList(); + + for (var sent : getNgramTerms(Jsoup.parse(body))) { + String[] terms = BasicSentenceExtractor.getStemmedParts(sent); + + orderedHashes.add(orderedHasher.rollingHash(terms)); + unorderedHashes.add(unorderedHasher.rollingHash(terms)); + } + + synchronized (lexicon) { + for (var hash : orderedHashes) { + lexicon.incOrdered(hash); + } + for (var hash : unorderedHashes) { + lexicon.addUnordered(hash); + } + } + }); + + }, p -> true); + } + + lexicon.saveCounts(countsOutputFile); + } + +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java new file mode 100644 index 00000000..948347bf --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java @@ -0,0 +1,165 @@ +package nu.marginalia.functions.searchquery.segmentation; + +import it.unimi.dsi.fastutil.longs.Long2IntOpenCustomHashMap; +import it.unimi.dsi.fastutil.longs.LongHash; +import it.unimi.dsi.fastutil.longs.LongOpenHashSet; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +class NgramLexicon { + private final Long2IntOpenCustomHashMap counts = new Long2IntOpenCustomHashMap( + 100_000_000, + new KeyIsAlreadyHashStrategy() + ); + private final LongOpenHashSet permutations = new LongOpenHashSet(); + + private static final HasherGroup orderedHasher = HasherGroup.ordered(); + private static final HasherGroup unorderedHasher = HasherGroup.unordered(); + + public List findSegments(int 
length, String... parts) { + // Don't look for ngrams longer than the sentence + if (parts.length < length) return List.of(); + + List positions = new ArrayList<>(); + + // Hash the parts + long[] hashes = new long[parts.length]; + for (int i = 0; i < hashes.length; i++) { + hashes[i] = HasherGroup.hash(parts[i]); + } + + long ordered = 0; + long unordered = 0; + int i = 0; + + // Prepare by combining up to length hashes + for (; i < length; i++) { + ordered = orderedHasher.apply(ordered, hashes[i]); + unordered = unorderedHasher.apply(unordered, hashes[i]); + } + + // Slide the window and look for matches + for (;; i++) { + int ct = counts.get(ordered); + + if (ct > 0) { + positions.add(new SentenceSegment(i - length, length, ct, PositionType.NGRAM)); + } + else if (permutations.contains(unordered)) { + positions.add(new SentenceSegment(i - length, length, 0, PositionType.PERMUTATION)); + } + + if (i >= hashes.length) + break; + + // Remove the oldest hash and add the new one + ordered = orderedHasher.replace(ordered, + hashes[i], + hashes[i - length], + length); + unordered = unorderedHasher.replace(unordered, + hashes[i], + hashes[i - length], + length); + } + + return positions; + } + + public void incOrdered(long hashOrdered) { + counts.addTo(hashOrdered, 1); + } + public void addUnordered(long hashUnordered) { + permutations.add(hashUnordered); + } + + public void loadCounts(Path path) throws IOException { + try (var dis = new DataInputStream(Files.newInputStream(path))) { + long size = dis.readInt(); + + for (int i = 0; i < size; i++) { + counts.put(dis.readLong(), dis.readInt()); + } + } + } + + public void loadPermutations(Path path) throws IOException { + try (var dis = new DataInputStream(Files.newInputStream(path))) { + long size = dis.readInt(); + + for (int i = 0; i < size; i++) { + permutations.add(dis.readLong()); + } + } + } + + public void saveCounts(Path file) throws IOException { + try (var dos = new DataOutputStream(Files.newOutputStream(file, + StandardOpenOption.CREATE, + StandardOpenOption.TRUNCATE_EXISTING, + StandardOpenOption.WRITE))) { + dos.writeInt(counts.size()); + + counts.forEach((k, v) -> { + try { + dos.writeLong(k); + dos.writeInt(v); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + } + } + public void savePermutations(Path file) throws IOException { + try (var dos = new DataOutputStream(Files.newOutputStream(file, + StandardOpenOption.CREATE, + StandardOpenOption.TRUNCATE_EXISTING, + StandardOpenOption.WRITE))) { + dos.writeInt(counts.size()); + + permutations.forEach(v -> { + try { + dos.writeLong(v); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + } + } + public void clear() { + permutations.clear(); + counts.clear(); + } + + public record SentenceSegment(int start, int length, int count, PositionType type) { + public String[] project(String... 
parts) { + return Arrays.copyOfRange(parts, start, start + length); + } + } + + enum PositionType { + NGRAM, PERMUTATION + } + + private static class KeyIsAlreadyHashStrategy implements LongHash.Strategy { + @Override + public int hashCode(long l) { + return (int) l; + } + + @Override + public boolean equals(long l, long l1) { + return l == l1; + } + } + +} + diff --git a/code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/HasherGroupTest.java b/code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/HasherGroupTest.java new file mode 100644 index 00000000..174bd553 --- /dev/null +++ b/code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/HasherGroupTest.java @@ -0,0 +1,33 @@ +package nu.marginalia.functions.searchquery.segmentation; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +class HasherGroupTest { + + @Test + void ordered() { + long a = 5; + long b = 3; + long c = 2; + + var group = HasherGroup.ordered(); + assertNotEquals(group.apply(a, b), group.apply(b, a)); + assertEquals(group.apply(b,c), group.replace(group.apply(a, b), c, a, 2)); + } + + @Test + void unordered() { + long a = 5; + long b = 3; + long c = 2; + + var group = HasherGroup.unordered(); + + assertEquals(group.apply(a, b), group.apply(b, a)); + assertEquals(group.apply(b, c), group.replace(group.apply(a, b), c, a, 2)); + } + + +} diff --git a/code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/NgramLexiconTest.java b/code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/NgramLexiconTest.java new file mode 100644 index 00000000..28b9ef2f --- /dev/null +++ b/code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/NgramLexiconTest.java @@ -0,0 +1,53 @@ +package nu.marginalia.functions.searchquery.segmentation; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +class NgramLexiconTest { + NgramLexicon lexicon = new NgramLexicon(); + @BeforeEach + public void setUp() { + lexicon.clear(); + } + + void addNgram(String... 
ngram) { + lexicon.incOrdered(HasherGroup.ordered().rollingHash(ngram)); + lexicon.addUnordered(HasherGroup.unordered().rollingHash(ngram)); + } + + @Test + void findSegments() { + addNgram("hello", "world"); + addNgram("rye", "bread"); + addNgram("rye", "world"); + + String[] sent = { "hello", "world", "rye", "bread" }; + var segments = lexicon.findSegments(2, "hello", "world", "rye", "bread"); + + assertEquals(3, segments.size()); + + for (int i = 0; i < 3; i++) { + var segment = segments.get(i); + switch (i) { + case 0 -> { + assertArrayEquals(new String[]{"hello", "world"}, segment.project(sent)); + assertEquals(1, segment.count()); + assertEquals(NgramLexicon.PositionType.NGRAM, segment.type()); + } + case 1 -> { + assertArrayEquals(new String[]{"world", "rye"}, segment.project(sent)); + assertEquals(0, segment.count()); + assertEquals(NgramLexicon.PositionType.PERMUTATION, segment.type()); + } + case 2 -> { + assertArrayEquals(new String[]{"rye", "bread"}, segment.project(sent)); + assertEquals(1, segment.count()); + assertEquals(NgramLexicon.PositionType.NGRAM, segment.type()); + } + } + } + + } +} \ No newline at end of file From 07e4d7ec6d2806dc65a1916bdb5cb16d8f8deced Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 18 Mar 2024 13:16:00 +0100 Subject: [PATCH 02/47] (WIP) Improve data extraction from wikipedia data --- .../segmentation/NgramExtractorMain.java | 54 +++++++++++++++++-- .../segmentation/NgramLexicon.java | 2 +- 2 files changed, 51 insertions(+), 5 deletions(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java index 0339b2c1..4cd4b296 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java @@ -22,9 +22,15 @@ public class NgramExtractorMain { public static void main(String... 
args) { } - private static List getNgramTerms(Document document) { + private static List getNgramTerms(String title, Document document) { List terms = new ArrayList<>(); + // Add the title + if (title.contains(" ")) { + terms.add(title.toLowerCase()); + } + + // Grab all internal links document.select("a[href]").forEach(e -> { var href = e.attr("href"); if (href.contains(":")) @@ -39,6 +45,43 @@ public class NgramExtractorMain { terms.add(text); }); + // Grab all italicized text + document.getElementsByTag("i").forEach(e -> { + var text = e.text().toLowerCase(); + if (!text.contains(" ")) + return; + + terms.add(text); + }); + + // Trim the discovered terms + terms.replaceAll(s -> { + + // Remove trailing parentheses and their contents + if (s.endsWith(")")) { + int idx = s.lastIndexOf('('); + if (idx > 0) { + return s.substring(0, idx).trim(); + } + } + + // Remove leading "list of " + if (s.startsWith("list of ")) { + return s.substring("list of ".length()); + } + + return s; + }); + + // Remove terms that are too short or too long + terms.removeIf(s -> { + if (!s.contains(" ")) + return true; + if (s.length() > 64) + return true; + return false; + }); + return terms; } @@ -56,7 +99,7 @@ public class NgramExtractorMain { try (var executor = Executors.newWorkStealingPool()) { reader.forEachArticles((title, body) -> { executor.submit(() -> { - var terms = getNgramTerms(Jsoup.parse(body)); + var terms = getNgramTerms(title, Jsoup.parse(body)); synchronized (known) { for (String term : terms) { if (known.add(hash.hashNearlyASCII(term))) { @@ -72,7 +115,9 @@ public class NgramExtractorMain { } public static void dumpCounts(Path zimInputFile, - Path countsOutputFile) throws IOException, InterruptedException + Path countsOutputFile, + Path permutationsOutputFile + ) throws IOException, InterruptedException { ZIMReader reader = new ZIMReader(new ZIMFile(zimInputFile.toString())); @@ -87,7 +132,7 @@ public class NgramExtractorMain { LongArrayList orderedHashes = new LongArrayList(); LongArrayList unorderedHashes = new LongArrayList(); - for (var sent : getNgramTerms(Jsoup.parse(body))) { + for (var sent : getNgramTerms(title, Jsoup.parse(body))) { String[] terms = BasicSentenceExtractor.getStemmedParts(sent); orderedHashes.add(orderedHasher.rollingHash(terms)); @@ -108,6 +153,7 @@ public class NgramExtractorMain { } lexicon.saveCounts(countsOutputFile); + lexicon.savePermutations(permutationsOutputFile); } } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java index 948347bf..f8044e12 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java @@ -14,7 +14,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -class NgramLexicon { +public class NgramLexicon { private final Long2IntOpenCustomHashMap counts = new Long2IntOpenCustomHashMap( 100_000_000, new KeyIsAlreadyHashStrategy() From 00ef4f98031efb6d0a4d725124e46ebaab37609e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 18 Mar 2024 13:16:49 +0100 Subject: [PATCH 03/47] (WIP) Partial integration of new query expansion code into the query-serivice --- .../query_parser/QueryVariants.java | 187 +------------- .../variant/ExpansionStrategy.java | 7 + .../query_parser/variant/QueryExpansion.java | 
111 ++++++++ .../query_parser/variant/QueryVariant.java | 17 ++ .../query_parser/variant/QueryVariantSet.java | 21 ++ .../query_parser/variant/QueryWord.java | 10 + .../query_parser/variant/VariantStrategy.java | 8 + .../query_parser/variant/model/QWord.java | 47 ++++ .../variant/model/QWordGraph.java | 236 ++++++++++++++++++ .../variant/strategy/CombineDashes.java | 40 +++ .../variant/strategy/JoinTerms.java | 58 +++++ .../variant/strategy/SplitWordNum.java | 65 +++++ .../searchquery/svc/QueryFactory.java | 11 +- .../variant/model/QWordGraphTest.java | 33 +++ .../query/svc/QueryFactoryTest.java | 3 +- 15 files changed, 666 insertions(+), 188 deletions(-) create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/ExpansionStrategy.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariant.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariantSet.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryWord.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/VariantStrategy.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWord.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraph.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/CombineDashes.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/JoinTerms.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/SplitWordNum.java create mode 100644 code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraphTest.java diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryVariants.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryVariants.java index 9732e53f..10648486 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryVariants.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryVariants.java @@ -1,17 +1,14 @@ package nu.marginalia.functions.searchquery.query_parser; -import ca.rmen.porterstemmer.PorterStemmer; -import lombok.AllArgsConstructor; -import lombok.EqualsAndHashCode; -import lombok.Getter; -import lombok.ToString; import nu.marginalia.functions.searchquery.query_parser.token.Token; import nu.marginalia.functions.searchquery.query_parser.token.TokenType; +import nu.marginalia.functions.searchquery.query_parser.variant.QueryVariant; +import nu.marginalia.functions.searchquery.query_parser.variant.QueryVariantSet; +import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord; import nu.marginalia.util.language.EnglishDictionary; import nu.marginalia.LanguageModels; import nu.marginalia.keyword.KeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; -import 
nu.marginalia.util.ngrams.NGramBloomFilter; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.language.model.DocumentSentence; import nu.marginalia.language.model.WordSpan; @@ -22,17 +19,13 @@ import java.util.regex.Pattern; public class QueryVariants { private final KeywordExtractor keywordExtractor; private final TermFrequencyDict dict; - private final PorterStemmer ps = new PorterStemmer(); - private final NGramBloomFilter nGramBloomFilter; private final EnglishDictionary englishDictionary; private final ThreadLocal sentenceExtractor; public QueryVariants(LanguageModels lm, TermFrequencyDict dict, - NGramBloomFilter nGramBloomFilter, EnglishDictionary englishDictionary) { - this.nGramBloomFilter = nGramBloomFilter; this.englishDictionary = englishDictionary; this.keywordExtractor = new KeywordExtractor(); this.sentenceExtractor = ThreadLocal.withInitial(() -> new SentenceExtractor(lm)); @@ -40,33 +33,6 @@ public class QueryVariants { } - final Pattern numWordBoundary = Pattern.compile("[0-9][a-zA-Z]|[a-zA-Z][0-9]"); - final Pattern dashBoundary = Pattern.compile("-"); - - @AllArgsConstructor - private static class Word { - public final String stemmed; - public final String word; - public final String wordOriginal; - } - - @AllArgsConstructor @Getter @ToString @EqualsAndHashCode - public static class QueryVariant { - public final List terms; - public final double value; - } - - @Getter @ToString - public static class QueryVariantSet { - final List faithful = new ArrayList<>(); - final List alternative = new ArrayList<>(); - - final List nonLiterals = new ArrayList<>(); - - public boolean isEmpty() { - return faithful.isEmpty() && alternative.isEmpty() && nonLiterals.isEmpty(); - } - } public QueryVariantSet getQueryVariants(List query) { final JoinedQueryAndNonLiteralTokens joinedQuery = joinQuery(query); @@ -108,19 +74,11 @@ public class QueryVariants { byStart.put(0, elongatedFirstWords); } - final List> goodSpans = getWordSpans(byStart, sentence, livingSpans); + final List> goodSpans = getWordSpans(byStart, sentence, livingSpans); List> faithfulQueries = new ArrayList<>(); List> alternativeQueries = new ArrayList<>(); - for (var ls : goodSpans) { - faithfulQueries.addAll(createTokens(ls)); - } - - for (var span : goodSpans) { - alternativeQueries.addAll(joinTerms(span)); - } - for (var ls : goodSpans) { var last = ls.get(ls.size() - 1); @@ -174,105 +132,8 @@ public class QueryVariants { return ret; } - private Collection> createTokens(List ls) { - List asTokens = new ArrayList<>(); - List> ret = new ArrayList<>(); - - - boolean dash = false; - boolean num = false; - - for (var span : ls) { - dash |= dashBoundary.matcher(span.word).find(); - num |= numWordBoundary.matcher(span.word).find(); - if (ls.size() == 1 || !isOmittableWord(span.word)) { - asTokens.add(span.word); - } - } - ret.add(asTokens); - - if (dash) { - ret.addAll(combineDashWords(ls)); - } - - if (num) { - ret.addAll(splitWordNum(ls)); - } - - return ret; - } - - private boolean isOmittableWord(String word) { - return switch (word) { - case "vs", "or", "and", "versus", "is", "the", "why", "when", "if", "who", "are", "am" -> true; - default -> false; - }; - } - - private Collection> splitWordNum(List ls) { - List asTokens2 = new ArrayList<>(); - - boolean num = false; - - for (var span : ls) { - var wordMatcher = numWordBoundary.matcher(span.word); - var stemmedMatcher = numWordBoundary.matcher(span.stemmed); - - int ws = 0; - int ss = 0; - boolean didSplit = false; - while 
(wordMatcher.find(ws) && stemmedMatcher.find(ss)) { - ws = wordMatcher.start()+1; - ss = stemmedMatcher.start()+1; - if (nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "_")) - || nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "-"))) - { - String combined = splitAtNumBoundary(span.word, wordMatcher.start(), "_"); - asTokens2.add(combined); - didSplit = true; - num = true; - } - } - - if (!didSplit) { - asTokens2.add(span.word); - } - } - - if (num) { - return List.of(asTokens2); - } - return Collections.emptyList(); - } - - private Collection> combineDashWords(List ls) { - List asTokens2 = new ArrayList<>(); - boolean dash = false; - - for (var span : ls) { - var matcher = dashBoundary.matcher(span.word); - if (matcher.find() && nGramBloomFilter.isKnownNGram(ps.stemWord(dashBoundary.matcher(span.word).replaceAll("")))) { - dash = true; - String combined = dashBoundary.matcher(span.word).replaceAll(""); - asTokens2.add(combined); - } - else { - asTokens2.add(span.word); - } - } - - if (dash) { - return List.of(asTokens2); - } - return Collections.emptyList(); - } - - private String splitAtNumBoundary(String in, int splitPoint, String joiner) { - return in.substring(0, splitPoint+1) + joiner + in.substring(splitPoint+1); - } - - private List> getWordSpans(TreeMap> byStart, DocumentSentence sentence, List> livingSpans) { - List> goodSpans = new ArrayList<>(); + private List> getWordSpans(TreeMap> byStart, DocumentSentence sentence, List> livingSpans) { + List> goodSpans = new ArrayList<>(); for (int i = 0; i < 1; i++) { var spans = byStart.get(i); @@ -298,9 +159,9 @@ public class QueryVariants { int end = span.get(span.size()-1).end; if (end == sentence.length()) { - var gs = new ArrayList(span.size()); + var gs = new ArrayList(span.size()); for (var s : span) { - gs.add(new Word(sentence.constructStemmedWordFromSpan(s), sentence.constructWordFromSpan(s), + gs.add(new QueryWord(sentence.constructStemmedWordFromSpan(s), sentence.constructWordFromSpan(s), s.size() == 1 ? 
sentence.words[s.start] : "")); } goodSpans.add(gs); @@ -325,38 +186,6 @@ public class QueryVariants { return goodSpans; } - private List> joinTerms(List span) { - List> ret = new ArrayList<>(); - - for (int i = 0; i < span.size()-1; i++) { - var a = span.get(i); - var b = span.get(i+1); - - var stemmed = ps.stemWord(a.word + b.word); - - double scoreCombo = dict.getTermFreqStemmed(stemmed); - if (scoreCombo > 10000) { - List asTokens = new ArrayList<>(); - - for (int j = 0; j < i; j++) { - var word = span.get(j).word; - asTokens.add(word); - } - { - var word = a.word + b.word; - asTokens.add(word); - } - for (int j = i+2; j < span.size(); j++) { - var word = span.get(j).word; - asTokens.add(word); - } - - ret.add(asTokens); - } - } - - return ret; - } private JoinedQueryAndNonLiteralTokens joinQuery(List query) { StringJoiner s = new StringJoiner(" "); diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/ExpansionStrategy.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/ExpansionStrategy.java new file mode 100644 index 00000000..18987aea --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/ExpansionStrategy.java @@ -0,0 +1,7 @@ +package nu.marginalia.functions.searchquery.query_parser.variant; + +import nu.marginalia.functions.searchquery.query_parser.variant.model.QWordGraph; + +public interface ExpansionStrategy { + void expand(QWordGraph graph); +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java new file mode 100644 index 00000000..faac81d4 --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java @@ -0,0 +1,111 @@ +package nu.marginalia.functions.searchquery.query_parser.variant; + +import ca.rmen.porterstemmer.PorterStemmer; +import nu.marginalia.functions.searchquery.query_parser.variant.model.QWord; +import nu.marginalia.functions.searchquery.query_parser.variant.model.QWordGraph; +import nu.marginalia.functions.searchquery.segmentation.NgramLexicon; +import nu.marginalia.term_frequency_dict.TermFrequencyDict; +import org.apache.commons.lang3.StringUtils; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +public class QueryExpansion { + private static final PorterStemmer ps = new PorterStemmer(); + private final TermFrequencyDict dict; + private final NgramLexicon lexicon; + List expansionStrategies = List.of( + this::joinDashes, + this::splitWordNum, + this::joinTerms, + this::createSegments + ); + + public QueryExpansion(TermFrequencyDict dict, + NgramLexicon lexicon + ) { + this.dict = dict; + this.lexicon = lexicon; + } + + public QWordGraph expandQuery(List words) { + + QWordGraph graph = new QWordGraph(words); + + for (var strategy : expansionStrategies) { + strategy.expand(graph); + } + + return null; + } + + private static final Pattern dashPattern = Pattern.compile("-"); + private static final Pattern numWordBoundary = Pattern.compile("[0-9][a-zA-Z]|[a-zA-Z][0-9]"); + + // Turn 'lawn-chair' into 'lawnchair' + public void joinDashes(QWordGraph graph) { + for (var qw : graph) { + if (qw.word().contains("-")) { + var joined = StringUtils.join(dashPattern.split(qw.word())); + graph.addVariant(qw, joined); + } + } + } + + + // Turn 
'MP3' into 'MP-3' + public void splitWordNum(QWordGraph graph) { + for (var qw : graph) { + var matcher = numWordBoundary.matcher(qw.word()); + if (matcher.matches()) { + var joined = StringUtils.join(dashPattern.split(qw.word()), '-'); + graph.addVariant(qw, joined); + } + } + } + + // Turn 'lawn chair' into 'lawnchair' + public void joinTerms(QWordGraph graph) { + QWord prev = null; + + for (var qw : graph) { + if (prev != null) { + var joinedWord = prev.word() + qw.word(); + var joinedStemmed = ps.stemWord(joinedWord); + + var scoreA = dict.getTermFreqStemmed(prev.stemmed()); + var scoreB = dict.getTermFreqStemmed(qw.stemmed()); + + var scoreCombo = dict.getTermFreqStemmed(joinedStemmed); + + if (scoreCombo > scoreA + scoreB || scoreCombo > 1000) { + graph.addVariantForSpan(prev, qw, joinedWord); + } + } + + prev = qw; + } + } + + public void createSegments(QWordGraph graph) { + List nodes = new ArrayList<>(); + + for (var qw : graph) { + nodes.add(qw); + } + + String[] words = nodes.stream().map(QWord::word).toArray(String[]::new); + + for (int length = 2; length < Math.min(10, words.length); length++) { + for (var segment : lexicon.findSegments(length, words)) { + int start = segment.start(); + int end = segment.start() + segment.length(); + var word = StringUtils.join(words, "_", start, end); + + graph.addVariantForSpan(nodes.get(start), nodes.get(end), word); + } + } + } + +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariant.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariant.java new file mode 100644 index 00000000..8d24387b --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariant.java @@ -0,0 +1,17 @@ +package nu.marginalia.functions.searchquery.query_parser.variant; + +import lombok.AllArgsConstructor; +import lombok.EqualsAndHashCode; +import lombok.Getter; +import lombok.ToString; + +import java.util.List; + +@AllArgsConstructor +@Getter +@ToString +@EqualsAndHashCode +public class QueryVariant { + public final List terms; + public final double value; +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariantSet.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariantSet.java new file mode 100644 index 00000000..b01fbd5e --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariantSet.java @@ -0,0 +1,21 @@ +package nu.marginalia.functions.searchquery.query_parser.variant; + +import lombok.Getter; +import lombok.ToString; +import nu.marginalia.functions.searchquery.query_parser.token.Token; + +import java.util.ArrayList; +import java.util.List; + +@Getter +@ToString +public class QueryVariantSet { + public final List faithful = new ArrayList<>(); + public final List alternative = new ArrayList<>(); + + public final List nonLiterals = new ArrayList<>(); + + public boolean isEmpty() { + return faithful.isEmpty() && alternative.isEmpty() && nonLiterals.isEmpty(); + } +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryWord.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryWord.java new file mode 100644 index 00000000..9c158a43 --- /dev/null +++ 
b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryWord.java @@ -0,0 +1,10 @@ +package nu.marginalia.functions.searchquery.query_parser.variant; + +import lombok.AllArgsConstructor; + +@AllArgsConstructor +public class QueryWord { + public final String stemmed; + public final String word; + public final String wordOriginal; +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/VariantStrategy.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/VariantStrategy.java new file mode 100644 index 00000000..2c1a5bfb --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/VariantStrategy.java @@ -0,0 +1,8 @@ +package nu.marginalia.functions.searchquery.query_parser.variant; + +import java.util.Collection; +import java.util.List; + +public interface VariantStrategy { + Collection> constructVariants(List ls); +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWord.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWord.java new file mode 100644 index 00000000..07f65c95 --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWord.java @@ -0,0 +1,47 @@ +package nu.marginalia.functions.searchquery.query_parser.variant.model; + +import ca.rmen.porterstemmer.PorterStemmer; + +public record QWord( + int ord, + boolean variant, + String stemmed, + String word, + String original) +{ + + // These are special words that are not in the input, but are added to the graph, + // note the space around the ^ and $, to avoid collisions with real words + private static final String BEG_MARKER = " ^ "; + private static final String END_MARKER = " $ "; + + private static final PorterStemmer ps = new PorterStemmer(); + + public boolean isBeg() { + return word.equals(BEG_MARKER); + } + + public boolean isEnd() { + return word.equals(END_MARKER); + } + + public static QWord beg() { + return new QWord(Integer.MIN_VALUE, false, BEG_MARKER, BEG_MARKER, BEG_MARKER); + } + + public static QWord end() { + return new QWord(Integer.MAX_VALUE, false, END_MARKER, END_MARKER, END_MARKER); + } + + public boolean isOriginal() { + return !variant; + } + + public QWord(int ord, String word) { + this(ord, false, ps.stemWord(word), word, word); + } + + public QWord(int ord, QWord original, String word) { + this(ord, true, ps.stemWord(word), word, original.original); + } +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraph.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraph.java new file mode 100644 index 00000000..f9902733 --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraph.java @@ -0,0 +1,236 @@ +package nu.marginalia.functions.searchquery.query_parser.variant.model; + +import org.jetbrains.annotations.NotNull; + +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** Graph structure for constructing query variants. The graph should be a directed acyclic graph, + * with a single start node and a single end node, denoted by QWord.beg() and QWord.end() respectively. + *
+ * Naively, every path from the start to the end node should represent a valid query variant, although in + * practice it is desirable to be clever about how to evaluate the paths, to avoid combinatorial explosion. + */ +public class QWordGraph implements Iterable { + + + public record QWordGraphLink(QWord from, QWord to) { + } + + private final List links = new ArrayList<>(); + private final Map> fromTo = new HashMap<>(); + private final Map> toFrom = new HashMap<>(); + + private int wordId = 0; + + public QWordGraph(String... words) { + this(List.of(words)); + } + + public QWordGraph(List words) { + QWord beg = QWord.beg(); + QWord end = QWord.end(); + + var prev = beg; + + for (String s : words) { + var word = new QWord(wordId++, s); + addLink(prev, word); + prev = word; + } + + addLink(prev, end); + } + + public void addVariant(QWord original, String word) { + var siblings = getVariants(original); + if (siblings.stream().anyMatch(w -> w.word().equals(word))) + return; + + var newWord = new QWord(wordId++, original, word); + + for (var prev : getPrev(original)) + addLink(prev, newWord); + for (var next : getNext(original)) + addLink(newWord, next); + } + + public void addVariantForSpan(QWord first, QWord last, String word) { + var newWord = new QWord(wordId++, first, word); + + for (var prev : getPrev(first)) + addLink(prev, newWord); + for (var next : getNext(last)) + addLink(newWord, next); + } + + public List getVariants(QWord original) { + var prevNext = getPrev(original).stream() + .flatMap(prev -> getNext(prev).stream()) + .collect(Collectors.toSet()); + + return getNext(original).stream() + .flatMap(next -> getPrev(next).stream()) + .filter(prevNext::contains) + .collect(Collectors.toList()); + } + + + public void addLink(QWord from, QWord to) { + links.add(new QWordGraphLink(from, to)); + fromTo.computeIfAbsent(from, k -> new ArrayList<>()).add(to); + toFrom.computeIfAbsent(to, k -> new ArrayList<>()).add(from); + } + + public List links() { + return Collections.unmodifiableList(links); + } + public List nodes() { + return links.stream() + .flatMap(l -> Stream.of(l.from(), l.to())) + .sorted(Comparator.comparing(QWord::ord)) + .distinct() + .collect(Collectors.toList()); + } + + + public List getNext(QWord word) { + return fromTo.getOrDefault(word, List.of()); + } + public List getNextOriginal(QWord word) { + return fromTo.getOrDefault(word, List.of()) + .stream() + .filter(QWord::isOriginal) + .toList(); + } + + public List getPrev(QWord word) { + return toFrom.getOrDefault(word, List.of()); + } + public List getPrevOriginal(QWord word) { + return toFrom.getOrDefault(word, List.of()) + .stream() + .filter(QWord::isOriginal) + .toList(); + } + + // Returns true if removing the word would disconnect the graph + // so that there is no path from 'begin' to 'end'. This is useful + // in breaking up the graph into smaller component subgraphs, and + // understanding which vertexes can be re-ordered without changing + // the semantics of the encoded query. 
+    public boolean isBypassed(QWord word, QWord begin, QWord end) {
+        assert word.isOriginal() : "Can only bypass original words";
+
+        Set<QWord> edge = new HashSet<>();
+        Set<QWord> visited = new HashSet<>();
+
+        edge.add(begin);
+
+        while (!edge.isEmpty()) {
+            Set<QWord> next = new HashSet<>();
+
+            for (var w : edge) {
+                // Skip the word we're trying to find a bypassing route for
+                if (w.ord() == word.ord())
+                    continue;
+
+                if (Objects.equals(w, end))
+                    return true;
+
+                next.addAll(getNext(w));
+            }
+
+            next.removeAll(visited);
+            visited.addAll(next);
+            edge = next;
+        }
+
+        return false;
+    }
+
+    /** Returns a set of all nodes that are between 'begin' and 'end' in the graph,
+     * including the terminal nodes. This is useful for breaking up the graph into
+     * smaller components that can be evaluated in any order.
+     *
+     * It is assumed that there is a path from 'begin' to 'end' in the graph, and no
+     * other paths that bypass 'end'.
+     *
+     * The nodes are returned in the order they are encountered in a breadth-first search.
+     */
+    public List<QWord> nodesBetween(QWord begin, QWord end) {
+        List<QWord> edge = new ArrayList<>();
+        List<QWord> visited = new ArrayList<>();
+
+        edge.add(begin);
+
+        while (!edge.isEmpty()) {
+            List<QWord> next = new ArrayList<>();
+
+            for (var w : edge) {
+                if (Objects.equals(w, end))
+                    continue;
+
+                assert (!w.isEnd() && end.isEnd()) : "Graph has a path beyond the specified end vertex";
+
+                next.addAll(getNext(w));
+            }
+
+            next.removeAll(visited);
+            visited.addAll(next);
+            edge = next;
+        }
+
+        return visited;
+    }
+
+    /** Returns a list of subgraphs that are connected on the path from
+     * 'begin' to 'end'. This is useful for breaking up the graph into
+     * smaller components that can be evaluated in any order.
+     *
+ * The subgraphs are specified by their predecessor and successor nodes, + * + */ + public List getSubgraphs(QWord begin, QWord end) { + // Short-circuit for the common and simple case + if (getNext(begin).equals(List.of(end))) + return List.of(new QWordGraphLink(begin, end)); + + List subgraphs = new ArrayList<>(); + + List points = nodesBetween(begin, end) + .stream() + .filter(w -> isBypassed(w, begin, end)) + .toList(); + + for (int i = 0; i < points.size() - 1; i++) { + var a = points.get(i); + var b = points.get(i+1); + + subgraphs.add(new QWordGraphLink(a, b)); + } + + return subgraphs; + } + + + @NotNull + @Override + public Iterator iterator() { + return new Iterator<>() { + QWord pos = QWord.beg(); + + @Override + public boolean hasNext() { + return !pos.isEnd(); + } + + @Override + public QWord next() { + pos = getNextOriginal(pos).get(0); + return pos; + } + }; + } +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/CombineDashes.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/CombineDashes.java new file mode 100644 index 00000000..c24defbe --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/CombineDashes.java @@ -0,0 +1,40 @@ +package nu.marginalia.functions.searchquery.query_parser.variant.strategy; + +import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord; +import nu.marginalia.functions.searchquery.query_parser.variant.VariantStrategy; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.regex.Pattern; + +/** Variant strategy that combines word that have dashes, as sometimes lawn-chair + * gets spelled lawnchair */ +public class CombineDashes implements VariantStrategy { + final Pattern dashBoundary = Pattern.compile("-"); + + public CombineDashes() { + } + + @Override + public Collection> constructVariants(List words) { + List asTokens2 = new ArrayList<>(); + boolean dash = false; + + for (var span : words) { + var matcher = dashBoundary.matcher(span.word); + if (matcher.find()) { + String combined = dashBoundary.matcher(span.word).replaceAll(""); + asTokens2.add(combined); + } + + asTokens2.add(span.word); + } + + if (dash) { + return List.of(asTokens2); + } + return Collections.emptyList(); + } +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/JoinTerms.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/JoinTerms.java new file mode 100644 index 00000000..d03a64d1 --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/JoinTerms.java @@ -0,0 +1,58 @@ +package nu.marginalia.functions.searchquery.query_parser.variant.strategy; + +import ca.rmen.porterstemmer.PorterStemmer; +import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord; +import nu.marginalia.functions.searchquery.query_parser.variant.VariantStrategy; +import nu.marginalia.term_frequency_dict.TermFrequencyDict; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +/** Variant strategy that merges tokens that are adjacent, where the combined token + * has a high term frequency. 
That way we match 'lawnchair' with 'lawn chair' */ +public class JoinTerms implements VariantStrategy { + private final TermFrequencyDict dict; + private final PorterStemmer ps; + + public JoinTerms(TermFrequencyDict dict, PorterStemmer ps) { + this.dict = dict; + this.ps = ps; + } + + @Override + public Collection> constructVariants(List span) { + List> ret = new ArrayList<>(); + + for (int i = 0; i < span.size()-1; i++) { + var a = span.get(i); + var b = span.get(i+1); + + var stemmed = ps.stemWord(a.word + b.word); + + double scoreCombo = dict.getTermFreqStemmed(stemmed); + + if (scoreCombo > 10000) { + List asTokens = new ArrayList<>(); + + for (int j = 0; j < i; j++) { + var word = span.get(j).word; + asTokens.add(word); + } + { + var word = a.word + b.word; + asTokens.add(word); + } + for (int j = i+2; j < span.size(); j++) { + var word = span.get(j).word; + asTokens.add(word); + } + + ret.add(asTokens); + } + + } + + return ret; + } +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/SplitWordNum.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/SplitWordNum.java new file mode 100644 index 00000000..ac79476b --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/SplitWordNum.java @@ -0,0 +1,65 @@ +package nu.marginalia.functions.searchquery.query_parser.variant.strategy; + +import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord; +import nu.marginalia.functions.searchquery.query_parser.variant.VariantStrategy; +import nu.marginalia.util.ngrams.NGramBloomFilter; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.regex.Pattern; + +/** Variant strategy that splits tokens at the boundary between a number and a word. 
+ */ +public class SplitWordNum implements VariantStrategy { + + + final Pattern numWordBoundary = Pattern.compile("[0-9][a-zA-Z]|[a-zA-Z][0-9]"); + private final NGramBloomFilter nGramBloomFilter; + + public SplitWordNum(NGramBloomFilter nGramBloomFilter) { + this.nGramBloomFilter = nGramBloomFilter; + } + + @Override + public Collection> constructVariants(List ls) { + List asTokens2 = new ArrayList<>(); + + boolean num = false; + + for (var span : ls) { + var wordMatcher = numWordBoundary.matcher(span.word); + var stemmedMatcher = numWordBoundary.matcher(span.stemmed); + + int ws = 0; + int ss = 0; + boolean didSplit = false; + while (wordMatcher.find(ws) && stemmedMatcher.find(ss)) { + ws = wordMatcher.start()+1; + ss = stemmedMatcher.start()+1; + if (nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "_")) + || nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "-"))) + { + String combined = splitAtNumBoundary(span.word, wordMatcher.start(), "_"); + asTokens2.add(combined); + didSplit = true; + num = true; + } + } + + if (!didSplit) { + asTokens2.add(span.word); + } + } + + if (num) { + return List.of(asTokens2); + } + return Collections.emptyList(); + } + + private String splitAtNumBoundary(String in, int splitPoint, String joiner) { + return in.substring(0, splitPoint+1) + joiner + in.substring(splitPoint+1); + } +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java index ac7ce2b2..9ac7c795 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java @@ -8,7 +8,6 @@ import nu.marginalia.api.searchquery.model.query.SearchSubquery; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.util.language.EnglishDictionary; import nu.marginalia.language.WordPatterns; -import nu.marginalia.util.ngrams.NGramBloomFilter; import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.query.ProcessedQuery; import nu.marginalia.functions.searchquery.query_parser.QueryParser; @@ -37,9 +36,8 @@ public class QueryFactory { @Inject public QueryFactory(LanguageModels lm, TermFrequencyDict dict, - EnglishDictionary englishDictionary, - NGramBloomFilter nGramBloomFilter) { - this.queryVariants = ThreadLocal.withInitial(() -> new QueryVariants(lm ,dict, nGramBloomFilter, englishDictionary)); + EnglishDictionary englishDictionary) { + this.queryVariants = ThreadLocal.withInitial(() -> new QueryVariants(lm ,dict, englishDictionary)); } @@ -79,7 +77,7 @@ public class QueryFactory { String domain = null; - var basicQuery = queryParser.parse(query); + List basicQuery = queryParser.parse(query); if (basicQuery.size() >= 12) { problems.add("Your search query is too long"); @@ -108,10 +106,9 @@ public class QueryFactory { for (var parts : queryPermutations) { QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(parts); - SearchSubquery subquery = termsAccumulator.createSubquery(); - domain = termsAccumulator.domain; + SearchSubquery subquery = termsAccumulator.createSubquery(); subqueries.add(subquery); } diff --git a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraphTest.java 
b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraphTest.java new file mode 100644 index 00000000..a88e4d63 --- /dev/null +++ b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraphTest.java @@ -0,0 +1,33 @@ +package nu.marginalia.functions.searchquery.query_parser.variant.model; + +import org.junit.jupiter.api.Test; + +class QWordGraphTest { + + @Test + public void testAddConstructor() { + QWordGraph graph = new QWordGraph("hello", "world"); + + System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); + System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); + graph.links().forEach(System.out::println); + System.out.println("--"); + graph.nodes().forEach(System.out::println); + System.out.println("--"); + graph.addVariant(graph.nodes().get(1), "sup"); + System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); + System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); + System.out.println("--"); + graph.links().forEach(System.out::println); + System.out.println("--"); + graph.nodes().forEach(System.out::println); + + graph.addVariantForSpan(graph.nodes().get(1), graph.nodes().get(2), "heyall"); + System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); + System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); + System.out.println("--"); + graph.links().forEach(System.out::println); + System.out.println("--"); + graph.nodes().forEach(System.out::println); + } +} \ No newline at end of file diff --git a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java index fe93a1f6..4020d6e0 100644 --- a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java +++ b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java @@ -32,8 +32,7 @@ public class QueryFactoryTest { queryFactory = new QueryFactory(lm, tfd, - new EnglishDictionary(tfd), - new NGramBloomFilter(lm) + new EnglishDictionary(tfd) ); } From afc047cd271f813a168ec1e8117f797818820468 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 18 Mar 2024 13:45:23 +0100 Subject: [PATCH 04/47] (control) GUI for exporting segmentation data from a wikipedia zim --- .../executor/client/ExecutorExportClient.java | 14 +++-- .../api/src/main/protobuf/executor-api.proto | 4 ++ code/execution/build.gradle | 1 + .../nu/marginalia/actor/ExecutorActor.java | 1 + .../actor/ExecutorActorControlService.java | 2 + .../task/ExportSegmentationModelActor.java | 55 +++++++++++++++++++ .../execution/ExecutorExportGrpcService.java | 23 ++++++-- .../node/svc/ControlNodeActionsService.java | 11 ++++ .../actions/partial-export-segmentation.hdb | 45 +++++++++++++++ .../templates/control/node/node-actions.hdb | 1 + .../control/node/partial-node-nav.hdb | 1 + 11 files changed, 150 insertions(+), 8 deletions(-) create mode 100644 code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java create mode 100644 code/services-core/control-service/resources/templates/control/node/actions/partial-export-segmentation.hdb diff --git a/code/execution/api/java/nu/marginalia/executor/client/ExecutorExportClient.java b/code/execution/api/java/nu/marginalia/executor/client/ExecutorExportClient.java index a3286a1b..e12fa0d3 100644 --- 
a/code/execution/api/java/nu/marginalia/executor/client/ExecutorExportClient.java +++ b/code/execution/api/java/nu/marginalia/executor/client/ExecutorExportClient.java @@ -2,10 +2,7 @@ package nu.marginalia.executor.client; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.functions.execution.api.Empty; -import nu.marginalia.functions.execution.api.ExecutorExportApiGrpc; -import nu.marginalia.functions.execution.api.RpcExportSampleData; -import nu.marginalia.functions.execution.api.RpcFileStorageId; +import nu.marginalia.functions.execution.api.*; import nu.marginalia.service.client.GrpcChannelPoolFactory; import nu.marginalia.service.client.GrpcMultiNodeChannelPool; import nu.marginalia.service.discovery.property.ServiceKey; @@ -55,6 +52,7 @@ public class ExecutorExportClient { .setFileStorageId(fid.id()) .build()); } + public void exportTermFrequencies(int node, FileStorageId fid) { channelPool.call(ExecutorExportApiBlockingStub::exportTermFrequencies) .forNode(node) @@ -69,6 +67,14 @@ public class ExecutorExportClient { .run(Empty.getDefaultInstance()); } + public void exportSegmentationModel(int node, String path) { + channelPool.call(ExecutorExportApiBlockingStub::exportSegmentationModel) + .forNode(node) + .run(RpcExportSegmentationModel + .newBuilder() + .setSourcePath(path) + .build()); + } } diff --git a/code/execution/api/src/main/protobuf/executor-api.proto b/code/execution/api/src/main/protobuf/executor-api.proto index 31cffe9b..565770ac 100644 --- a/code/execution/api/src/main/protobuf/executor-api.proto +++ b/code/execution/api/src/main/protobuf/executor-api.proto @@ -38,6 +38,7 @@ service ExecutorSideloadApi { service ExecutorExportApi { rpc exportAtags(RpcFileStorageId) returns (Empty) {} + rpc exportSegmentationModel(RpcExportSegmentationModel) returns (Empty) {} rpc exportSampleData(RpcExportSampleData) returns (Empty) {} rpc exportRssFeeds(RpcFileStorageId) returns (Empty) {} rpc exportTermFrequencies(RpcFileStorageId) returns (Empty) {} @@ -61,6 +62,9 @@ message RpcSideloadEncyclopedia { string sourcePath = 1; string baseUrl = 2; } +message RpcExportSegmentationModel { + string sourcePath = 1; +} message RpcSideloadDirtree { string sourcePath = 1; } diff --git a/code/execution/build.gradle b/code/execution/build.gradle index 8c8fd70b..fa455167 100644 --- a/code/execution/build.gradle +++ b/code/execution/build.gradle @@ -34,6 +34,7 @@ dependencies { implementation project(':code:libraries:message-queue') implementation project(':code:functions:link-graph:api') + implementation project(':code:functions:search-query') implementation project(':code:execution:api') implementation project(':code:process-models:crawl-spec') diff --git a/code/execution/java/nu/marginalia/actor/ExecutorActor.java b/code/execution/java/nu/marginalia/actor/ExecutorActor.java index ee7fb1d3..d04b3eaa 100644 --- a/code/execution/java/nu/marginalia/actor/ExecutorActor.java +++ b/code/execution/java/nu/marginalia/actor/ExecutorActor.java @@ -12,6 +12,7 @@ public enum ExecutorActor { ADJACENCY_CALCULATION, CRAWL_JOB_EXTRACTOR, EXPORT_DATA, + EXPORT_SEGMENTATION_MODEL, EXPORT_ATAGS, EXPORT_TERM_FREQUENCIES, EXPORT_FEEDS, diff --git a/code/execution/java/nu/marginalia/actor/ExecutorActorControlService.java b/code/execution/java/nu/marginalia/actor/ExecutorActorControlService.java index 53abdfe3..6f37d7ab 100644 --- a/code/execution/java/nu/marginalia/actor/ExecutorActorControlService.java +++ 
b/code/execution/java/nu/marginalia/actor/ExecutorActorControlService.java @@ -47,6 +47,7 @@ public class ExecutorActorControlService { ExportFeedsActor exportFeedsActor, ExportSampleDataActor exportSampleDataActor, ExportTermFreqActor exportTermFrequenciesActor, + ExportSegmentationModelActor exportSegmentationModelActor, DownloadSampleActor downloadSampleActor, ExecutorActorStateMachines stateMachines) { this.messageQueueFactory = messageQueueFactory; @@ -76,6 +77,7 @@ public class ExecutorActorControlService { register(ExecutorActor.EXPORT_FEEDS, exportFeedsActor); register(ExecutorActor.EXPORT_SAMPLE_DATA, exportSampleDataActor); register(ExecutorActor.EXPORT_TERM_FREQUENCIES, exportTermFrequenciesActor); + register(ExecutorActor.EXPORT_SEGMENTATION_MODEL, exportSegmentationModelActor); register(ExecutorActor.DOWNLOAD_SAMPLE, downloadSampleActor); } diff --git a/code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java b/code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java new file mode 100644 index 00000000..4cc4ca76 --- /dev/null +++ b/code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java @@ -0,0 +1,55 @@ +package nu.marginalia.actor.task; + +import com.google.gson.Gson; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.actor.prototype.RecordActorPrototype; +import nu.marginalia.actor.state.ActorStep; +import nu.marginalia.functions.searchquery.segmentation.NgramExtractorMain; +import nu.marginalia.storage.FileStorageService; +import nu.marginalia.storage.model.FileStorageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.nio.file.Path; +import java.time.LocalDateTime; + +@Singleton +public class ExportSegmentationModelActor extends RecordActorPrototype { + + private final FileStorageService storageService; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + public record Export(String zimFile) implements ActorStep {} + @Override + public ActorStep transition(ActorStep self) throws Exception { + return switch(self) { + case Export(String zimFile) -> { + + var storage = storageService.allocateStorage(FileStorageType.EXPORT, "segmentation-model", "Segmentation Model Export " + LocalDateTime.now()); + + Path countsFile = storage.asPath().resolve("ngram-counts.bin"); + Path permutationsFile = storage.asPath().resolve("ngram-permutations.bin"); + + NgramExtractorMain.dumpCounts(Path.of(zimFile), countsFile, permutationsFile); + + yield new End(); + } + default -> new Error(); + }; + } + + @Override + public String describe() { + return "Generate a query segmentation model from a ZIM file."; + } + + @Inject + public ExportSegmentationModelActor(Gson gson, + FileStorageService storageService) + { + super(gson); + this.storageService = storageService; + } + +} diff --git a/code/execution/java/nu/marginalia/execution/ExecutorExportGrpcService.java b/code/execution/java/nu/marginalia/execution/ExecutorExportGrpcService.java index 41c8bb8b..68ad426a 100644 --- a/code/execution/java/nu/marginalia/execution/ExecutorExportGrpcService.java +++ b/code/execution/java/nu/marginalia/execution/ExecutorExportGrpcService.java @@ -6,12 +6,11 @@ import io.grpc.stub.StreamObserver; import nu.marginalia.actor.ExecutorActor; import nu.marginalia.actor.ExecutorActorControlService; import nu.marginalia.actor.task.*; -import nu.marginalia.functions.execution.api.Empty; -import nu.marginalia.functions.execution.api.ExecutorExportApiGrpc; -import 
nu.marginalia.functions.execution.api.RpcExportSampleData; -import nu.marginalia.functions.execution.api.RpcFileStorageId; +import nu.marginalia.functions.execution.api.*; import nu.marginalia.storage.model.FileStorageId; +import java.nio.file.Path; + @Singleton public class ExecutorExportGrpcService extends ExecutorExportApiGrpc.ExecutorExportApiImplBase { private final ExecutorActorControlService actorControlService; @@ -92,4 +91,20 @@ public class ExecutorExportGrpcService extends ExecutorExportApiGrpc.ExecutorExp responseObserver.onError(e); } } + + @Override + public void exportSegmentationModel(RpcExportSegmentationModel request, StreamObserver responseObserver) { + try { + actorControlService.startFrom(ExecutorActor.EXPORT_SEGMENTATION_MODEL, + new ExportSegmentationModelActor.Export(request.getSourcePath()) + ); + + responseObserver.onNext(Empty.getDefaultInstance()); + responseObserver.onCompleted(); + } + catch (Exception e) { + responseObserver.onError(e); + } + } + } diff --git a/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java b/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java index 2ae09234..b711be14 100644 --- a/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java +++ b/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java @@ -76,6 +76,9 @@ public class ControlNodeActionsService { Spark.post("/public/nodes/:node/actions/sideload-stackexchange", this::sideloadStackexchange, redirectControl.renderRedirectAcknowledgement("Sideloading", "..") ); + Spark.post("/public/nodes/:node/actions/export-segmentation", this::exportSegmentationModel, + redirectControl.renderRedirectAcknowledgement("Exporting", "..") + ); Spark.post("/public/nodes/:node/actions/download-sample-data", this::downloadSampleData, redirectControl.renderRedirectAcknowledgement("Downloading", "..") ); @@ -307,6 +310,14 @@ public class ControlNodeActionsService { return ""; } + private Object exportSegmentationModel(Request req, Response rsp) { + exportClient.exportSegmentationModel( + Integer.parseInt(req.params("node")), + req.queryParams("source")); + + return ""; + } + private Object exportFromCrawlData(Request req, Response rsp) { String exportType = req.queryParams("exportType"); FileStorageId source = parseSourceFileStorageId(req.queryParams("source")); diff --git a/code/services-core/control-service/resources/templates/control/node/actions/partial-export-segmentation.hdb b/code/services-core/control-service/resources/templates/control/node/actions/partial-export-segmentation.hdb new file mode 100644 index 00000000..2ef9b180 --- /dev/null +++ b/code/services-core/control-service/resources/templates/control/node/actions/partial-export-segmentation.hdb @@ -0,0 +1,45 @@ +

Export segmentation model

+ +
+

This will generate a query segmentation model from a wikipedia ZIM file. A query segmentation model +is used to break a search query into segments corresponding to different concepts. For example, the query +"slackware linux package manager" would be segmented into "slackware linux" and "package manager", and the +search would then put more emphasis on "package" and "manager" appearing in the same part of the document +than on "linux" and "manager". +
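As a minimal sketch of that idea (not the actual NgramLexicon implementation; the ToySegmenter class and its hard-coded KNOWN set below are illustrative assumptions), a segmenter only needs a lookup of known word sequences and a single pass over the query, joining adjacent words that form a known segment:

import java.util.ArrayList;
import java.util.List;
import java.util.Set;

/** Minimal sketch only: greedily joins adjacent words that form a known
 *  segment, leaving everything else as single words. */
class ToySegmenter {
    // Hypothetical stand-in for a lexicon exported from wikipedia titles
    static final Set<String> KNOWN = Set.of("slackware_linux", "package_manager");

    static List<String> segment(String[] words) {
        List<String> out = new ArrayList<>();
        for (int i = 0; i < words.length; ) {
            if (i + 1 < words.length && KNOWN.contains(words[i] + "_" + words[i + 1])) {
                out.add(words[i] + "_" + words[i + 1]); // keep the known pair together
                i += 2;
            } else {
                out.add(words[i]); // fall back to the single word
                i++;
            }
        }
        return out;
    }

    public static void main(String... args) {
        System.out.println(segment(new String[] {"slackware", "linux", "package", "manager"}));
        // prints [slackware_linux, package_manager]
    }
}

Running main on the example query above keeps "slackware linux" and "package manager" together as single units, which is what the exported model is used for at query time.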

+
+
+
+ + + {{#each uploadDirContents.items}} + + + + + + + {{/each}} + {{#unless uploadDirContents.items}} + + + + {{/unless}} +
Filename  Size  Last Modified
+ + {{#unless directory}}{{size}}{{/unless}}{{shortTimestamp lastModifiedTime}}
Nothing found in upload directory
+ +

+ + The upload directory is typically mounted to /uploads on the server. The external + directory is typically something like index-{{node.id}}/uploads. + +

+ +
+
+ +
+
+
+
\ No newline at end of file diff --git a/code/services-core/control-service/resources/templates/control/node/node-actions.hdb b/code/services-core/control-service/resources/templates/control/node/node-actions.hdb index df8ed77f..7de90949 100644 --- a/code/services-core/control-service/resources/templates/control/node/node-actions.hdb +++ b/code/services-core/control-service/resources/templates/control/node/node-actions.hdb @@ -20,6 +20,7 @@ {{#if view.sideload-warc}} {{> control/node/actions/partial-sideload-warc }} {{/if}} {{#if view.sideload-dirtree}} {{> control/node/actions/partial-sideload-dirtree }} {{/if}} {{#if view.sideload-reddit}} {{> control/node/actions/partial-sideload-reddit }} {{/if}} + {{#if view.export-segmentation}} {{> control/node/actions/partial-export-segmentation }} {{/if}} {{#if view.export-db-data}} {{> control/node/actions/partial-export-db-data }} {{/if}} {{#if view.export-from-crawl-data}} {{> control/node/actions/partial-export-from-crawl-data }} {{/if}} {{#if view.export-sample-data}} {{> control/node/actions/partial-export-sample-data }} {{/if}} diff --git a/code/services-core/control-service/resources/templates/control/node/partial-node-nav.hdb b/code/services-core/control-service/resources/templates/control/node/partial-node-nav.hdb index 23627155..ff16507d 100644 --- a/code/services-core/control-service/resources/templates/control/node/partial-node-nav.hdb +++ b/code/services-core/control-service/resources/templates/control/node/partial-node-nav.hdb @@ -30,6 +30,7 @@
  • Export Database Data
  • Export Sample Crawl Data
  • Export From Crawl Data
  • +
  • Export Segmentation Model
  • Restore Index Backup
  • From d8f4e7d72b912eb9ee2086eb230718500d63524f Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 19 Mar 2024 10:33:29 +0100 Subject: [PATCH 05/47] (qs) Retire NGramBloomFilter, integrate new segmentation model instead --- .../java/nu/marginalia/LanguageModels.java | 10 +-- .../config/java/nu/marginalia/WmsaHome.java | 5 +- .../execution/ExecutorExportGrpcService.java | 2 - .../marginalia/util/TestLanguageModels.java | 4 +- .../test/util/TestLanguageModels.java | 4 +- .../query_parser/variant/QueryExpansion.java | 6 +- .../variant/strategy/CombineDashes.java | 40 ----------- .../variant/strategy/JoinTerms.java | 58 ---------------- .../variant/strategy/SplitWordNum.java | 65 ----------------- .../segmentation/NgramLexicon.java | 15 ++++ .../marginalia/util/ngrams/DenseBitMap.java | 69 ------------------- .../util/ngrams/NGramBloomFilter.java | 64 ----------------- .../query/svc/QueryFactoryTest.java | 1 - .../language/filter/TestLanguageModels.java | 4 +- .../converting/util/TestLanguageModels.java | 4 +- .../marginalia/util/TestLanguageModels.java | 4 +- run/setup.sh | 2 +- 17 files changed, 39 insertions(+), 318 deletions(-) delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/CombineDashes.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/JoinTerms.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/SplitWordNum.java delete mode 100644 code/functions/search-query/java/nu/marginalia/util/ngrams/DenseBitMap.java delete mode 100644 code/functions/search-query/java/nu/marginalia/util/ngrams/NGramBloomFilter.java diff --git a/code/common/config/java/nu/marginalia/LanguageModels.java b/code/common/config/java/nu/marginalia/LanguageModels.java index 04ab0aa0..ca7fde45 100644 --- a/code/common/config/java/nu/marginalia/LanguageModels.java +++ b/code/common/config/java/nu/marginalia/LanguageModels.java @@ -3,7 +3,6 @@ package nu.marginalia; import java.nio.file.Path; public class LanguageModels { - public final Path ngramBloomFilter; public final Path termFrequencies; public final Path openNLPSentenceDetectionData; @@ -11,20 +10,21 @@ public class LanguageModels { public final Path posDict; public final Path openNLPTokenData; public final Path fasttextLanguageModel; + public final Path segments; - public LanguageModels(Path ngramBloomFilter, - Path termFrequencies, + public LanguageModels(Path termFrequencies, Path openNLPSentenceDetectionData, Path posRules, Path posDict, Path openNLPTokenData, - Path fasttextLanguageModel) { - this.ngramBloomFilter = ngramBloomFilter; + Path fasttextLanguageModel, + Path segments) { this.termFrequencies = termFrequencies; this.openNLPSentenceDetectionData = openNLPSentenceDetectionData; this.posRules = posRules; this.posDict = posDict; this.openNLPTokenData = openNLPTokenData; this.fasttextLanguageModel = fasttextLanguageModel; + this.segments = segments; } } diff --git a/code/common/config/java/nu/marginalia/WmsaHome.java b/code/common/config/java/nu/marginalia/WmsaHome.java index b61ee4dd..b5378afc 100644 --- a/code/common/config/java/nu/marginalia/WmsaHome.java +++ b/code/common/config/java/nu/marginalia/WmsaHome.java @@ -85,13 +85,14 @@ public class WmsaHome { final Path home = getHomePath(); return new LanguageModels( - home.resolve("model/ngrams.bin"), home.resolve("model/tfreq-new-algo3.bin"), home.resolve("model/opennlp-sentence.bin"), 
home.resolve("model/English.RDR"), home.resolve("model/English.DICT"), home.resolve("model/opennlp-tok.bin"), - home.resolve("model/lid.176.ftz")); + home.resolve("model/lid.176.ftz"), + home.resolve("model/segments.bin") + ); } public static Path getAtagsPath() { diff --git a/code/execution/java/nu/marginalia/execution/ExecutorExportGrpcService.java b/code/execution/java/nu/marginalia/execution/ExecutorExportGrpcService.java index 68ad426a..3c5a8d5b 100644 --- a/code/execution/java/nu/marginalia/execution/ExecutorExportGrpcService.java +++ b/code/execution/java/nu/marginalia/execution/ExecutorExportGrpcService.java @@ -9,8 +9,6 @@ import nu.marginalia.actor.task.*; import nu.marginalia.functions.execution.api.*; import nu.marginalia.storage.model.FileStorageId; -import java.nio.file.Path; - @Singleton public class ExecutorExportGrpcService extends ExecutorExportApiGrpc.ExecutorExportApiImplBase { private final ExecutorActorControlService actorControlService; diff --git a/code/features-convert/anchor-keywords/test/nu/marginalia/util/TestLanguageModels.java b/code/features-convert/anchor-keywords/test/nu/marginalia/util/TestLanguageModels.java index 5efd2025..a4cc012b 100644 --- a/code/features-convert/anchor-keywords/test/nu/marginalia/util/TestLanguageModels.java +++ b/code/features-convert/anchor-keywords/test/nu/marginalia/util/TestLanguageModels.java @@ -26,13 +26,13 @@ public class TestLanguageModels { var languageModelsHome = getLanguageModelsPath(); return new LanguageModels( - languageModelsHome.resolve("ngrams.bin"), languageModelsHome.resolve("tfreq-new-algo3.bin"), languageModelsHome.resolve("opennlp-sentence.bin"), languageModelsHome.resolve("English.RDR"), languageModelsHome.resolve("English.DICT"), languageModelsHome.resolve("opennlp-tokens.bin"), - languageModelsHome.resolve("lid.176.ftz") + languageModelsHome.resolve("lid.176.ftz"), + languageModelsHome.resolve("segments.bin") ); } } diff --git a/code/features-convert/keyword-extraction/test/nu/marginalia/test/util/TestLanguageModels.java b/code/features-convert/keyword-extraction/test/nu/marginalia/test/util/TestLanguageModels.java index 0675559a..d857c048 100644 --- a/code/features-convert/keyword-extraction/test/nu/marginalia/test/util/TestLanguageModels.java +++ b/code/features-convert/keyword-extraction/test/nu/marginalia/test/util/TestLanguageModels.java @@ -26,13 +26,13 @@ public class TestLanguageModels { var languageModelsHome = getLanguageModelsPath(); return new LanguageModels( - languageModelsHome.resolve("ngrams.bin"), languageModelsHome.resolve("tfreq-new-algo3.bin"), languageModelsHome.resolve("opennlp-sentence.bin"), languageModelsHome.resolve("English.RDR"), languageModelsHome.resolve("English.DICT"), languageModelsHome.resolve("opennlp-tokens.bin"), - languageModelsHome.resolve("lid.176.ftz") + languageModelsHome.resolve("lid.176.ftz"), + languageModelsHome.resolve("segments.bin") ); } } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java index faac81d4..eac2988d 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java @@ -1,6 +1,7 @@ package nu.marginalia.functions.searchquery.query_parser.variant; import ca.rmen.porterstemmer.PorterStemmer; 
+import com.google.inject.Inject; import nu.marginalia.functions.searchquery.query_parser.variant.model.QWord; import nu.marginalia.functions.searchquery.query_parser.variant.model.QWordGraph; import nu.marginalia.functions.searchquery.segmentation.NgramLexicon; @@ -15,13 +16,15 @@ public class QueryExpansion { private static final PorterStemmer ps = new PorterStemmer(); private final TermFrequencyDict dict; private final NgramLexicon lexicon; - List expansionStrategies = List.of( + + private final List expansionStrategies = List.of( this::joinDashes, this::splitWordNum, this::joinTerms, this::createSegments ); + @Inject public QueryExpansion(TermFrequencyDict dict, NgramLexicon lexicon ) { @@ -97,6 +100,7 @@ public class QueryExpansion { String[] words = nodes.stream().map(QWord::word).toArray(String[]::new); + // Look for known segments within the query for (int length = 2; length < Math.min(10, words.length); length++) { for (var segment : lexicon.findSegments(length, words)) { int start = segment.start(); diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/CombineDashes.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/CombineDashes.java deleted file mode 100644 index c24defbe..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/CombineDashes.java +++ /dev/null @@ -1,40 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.variant.strategy; - -import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord; -import nu.marginalia.functions.searchquery.query_parser.variant.VariantStrategy; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.regex.Pattern; - -/** Variant strategy that combines word that have dashes, as sometimes lawn-chair - * gets spelled lawnchair */ -public class CombineDashes implements VariantStrategy { - final Pattern dashBoundary = Pattern.compile("-"); - - public CombineDashes() { - } - - @Override - public Collection> constructVariants(List words) { - List asTokens2 = new ArrayList<>(); - boolean dash = false; - - for (var span : words) { - var matcher = dashBoundary.matcher(span.word); - if (matcher.find()) { - String combined = dashBoundary.matcher(span.word).replaceAll(""); - asTokens2.add(combined); - } - - asTokens2.add(span.word); - } - - if (dash) { - return List.of(asTokens2); - } - return Collections.emptyList(); - } -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/JoinTerms.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/JoinTerms.java deleted file mode 100644 index d03a64d1..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/JoinTerms.java +++ /dev/null @@ -1,58 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.variant.strategy; - -import ca.rmen.porterstemmer.PorterStemmer; -import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord; -import nu.marginalia.functions.searchquery.query_parser.variant.VariantStrategy; -import nu.marginalia.term_frequency_dict.TermFrequencyDict; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; - -/** Variant strategy that merges tokens that are adjacent, where the combined token - * has a high 
term frequency. That way we match 'lawnchair' with 'lawn chair' */ -public class JoinTerms implements VariantStrategy { - private final TermFrequencyDict dict; - private final PorterStemmer ps; - - public JoinTerms(TermFrequencyDict dict, PorterStemmer ps) { - this.dict = dict; - this.ps = ps; - } - - @Override - public Collection> constructVariants(List span) { - List> ret = new ArrayList<>(); - - for (int i = 0; i < span.size()-1; i++) { - var a = span.get(i); - var b = span.get(i+1); - - var stemmed = ps.stemWord(a.word + b.word); - - double scoreCombo = dict.getTermFreqStemmed(stemmed); - - if (scoreCombo > 10000) { - List asTokens = new ArrayList<>(); - - for (int j = 0; j < i; j++) { - var word = span.get(j).word; - asTokens.add(word); - } - { - var word = a.word + b.word; - asTokens.add(word); - } - for (int j = i+2; j < span.size(); j++) { - var word = span.get(j).word; - asTokens.add(word); - } - - ret.add(asTokens); - } - - } - - return ret; - } -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/SplitWordNum.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/SplitWordNum.java deleted file mode 100644 index ac79476b..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/SplitWordNum.java +++ /dev/null @@ -1,65 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.variant.strategy; - -import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord; -import nu.marginalia.functions.searchquery.query_parser.variant.VariantStrategy; -import nu.marginalia.util.ngrams.NGramBloomFilter; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.regex.Pattern; - -/** Variant strategy that splits tokens at the boundary between a number and a word. 
- */ -public class SplitWordNum implements VariantStrategy { - - - final Pattern numWordBoundary = Pattern.compile("[0-9][a-zA-Z]|[a-zA-Z][0-9]"); - private final NGramBloomFilter nGramBloomFilter; - - public SplitWordNum(NGramBloomFilter nGramBloomFilter) { - this.nGramBloomFilter = nGramBloomFilter; - } - - @Override - public Collection> constructVariants(List ls) { - List asTokens2 = new ArrayList<>(); - - boolean num = false; - - for (var span : ls) { - var wordMatcher = numWordBoundary.matcher(span.word); - var stemmedMatcher = numWordBoundary.matcher(span.stemmed); - - int ws = 0; - int ss = 0; - boolean didSplit = false; - while (wordMatcher.find(ws) && stemmedMatcher.find(ss)) { - ws = wordMatcher.start()+1; - ss = stemmedMatcher.start()+1; - if (nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "_")) - || nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "-"))) - { - String combined = splitAtNumBoundary(span.word, wordMatcher.start(), "_"); - asTokens2.add(combined); - didSplit = true; - num = true; - } - } - - if (!didSplit) { - asTokens2.add(span.word); - } - } - - if (num) { - return List.of(asTokens2); - } - return Collections.emptyList(); - } - - private String splitAtNumBoundary(String in, int splitPoint, String joiner) { - return in.substring(0, splitPoint+1) + joiner + in.substring(splitPoint+1); - } -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java index f8044e12..c4fe69e2 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java @@ -1,8 +1,10 @@ package nu.marginalia.functions.searchquery.segmentation; +import com.google.inject.Inject; import it.unimi.dsi.fastutil.longs.Long2IntOpenCustomHashMap; import it.unimi.dsi.fastutil.longs.LongHash; import it.unimi.dsi.fastutil.longs.LongOpenHashSet; +import nu.marginalia.LanguageModels; import java.io.DataInputStream; import java.io.DataOutputStream; @@ -24,6 +26,19 @@ public class NgramLexicon { private static final HasherGroup orderedHasher = HasherGroup.ordered(); private static final HasherGroup unorderedHasher = HasherGroup.unordered(); + @Inject + public NgramLexicon(LanguageModels models) { + try { + loadCounts(models.segments); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public NgramLexicon() { + + } + public List findSegments(int length, String... parts) { // Don't look for ngrams longer than the sentence if (parts.length < length) return List.of(); diff --git a/code/functions/search-query/java/nu/marginalia/util/ngrams/DenseBitMap.java b/code/functions/search-query/java/nu/marginalia/util/ngrams/DenseBitMap.java deleted file mode 100644 index 008b17b3..00000000 --- a/code/functions/search-query/java/nu/marginalia/util/ngrams/DenseBitMap.java +++ /dev/null @@ -1,69 +0,0 @@ -package nu.marginalia.util.ngrams; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.StandardOpenOption; -import java.util.BitSet; - -// It's unclear why this exists, we should probably use a BitSet instead? -// Chesterton's fence? 
-public class DenseBitMap { - public static final long MAX_CAPACITY_2GB_16BN_ITEMS=(1L<<34)-8; - - public final long cardinality; - private final ByteBuffer buffer; - - public DenseBitMap(long cardinality) { - this.cardinality = cardinality; - - boolean misaligned = (cardinality & 7) > 0; - this.buffer = ByteBuffer.allocateDirect((int)((cardinality / 8) + (misaligned ? 1 : 0))); - } - - public static DenseBitMap loadFromFile(Path file) throws IOException { - long size = Files.size(file); - var dbm = new DenseBitMap(size/8); - - try (var bc = Files.newByteChannel(file)) { - while (dbm.buffer.position() < dbm.buffer.capacity()) { - bc.read(dbm.buffer); - } - } - dbm.buffer.clear(); - - return dbm; - } - - public void writeToFile(Path file) throws IOException { - - try (var bc = Files.newByteChannel(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) { - while (buffer.position() < buffer.capacity()) { - bc.write(buffer); - } - } - - buffer.clear(); - } - - public boolean get(long pos) { - return (buffer.get((int)(pos >>> 3)) & ((byte)1 << (int)(pos & 7))) != 0; - } - - /** Set the bit indexed by pos, returns - * its previous value. - */ - public boolean set(long pos) { - int offset = (int) (pos >>> 3); - int oldVal = buffer.get(offset); - int mask = (byte) 1 << (int) (pos & 7); - buffer.put(offset, (byte) (oldVal | mask)); - return (oldVal & mask) != 0; - } - - public void clear(long pos) { - int offset = (int)(pos >>> 3); - buffer.put(offset, (byte)(buffer.get(offset) & ~(byte)(1 << (int)(pos & 7)))); - } -} diff --git a/code/functions/search-query/java/nu/marginalia/util/ngrams/NGramBloomFilter.java b/code/functions/search-query/java/nu/marginalia/util/ngrams/NGramBloomFilter.java deleted file mode 100644 index 3326956d..00000000 --- a/code/functions/search-query/java/nu/marginalia/util/ngrams/NGramBloomFilter.java +++ /dev/null @@ -1,64 +0,0 @@ -package nu.marginalia.util.ngrams; - -import ca.rmen.porterstemmer.PorterStemmer; -import com.google.common.hash.HashFunction; -import com.google.common.hash.Hashing; -import com.google.inject.Inject; -import nu.marginalia.LanguageModels; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.regex.Pattern; - -public class NGramBloomFilter { - private final DenseBitMap bitMap; - private static final PorterStemmer ps = new PorterStemmer(); - private static final HashFunction hasher = Hashing.murmur3_128(0); - - private static final Logger logger = LoggerFactory.getLogger(NGramBloomFilter.class); - - @Inject - public NGramBloomFilter(LanguageModels lm) throws IOException { - this(loadSafely(lm.ngramBloomFilter)); - } - - private static DenseBitMap loadSafely(Path path) throws IOException { - if (Files.isRegularFile(path)) { - return DenseBitMap.loadFromFile(path); - } - else { - logger.warn("NGrams file missing " + path); - return new DenseBitMap(1); - } - } - - public NGramBloomFilter(DenseBitMap bitMap) { - this.bitMap = bitMap; - } - - public boolean isKnownNGram(String word) { - long bit = bitForWord(word, bitMap.cardinality); - - return bitMap.get(bit); - } - - public static NGramBloomFilter load(Path file) throws IOException { - return new NGramBloomFilter(DenseBitMap.loadFromFile(file)); - } - - private static final Pattern underscore = Pattern.compile("_"); - - private static long bitForWord(String s, long n) { - String[] parts = underscore.split(s); - long hc = 0; - for (String part 
: parts) { - hc = hc * 31 + hasher.hashString(ps.stemWord(part), StandardCharsets.UTF_8).padToLong(); - } - return (hc & 0x7FFF_FFFF_FFFF_FFFFL) % n; - } - -} diff --git a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java index 4020d6e0..24131143 100644 --- a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java +++ b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java @@ -9,7 +9,6 @@ import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.SpecificationLimit; import nu.marginalia.index.query.limit.SpecificationLimitType; import nu.marginalia.util.language.EnglishDictionary; -import nu.marginalia.util.ngrams.NGramBloomFilter; import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.junit.jupiter.api.BeforeAll; diff --git a/code/libraries/language-processing/test/nu/marginalia/language/filter/TestLanguageModels.java b/code/libraries/language-processing/test/nu/marginalia/language/filter/TestLanguageModels.java index 2b7bf0e2..cb31942a 100644 --- a/code/libraries/language-processing/test/nu/marginalia/language/filter/TestLanguageModels.java +++ b/code/libraries/language-processing/test/nu/marginalia/language/filter/TestLanguageModels.java @@ -26,13 +26,13 @@ public class TestLanguageModels { var languageModelsHome = getLanguageModelsPath(); return new LanguageModels( - languageModelsHome.resolve("ngrams.bin"), languageModelsHome.resolve("tfreq-new-algo3.bin"), languageModelsHome.resolve("opennlp-sentence.bin"), languageModelsHome.resolve("English.RDR"), languageModelsHome.resolve("English.DICT"), languageModelsHome.resolve("opennlp-tokens.bin"), - languageModelsHome.resolve("lid.176.ftz") + languageModelsHome.resolve("lid.176.ftz"), + languageModelsHome.resolve("segments.bin") ); } } diff --git a/code/processes/converting-process/test/nu/marginalia/converting/util/TestLanguageModels.java b/code/processes/converting-process/test/nu/marginalia/converting/util/TestLanguageModels.java index 4ad1e430..f28e1348 100644 --- a/code/processes/converting-process/test/nu/marginalia/converting/util/TestLanguageModels.java +++ b/code/processes/converting-process/test/nu/marginalia/converting/util/TestLanguageModels.java @@ -26,13 +26,13 @@ public class TestLanguageModels { var languageModelsHome = getLanguageModelsPath(); return new LanguageModels( - languageModelsHome.resolve("ngrams.bin"), languageModelsHome.resolve("tfreq-new-algo3.bin"), languageModelsHome.resolve("opennlp-sentence.bin"), languageModelsHome.resolve("English.RDR"), languageModelsHome.resolve("English.DICT"), languageModelsHome.resolve("opennlp-tokens.bin"), - languageModelsHome.resolve("lid.176.ftz") + languageModelsHome.resolve("lid.176.ftz"), + languageModelsHome.resolve("segments.bin") ); } } diff --git a/code/services-application/search-service/test/nu/marginalia/util/TestLanguageModels.java b/code/services-application/search-service/test/nu/marginalia/util/TestLanguageModels.java index 5efd2025..a4cc012b 100644 --- a/code/services-application/search-service/test/nu/marginalia/util/TestLanguageModels.java +++ b/code/services-application/search-service/test/nu/marginalia/util/TestLanguageModels.java @@ -26,13 +26,13 @@ public class TestLanguageModels { var languageModelsHome = getLanguageModelsPath(); return new LanguageModels( - 
languageModelsHome.resolve("ngrams.bin"), languageModelsHome.resolve("tfreq-new-algo3.bin"), languageModelsHome.resolve("opennlp-sentence.bin"), languageModelsHome.resolve("English.RDR"), languageModelsHome.resolve("English.DICT"), languageModelsHome.resolve("opennlp-tokens.bin"), - languageModelsHome.resolve("lid.176.ftz") + languageModelsHome.resolve("lid.176.ftz"), + languageModelsHome.resolve("segments.bin") ); } } diff --git a/run/setup.sh b/run/setup.sh index 3d9c5f54..3cacca75 100755 --- a/run/setup.sh +++ b/run/setup.sh @@ -26,7 +26,7 @@ download_model model/English.DICT https://raw.githubusercontent.com/datquocnguye download_model model/English.RDR https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.RDR download_model model/opennlp-sentence.bin https://mirrors.estointernet.in/apache/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin download_model model/opennlp-tokens.bin https://mirrors.estointernet.in/apache/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin -download_model model/ngrams.bin https://downloads.marginalia.nu/model/ngrams.bin +download_model model/segments.bin https://downloads.marginalia.nu/model/segments.bin download_model model/tfreq-new-algo3.bin https://downloads.marginalia.nu/model/tfreq-new-algo3.bin download_model model/lid.176.ftz https://downloads.marginalia.nu/model/lid.176.ftz From 0bd3365c2461483dfed5297f86d2548cfdd3060f Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 19 Mar 2024 14:28:42 +0100 Subject: [PATCH 06/47] (convert) Initial integration of segmentation data into the converter's keyword extraction logic --- .../java/nu/marginalia/LanguageModels.java | 3 + code/execution/build.gradle | 1 + .../task/ExportSegmentationModelActor.java | 2 +- .../anchor-keywords/build.gradle | 1 + .../atags/DomainAnchorTagsImplTest.java | 1 + .../keyword/DocumentKeywordExtractor.java | 16 ++++- .../extractors/KeywordPositionBitmask.java | 12 +++- .../keyword/DocumentKeywordExtractorTest.java | 25 ++++++- .../keyword/SentenceExtractorTest.java | 10 +-- .../summary/SummaryExtractorTest.java | 5 +- .../query_parser/variant/QueryExpansion.java | 2 +- .../language/model/DocumentSentence.java | 17 ++++- .../language/sentence/SentenceExtractor.java | 68 +++++++++++++++++-- .../term-frequency-dict/build.gradle | 2 + .../segmentation/BasicSentenceExtractor.java | 2 +- .../marginalia}/segmentation/HasherGroup.java | 4 +- .../segmentation/NgramExporterMain.java | 14 ++-- .../segmentation/NgramExtractorMain.java | 2 +- .../segmentation/NgramLexicon.java | 43 +++++++----- .../segmentation/HasherGroupTest.java | 3 +- .../segmentation/NgramLexiconTest.java | 2 +- .../SentenceStatisticsExperiment.java | 5 +- 22 files changed, 192 insertions(+), 48 deletions(-) rename code/{functions/search-query/java/nu/marginalia/functions/searchquery => libraries/term-frequency-dict/java/nu/marginalia}/segmentation/BasicSentenceExtractor.java (88%) rename code/{functions/search-query/java/nu/marginalia/functions/searchquery => libraries/term-frequency-dict/java/nu/marginalia}/segmentation/HasherGroup.java (95%) rename code/{functions/search-query/java/nu/marginalia/functions/searchquery => libraries/term-frequency-dict/java/nu/marginalia}/segmentation/NgramExporterMain.java (72%) rename code/{functions/search-query/java/nu/marginalia/functions/searchquery => libraries/term-frequency-dict/java/nu/marginalia}/segmentation/NgramExtractorMain.java (98%) rename 
code/{functions/search-query/java/nu/marginalia/functions/searchquery => libraries/term-frequency-dict/java/nu/marginalia}/segmentation/NgramLexicon.java (85%) rename code/{functions/search-query/test/nu/marginalia/functions/searchquery => libraries/term-frequency-dict/test/nu/marginalia}/segmentation/HasherGroupTest.java (89%) rename code/{functions/search-query/test/nu/marginalia/functions/searchquery => libraries/term-frequency-dict/test/nu/marginalia}/segmentation/NgramLexiconTest.java (96%) diff --git a/code/common/config/java/nu/marginalia/LanguageModels.java b/code/common/config/java/nu/marginalia/LanguageModels.java index ca7fde45..d1854963 100644 --- a/code/common/config/java/nu/marginalia/LanguageModels.java +++ b/code/common/config/java/nu/marginalia/LanguageModels.java @@ -1,7 +1,10 @@ package nu.marginalia; +import lombok.Builder; + import java.nio.file.Path; +@Builder public class LanguageModels { public final Path termFrequencies; diff --git a/code/execution/build.gradle b/code/execution/build.gradle index fa455167..57de7320 100644 --- a/code/execution/build.gradle +++ b/code/execution/build.gradle @@ -32,6 +32,7 @@ dependencies { implementation project(':third-party:commons-codec') implementation project(':code:libraries:message-queue') + implementation project(':code:libraries:term-frequency-dict') implementation project(':code:functions:link-graph:api') implementation project(':code:functions:search-query') diff --git a/code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java b/code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java index 4cc4ca76..90baf009 100644 --- a/code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java +++ b/code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java @@ -5,7 +5,7 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.actor.prototype.RecordActorPrototype; import nu.marginalia.actor.state.ActorStep; -import nu.marginalia.functions.searchquery.segmentation.NgramExtractorMain; +import nu.marginalia.segmentation.NgramExtractorMain; import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.model.FileStorageType; import org.slf4j.Logger; diff --git a/code/features-convert/anchor-keywords/build.gradle b/code/features-convert/anchor-keywords/build.gradle index 23e415b9..3541b5ec 100644 --- a/code/features-convert/anchor-keywords/build.gradle +++ b/code/features-convert/anchor-keywords/build.gradle @@ -19,6 +19,7 @@ dependencies { implementation project(':code:common:process') implementation project(':code:features-convert:keyword-extraction') implementation project(':code:libraries:language-processing') + implementation project(':code:libraries:term-frequency-dict') implementation libs.bundles.slf4j diff --git a/code/features-convert/anchor-keywords/test/nu/marginalia/atags/DomainAnchorTagsImplTest.java b/code/features-convert/anchor-keywords/test/nu/marginalia/atags/DomainAnchorTagsImplTest.java index ee555ca5..17443c51 100644 --- a/code/features-convert/anchor-keywords/test/nu/marginalia/atags/DomainAnchorTagsImplTest.java +++ b/code/features-convert/anchor-keywords/test/nu/marginalia/atags/DomainAnchorTagsImplTest.java @@ -5,6 +5,7 @@ import nu.marginalia.keyword.KeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.segmentation.NgramLexicon; import 
nu.marginalia.util.TestLanguageModels; import org.junit.jupiter.api.Test; diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java index 8feb5fd8..aaad9800 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java +++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java @@ -1,5 +1,6 @@ package nu.marginalia.keyword; +import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.keyword.extractors.*; import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.language.model.DocumentLanguageData; @@ -15,11 +16,13 @@ public class DocumentKeywordExtractor { private final KeywordExtractor keywordExtractor; private final TermFrequencyDict dict; + private final NgramLexicon ngramLexicon; @Inject - public DocumentKeywordExtractor(TermFrequencyDict dict) { + public DocumentKeywordExtractor(TermFrequencyDict dict, NgramLexicon ngramLexicon) { this.dict = dict; + this.ngramLexicon = ngramLexicon; this.keywordExtractor = new KeywordExtractor(); } @@ -131,6 +134,17 @@ public class DocumentKeywordExtractor { wordsBuilder.add(rep.word, meta); } + + for (int i = 0; i < sent.ngrams.length; i++) { + var ngram = sent.ngrams[i]; + var ngramStemmed = sent.ngramStemmed[i]; + + long meta = metadata.getMetadataForWord(ngramStemmed); + assert meta != 0L : "Missing meta for " + ngram; + + wordsBuilder.add(ngram, meta); + } + } } diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java index b402c9f6..230c895f 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java +++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java @@ -14,7 +14,9 @@ public class KeywordPositionBitmask { private static final int unmodulatedPortion = 16; @Inject - public KeywordPositionBitmask(KeywordExtractor keywordExtractor, DocumentLanguageData dld) { + public KeywordPositionBitmask(KeywordExtractor keywordExtractor, + DocumentLanguageData dld) + { // Mark the title words as position 0 for (var sent : dld.titleSentences) { @@ -24,6 +26,10 @@ public class KeywordPositionBitmask { positionMask.merge(word.stemmed(), posBit, this::bitwiseOr); } + for (var ngram : sent.ngramStemmed) { + positionMask.merge(ngram, posBit, this::bitwiseOr); + } + for (var span : keywordExtractor.getKeywordsFromSentence(sent)) { positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr); } @@ -43,6 +49,10 @@ public class KeywordPositionBitmask { positionMask.merge(word.stemmed(), posBit, this::bitwiseOr); } + for (var ngram : sent.ngramStemmed) { + positionMask.merge(ngram, posBit, this::bitwiseOr); + } + for (var span : keywordExtractor.getKeywordsFromSentence(sent)) { positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr); } diff --git a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java index 8a4f3b6b..54577f80 100644 --- 
a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java +++ b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java @@ -5,6 +5,7 @@ import nu.marginalia.converting.processor.logic.dom.DomPruningFilter; import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.idx.WordMetadata; +import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.jsoup.Jsoup; import org.junit.jupiter.api.Assertions; @@ -20,7 +21,9 @@ import java.util.Set; class DocumentKeywordExtractorTest { - DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels())); + DocumentKeywordExtractor extractor = new DocumentKeywordExtractor( + new TermFrequencyDict(WmsaHome.getLanguageModels()), + new NgramLexicon(WmsaHome.getLanguageModels())); SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); @Test @@ -56,6 +59,22 @@ class DocumentKeywordExtractorTest { } + @Test + public void testKeyboards2() throws IOException, URISyntaxException { + var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"), + "Could not load word frequency table"); + String html = new String(resource.readAllBytes(), Charset.defaultCharset()); + var doc = Jsoup.parse(html); + doc.filter(new DomPruningFilter(0.5)); + + var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/")); + + keywords.getWords().forEach((k, v) -> { + if (k.contains("_")) { + System.out.println(k + " " + new WordMetadata(v)); + } + }); + } @Test public void testKeyboards() throws IOException, URISyntaxException { var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"), @@ -119,7 +138,9 @@ class DocumentKeywordExtractorTest { var doc = Jsoup.parse(html); doc.filter(new DomPruningFilter(0.5)); - DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels())); + DocumentKeywordExtractor extractor = new DocumentKeywordExtractor( + new TermFrequencyDict(WmsaHome.getLanguageModels()), + new NgramLexicon(WmsaHome.getLanguageModels())); SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://math.byu.edu/wiki/index.php/All_You_Need_To_Know_About_Earning_Money_Online")); diff --git a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java index dabad6d1..bfc78a9c 100644 --- a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java +++ b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java @@ -3,6 +3,7 @@ package nu.marginalia.keyword; import lombok.SneakyThrows; import nu.marginalia.LanguageModels; import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.WmsaHome; import nu.marginalia.model.EdgeUrl; @@ -20,9 +21,10 @@ import static org.junit.jupiter.api.Assertions.assertEquals; @Tag("slow") class 
SentenceExtractorTest { - final LanguageModels lm = TestLanguageModels.getLanguageModels(); + static final LanguageModels lm = TestLanguageModels.getLanguageModels(); - SentenceExtractor se = new SentenceExtractor(lm); + static NgramLexicon ngramLexicon = new NgramLexicon(lm); + static SentenceExtractor se = new SentenceExtractor(lm); @SneakyThrows public static void main(String... args) throws IOException { @@ -32,11 +34,9 @@ class SentenceExtractorTest { System.out.println("Running"); - SentenceExtractor se = new SentenceExtractor(lm); - var dict = new TermFrequencyDict(lm); var url = new EdgeUrl("https://memex.marginalia.nu/"); - DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict); + DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict, ngramLexicon); for (;;) { long total = 0; diff --git a/code/features-convert/summary-extraction/test/nu/marginalia/summary/SummaryExtractorTest.java b/code/features-convert/summary-extraction/test/nu/marginalia/summary/SummaryExtractorTest.java index c1a326da..cabe558f 100644 --- a/code/features-convert/summary-extraction/test/nu/marginalia/summary/SummaryExtractorTest.java +++ b/code/features-convert/summary-extraction/test/nu/marginalia/summary/SummaryExtractorTest.java @@ -5,6 +5,7 @@ import nu.marginalia.WmsaHome; import nu.marginalia.keyword.DocumentKeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.summary.heuristic.*; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.jsoup.Jsoup; @@ -25,7 +26,9 @@ class SummaryExtractorTest { @BeforeEach public void setUp() { - keywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels())); + keywordExtractor = new DocumentKeywordExtractor( + new TermFrequencyDict(WmsaHome.getLanguageModels()), + new NgramLexicon(WmsaHome.getLanguageModels())); setenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels()); summaryExtractor = new SummaryExtractor(255, diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java index eac2988d..820a9022 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java @@ -4,7 +4,7 @@ import ca.rmen.porterstemmer.PorterStemmer; import com.google.inject.Inject; import nu.marginalia.functions.searchquery.query_parser.variant.model.QWord; import nu.marginalia.functions.searchquery.query_parser.variant.model.QWordGraph; -import nu.marginalia.functions.searchquery.segmentation.NgramLexicon; +import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.apache.commons.lang3.StringUtils; diff --git a/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentSentence.java b/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentSentence.java index ef5bc0a9..b9b4abce 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentSentence.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentSentence.java @@ -16,12 +16,24 @@ public 
class DocumentSentence implements Iterable{ public final String[] wordsLowerCase; public final String[] posTags; public final String[] stemmedWords; + public final String[] ngrams; + public final String[] ngramStemmed; private final BitSet isStopWord; + public SoftReference keywords; - public DocumentSentence(String originalSentence, String[] words, int[] separators, String[] wordsLowerCase, String[] posTags, String[] stemmedWords) { + public DocumentSentence(String originalSentence, + String[] words, + int[] separators, + String[] wordsLowerCase, + String[] posTags, + String[] stemmedWords, + String[] ngrams, + String[] ngramsStemmed + ) + { this.originalSentence = originalSentence; this.words = words; this.separators = separators; @@ -31,6 +43,9 @@ public class DocumentSentence implements Iterable{ isStopWord = new BitSet(words.length); + this.ngrams = ngrams; + this.ngramStemmed = ngramsStemmed; + for (int i = 0; i < words.length; i++) { if (WordPatterns.isStopWord(words[i])) isStopWord.set(i); diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java index 13ba2e76..fd15660f 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java @@ -4,6 +4,7 @@ import com.github.datquocnguyen.RDRPOSTagger; import gnu.trove.map.hash.TObjectIntHashMap; import lombok.SneakyThrows; import nu.marginalia.LanguageModels; +import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentSentence; import opennlp.tools.sentdetect.SentenceDetectorME; @@ -32,6 +33,8 @@ public class SentenceExtractor { private SentenceDetectorME sentenceDetector; private static RDRPOSTagger rdrposTagger; + private static NgramLexicon ngramLexicon = null; + private final PorterStemmer porterStemmer = new PorterStemmer(); private static final Logger logger = LoggerFactory.getLogger(SentenceExtractor.class); @@ -45,7 +48,8 @@ public class SentenceExtractor { private static final int MAX_TEXT_LENGTH = 65536; @SneakyThrows @Inject - public SentenceExtractor(LanguageModels models) { + public SentenceExtractor(LanguageModels models) + { try (InputStream modelIn = new FileInputStream(models.openNLPSentenceDetectionData.toFile())) { var sentenceModel = new SentenceModel(modelIn); sentenceDetector = new SentenceDetectorME(sentenceModel); @@ -55,7 +59,9 @@ public class SentenceExtractor { logger.error("Could not initialize sentence detector", ex); } - synchronized (RDRPOSTagger.class) { + synchronized (this) { + ngramLexicon = new NgramLexicon(models); + try { rdrposTagger = new RDRPOSTagger(models.posDict, models.posRules); } @@ -128,8 +134,34 @@ public class SentenceExtractor { var seps = wordsAndSeps.separators; var lc = SentenceExtractorStringUtils.toLowerCaseStripPossessive(wordsAndSeps.words); + List ngrams = ngramLexicon.findSegmentsStrings(2, 12, words); + + String[] ngramsWords = new String[ngrams.size()]; + String[] ngramsStemmedWords = new String[ngrams.size()]; + for (int i = 0; i < ngrams.size(); i++) { + String[] ngram = ngrams.get(i); + + StringJoiner ngramJoiner = new StringJoiner("_"); + StringJoiner stemmedJoiner = new StringJoiner("_"); + for (String s : ngram) { + ngramJoiner.add(s); + stemmedJoiner.add(porterStemmer.stem(s)); + } + + 
ngramsWords[i] = ngramJoiner.toString(); + ngramsStemmedWords[i] = stemmedJoiner.toString(); + } + + return new DocumentSentence( - SentenceExtractorStringUtils.sanitizeString(text), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc) + SentenceExtractorStringUtils.sanitizeString(text), + words, + seps, + lc, + rdrposTagger.tagsForEnSentence(words), + stemSentence(lc), + ngramsWords, + ngramsStemmedWords ); } @@ -195,7 +227,35 @@ public class SentenceExtractor { fullString = ""; } - ret[i] = new DocumentSentence(fullString, tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]); + List ngrams = ngramLexicon.findSegmentsStrings(2, 12, tokens[i]); + + String[] ngramsWords = new String[ngrams.size()]; + String[] ngramsStemmedWords = new String[ngrams.size()]; + + for (int j = 0; j < ngrams.size(); j++) { + String[] ngram = ngrams.get(j); + + StringJoiner ngramJoiner = new StringJoiner("_"); + StringJoiner stemmedJoiner = new StringJoiner("_"); + for (String s : ngram) { + ngramJoiner.add(s); + stemmedJoiner.add(porterStemmer.stem(s)); + } + + ngramsWords[j] = ngramJoiner.toString(); + ngramsStemmedWords[j] = stemmedJoiner.toString(); + } + + + ret[i] = new DocumentSentence(fullString, + tokens[i], + separators[i], + tokensLc[i], + posTags[i], + stemmedWords[i], + ngramsWords, + ngramsStemmedWords + ); } return ret; } diff --git a/code/libraries/term-frequency-dict/build.gradle b/code/libraries/term-frequency-dict/build.gradle index 0fe311b6..4d7e42c5 100644 --- a/code/libraries/term-frequency-dict/build.gradle +++ b/code/libraries/term-frequency-dict/build.gradle @@ -16,6 +16,8 @@ apply from: "$rootProject.projectDir/srcsets.gradle" dependencies { implementation project(':third-party:rdrpostagger') implementation project(':third-party:porterstemmer') + implementation project(':third-party:commons-codec') + implementation project(':third-party:openzim') implementation project(':third-party:monkey-patch-opennlp') implementation project(':code:common:model') implementation project(':code:common:config') diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/BasicSentenceExtractor.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/BasicSentenceExtractor.java similarity index 88% rename from code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/BasicSentenceExtractor.java rename to code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/BasicSentenceExtractor.java index e65c243d..cee48910 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/BasicSentenceExtractor.java +++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/BasicSentenceExtractor.java @@ -1,4 +1,4 @@ -package nu.marginalia.functions.searchquery.segmentation; +package nu.marginalia.segmentation; import ca.rmen.porterstemmer.PorterStemmer; import org.apache.commons.lang3.StringUtils; diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/HasherGroup.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/HasherGroup.java similarity index 95% rename from code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/HasherGroup.java rename to code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/HasherGroup.java index 60bbb4dd..2a452f75 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/HasherGroup.java +++ 
b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/HasherGroup.java @@ -1,11 +1,11 @@ -package nu.marginalia.functions.searchquery.segmentation; +package nu.marginalia.segmentation; import nu.marginalia.hash.MurmurHash3_128; /** A group of hash functions that can be used to hash a sequence of strings, * that also has an inverse operation that can be used to remove a previously applied * string from the sequence. */ -sealed interface HasherGroup { +public sealed interface HasherGroup { /** Apply a hash to the accumulator */ long apply(long acc, long add); diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExporterMain.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExporterMain.java similarity index 72% rename from code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExporterMain.java rename to code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExporterMain.java index 087345f6..ee6d2cd5 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExporterMain.java +++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExporterMain.java @@ -1,7 +1,6 @@ -package nu.marginalia.functions.searchquery.segmentation; +package nu.marginalia.segmentation; -import nu.marginalia.WmsaHome; -import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.LanguageModels; import java.io.IOException; import java.nio.file.Path; @@ -15,10 +14,11 @@ public class NgramExporterMain { } static void trial() throws IOException { - SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); - - NgramLexicon lexicon = new NgramLexicon(); - lexicon.loadCounts(Path.of("/home/vlofgren/ngram-counts.bin")); + NgramLexicon lexicon = new NgramLexicon( + LanguageModels.builder() + .segments(Path.of("/home/vlofgren/ngram-counts.bin")) + .build() + ); System.out.println("Loaded!"); diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java similarity index 98% rename from code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java rename to code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java index 4cd4b296..577aee6e 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java +++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java @@ -1,4 +1,4 @@ -package nu.marginalia.functions.searchquery.segmentation; +package nu.marginalia.segmentation; import it.unimi.dsi.fastutil.longs.*; import nu.marginalia.hash.MurmurHash3_128; diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java similarity index 85% rename from code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java rename to code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java index c4fe69e2..91cee314 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java +++ 
b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java @@ -1,11 +1,13 @@ -package nu.marginalia.functions.searchquery.segmentation; +package nu.marginalia.segmentation; import com.google.inject.Inject; +import com.google.inject.Singleton; import it.unimi.dsi.fastutil.longs.Long2IntOpenCustomHashMap; import it.unimi.dsi.fastutil.longs.LongHash; import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import nu.marginalia.LanguageModels; +import java.io.BufferedInputStream; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.IOException; @@ -16,11 +18,9 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +@Singleton public class NgramLexicon { - private final Long2IntOpenCustomHashMap counts = new Long2IntOpenCustomHashMap( - 100_000_000, - new KeyIsAlreadyHashStrategy() - ); + private final Long2IntOpenCustomHashMap counts; private final LongOpenHashSet permutations = new LongOpenHashSet(); private static final HasherGroup orderedHasher = HasherGroup.ordered(); @@ -28,17 +28,35 @@ public class NgramLexicon { @Inject public NgramLexicon(LanguageModels models) { - try { - loadCounts(models.segments); + try (var dis = new DataInputStream(new BufferedInputStream(Files.newInputStream(models.segments)))) { + long size = dis.readInt(); + counts = new Long2IntOpenCustomHashMap( + (int) size, + new KeyIsAlreadyHashStrategy() + ); + + for (int i = 0; i < size; i++) { + counts.put(dis.readLong(), dis.readInt()); + } } catch (IOException e) { throw new RuntimeException(e); } } public NgramLexicon() { - + counts = new Long2IntOpenCustomHashMap(100_000_000, new KeyIsAlreadyHashStrategy()); } + public List findSegmentsStrings(int minLength, int maxLength, String... parts) { + List segments = new ArrayList<>(); + + for (int i = minLength; i <= maxLength; i++) { + segments.addAll(findSegments(i, parts)); + } + + return segments.stream().map(seg -> seg.project(parts)).toList(); + } + public List findSegments(int length, String... 
parts) { // Don't look for ngrams longer than the sentence if (parts.length < length) return List.of(); @@ -96,15 +114,6 @@ public class NgramLexicon { permutations.add(hashUnordered); } - public void loadCounts(Path path) throws IOException { - try (var dis = new DataInputStream(Files.newInputStream(path))) { - long size = dis.readInt(); - - for (int i = 0; i < size; i++) { - counts.put(dis.readLong(), dis.readInt()); - } - } - } public void loadPermutations(Path path) throws IOException { try (var dis = new DataInputStream(Files.newInputStream(path))) { diff --git a/code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/HasherGroupTest.java b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/HasherGroupTest.java similarity index 89% rename from code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/HasherGroupTest.java rename to code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/HasherGroupTest.java index 174bd553..110b1b9b 100644 --- a/code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/HasherGroupTest.java +++ b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/HasherGroupTest.java @@ -1,5 +1,6 @@ -package nu.marginalia.functions.searchquery.segmentation; +package nu.marginalia.segmentation; +import nu.marginalia.segmentation.HasherGroup; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.*; diff --git a/code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/NgramLexiconTest.java b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java similarity index 96% rename from code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/NgramLexiconTest.java rename to code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java index 28b9ef2f..d5065959 100644 --- a/code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/NgramLexiconTest.java +++ b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java @@ -1,4 +1,4 @@ -package nu.marginalia.functions.searchquery.segmentation; +package nu.marginalia.segmentation; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java index 8614d1e6..dde7a106 100644 --- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java +++ b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java @@ -8,6 +8,7 @@ import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.keyword.DocumentKeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.tools.LegacyExperiment; import org.jsoup.Jsoup; @@ -21,8 +22,10 @@ import java.nio.file.Path; public class SentenceStatisticsExperiment extends LegacyExperiment { + NgramLexicon lexicon = new NgramLexicon(WmsaHome.getLanguageModels()); SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); - DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(new 
TermFrequencyDict(WmsaHome.getLanguageModels())); + DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor( + new TermFrequencyDict(WmsaHome.getLanguageModels()), lexicon); Path filename; PrintWriter writer; From a4b810f51102b9f20d0c7c85b5e5f82bc4fbe5c2 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 21 Mar 2024 12:00:52 +0100 Subject: [PATCH 07/47] WIP --- .../query_parser/ExpansionStrategy.java | 7 + .../{variant => }/QueryExpansion.java | 8 +- .../query_parser/QueryPermutation.java | 229 ------------------ .../query_parser/QueryVariants.java | 207 ---------------- .../{variant => }/model/QWord.java | 2 +- .../{variant => }/model/QWordGraph.java | 46 +++- .../variant/ExpansionStrategy.java | 7 - .../query_parser/variant/QueryVariant.java | 17 -- .../query_parser/variant/QueryVariantSet.java | 21 -- .../query_parser/variant/QueryWord.java | 10 - .../query_parser/variant/VariantStrategy.java | 8 - .../searchquery/svc/QueryFactory.java | 50 +--- .../{variant => }/model/QWordGraphTest.java | 6 +- 13 files changed, 68 insertions(+), 550 deletions(-) create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/ExpansionStrategy.java rename code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/{variant => }/QueryExpansion.java (93%) delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryPermutation.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryVariants.java rename code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/{variant => }/model/QWord.java (94%) rename code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/{variant => }/model/QWordGraph.java (82%) delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/ExpansionStrategy.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariant.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariantSet.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryWord.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/VariantStrategy.java rename code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/{variant => }/model/QWordGraphTest.java (83%) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/ExpansionStrategy.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/ExpansionStrategy.java new file mode 100644 index 00000000..20ebffd1 --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/ExpansionStrategy.java @@ -0,0 +1,7 @@ +package nu.marginalia.functions.searchquery.query_parser; + +import nu.marginalia.functions.searchquery.query_parser.model.QWordGraph; + +public interface ExpansionStrategy { + void expand(QWordGraph graph); +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java similarity index 93% rename from 
code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java rename to code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java index 820a9022..c216918e 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java @@ -1,9 +1,9 @@ -package nu.marginalia.functions.searchquery.query_parser.variant; +package nu.marginalia.functions.searchquery.query_parser; import ca.rmen.porterstemmer.PorterStemmer; import com.google.inject.Inject; -import nu.marginalia.functions.searchquery.query_parser.variant.model.QWord; -import nu.marginalia.functions.searchquery.query_parser.variant.model.QWordGraph; +import nu.marginalia.functions.searchquery.query_parser.model.QWord; +import nu.marginalia.functions.searchquery.query_parser.model.QWordGraph; import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.apache.commons.lang3.StringUtils; @@ -40,7 +40,7 @@ public class QueryExpansion { strategy.expand(graph); } - return null; + return graph; } private static final Pattern dashPattern = Pattern.compile("-"); diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryPermutation.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryPermutation.java deleted file mode 100644 index 417ceda3..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryPermutation.java +++ /dev/null @@ -1,229 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser; - -import nu.marginalia.functions.searchquery.query_parser.token.Token; -import nu.marginalia.functions.searchquery.query_parser.token.TokenType; -import nu.marginalia.language.WordPatterns; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; -import java.util.function.Predicate; -import java.util.regex.Pattern; -import java.util.stream.Collectors; - -import static java.util.stream.Stream.concat; - -public class QueryPermutation { - private final Logger logger = LoggerFactory.getLogger(getClass()); - private final QueryVariants queryVariants; - - public static final Pattern wordPattern = Pattern.compile("[#]?[_@.a-zA-Z0-9'+\\-\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+[#]?"); - public static final Pattern wordAppendixPattern = Pattern.compile("[.]?[0-9a-zA-Z\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]{1,3}[0-9]?"); - - public static final Predicate wordQualitiesPredicate = wordPattern.asMatchPredicate(); - - public static final Predicate wordAppendixPredicate = wordAppendixPattern.asMatchPredicate(); - public static final Predicate wordPredicateEither = wordQualitiesPredicate.or(wordAppendixPredicate); - - public QueryPermutation(QueryVariants queryVariants) { - this.queryVariants = queryVariants; - } - - public List> permuteQueries(List items) { - int start = -1; - int end = items.size(); - - for (int i = 0; i < items.size(); i++) { - var token = items.get(i); - - if (start < 0) { - if (token.type == TokenType.LITERAL_TERM && wordQualitiesPredicate.test(token.str)) { - start = i; - } - } - else { - if (token.type != TokenType.LITERAL_TERM || !wordPredicateEither.test(token.str)) { - end = i; - 
break; - } - } - } - - if (start >= 0 && end - start > 1) { - List> permuteParts = combineSearchTerms(items.subList(start, end)); - int s = start; - int e = end; - return permuteParts.stream().map(part -> - concat(items.subList(0, s).stream(), concat(part.stream(), items.subList(e, items.size()).stream())) - .collect(Collectors.toList())) - .peek(lst -> lst.removeIf(this::isJunkWord)) - .limit(24) - .collect(Collectors.toList()); - } - else { - return List.of(items); - } - } - - - public List> permuteQueriesNew(List items) { - int start = -1; - int end = items.size(); - - for (int i = 0; i < items.size(); i++) { - var token = items.get(i); - - if (start < 0) { - if (token.type == TokenType.LITERAL_TERM && wordQualitiesPredicate.test(token.str)) { - start = i; - } - } - else { - if (token.type != TokenType.LITERAL_TERM || !wordPredicateEither.test(token.str)) { - end = i; - break; - } - } - } - - if (start >= 0 && end - start >= 1) { - var result = queryVariants.getQueryVariants(items.subList(start, end)); - - logger.debug("{}", result); - - if (result.isEmpty()) { - logger.warn("Empty variants result, falling back on old code"); - return permuteQueries(items); - } - - List> queryVariants = new ArrayList<>(); - for (var query : result.faithful) { - var tokens = query.terms.stream().map(term -> new Token(TokenType.LITERAL_TERM, term)).collect(Collectors.toList()); - tokens.addAll(result.nonLiterals); - - queryVariants.add(tokens); - } - for (var query : result.alternative) { - if (queryVariants.size() >= 6) - break; - - var tokens = query.terms.stream().map(term -> new Token(TokenType.LITERAL_TERM, term)).collect(Collectors.toList()); - tokens.addAll(result.nonLiterals); - - queryVariants.add(tokens); - } - - List> returnValue = new ArrayList<>(queryVariants.size()); - for (var variant: queryVariants) { - List r = new ArrayList<>(start + variant.size() + (items.size() - end)); - r.addAll(items.subList(0, start)); - r.addAll(variant); - r.addAll(items.subList(end, items.size())); - returnValue.add(r); - } - - return returnValue; - - } - else { - return List.of(items); - } - } - - private boolean isJunkWord(Token token) { - if (WordPatterns.isStopWord(token.str) && - !token.str.matches("^(\\d+|([a-z]+:.*))$")) { - return true; - } - return switch (token.str) { - case "vs", "versus", "or", "and" -> true; - default -> false; - }; - } - - private List> combineSearchTerms(List subList) { - int size = subList.size(); - if (size < 1) { - return Collections.emptyList(); - } - else if (size == 1) { - if (WordPatterns.isStopWord(subList.get(0).str)) { - return Collections.emptyList(); - } - return List.of(subList); - } - - List> results = new ArrayList<>(size*(size+1)/2); - - if (subList.size() <= 4 && subList.get(0).str.length() >= 2 && !isPrefixWord(subList.get(subList.size()-1).str)) { - results.add(List.of(joinTokens(subList))); - } - outer: for (int i = size - 1; i >= 1; i--) { - - var left = combineSearchTerms(subList.subList(0, i)); - var right = combineSearchTerms(subList.subList(i, size)); - - for (var l : left) { - if (results.size() > 48) { - break outer; - } - - for (var r : right) { - if (results.size() > 48) { - break outer; - } - - List combined = new ArrayList<>(l.size() + r.size()); - combined.addAll(l); - combined.addAll(r); - if (!results.contains(combined)) { - results.add(combined); - } - } - } - } - if (!results.contains(subList)) { - results.add(subList); - } - Comparator> tc = (o1, o2) -> { - int dJoininess = o2.stream().mapToInt(s->(int)Math.pow(joininess(s.str), 2)).sum() - - 
o1.stream().mapToInt(s->(int)Math.pow(joininess(s.str), 2)).sum(); - if (dJoininess == 0) { - return (o2.stream().mapToInt(s->(int)Math.pow(rightiness(s.str), 2)).sum() - - o1.stream().mapToInt(s->(int)Math.pow(rightiness(s.str), 2)).sum()); - } - return (int) Math.signum(dJoininess); - }; - results.sort(tc); - return results; - } - - private boolean isPrefixWord(String str) { - return switch (str) { - case "the", "of", "when" -> true; - default -> false; - }; - } - - int joininess(String s) { - return (int) s.chars().filter(c -> c == '_').count(); - } - int rightiness(String s) { - int rightiness = 0; - for (int i = 0; i < s.length(); i++) { - if (s.charAt(i) == '_') { - rightiness+=i; - } - } - return rightiness; - } - - private Token joinTokens(List subList) { - return new Token(TokenType.LITERAL_TERM, - subList.stream().map(t -> t.str).collect(Collectors.joining("_")), - subList.stream().map(t -> t.str).collect(Collectors.joining(" "))); - } -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryVariants.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryVariants.java deleted file mode 100644 index 10648486..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryVariants.java +++ /dev/null @@ -1,207 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser; - -import nu.marginalia.functions.searchquery.query_parser.token.Token; -import nu.marginalia.functions.searchquery.query_parser.token.TokenType; -import nu.marginalia.functions.searchquery.query_parser.variant.QueryVariant; -import nu.marginalia.functions.searchquery.query_parser.variant.QueryVariantSet; -import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord; -import nu.marginalia.util.language.EnglishDictionary; -import nu.marginalia.LanguageModels; -import nu.marginalia.keyword.KeywordExtractor; -import nu.marginalia.language.sentence.SentenceExtractor; -import nu.marginalia.term_frequency_dict.TermFrequencyDict; -import nu.marginalia.language.model.DocumentSentence; -import nu.marginalia.language.model.WordSpan; - -import java.util.*; -import java.util.regex.Pattern; - -public class QueryVariants { - private final KeywordExtractor keywordExtractor; - private final TermFrequencyDict dict; - - private final EnglishDictionary englishDictionary; - private final ThreadLocal sentenceExtractor; - - public QueryVariants(LanguageModels lm, - TermFrequencyDict dict, - EnglishDictionary englishDictionary) { - this.englishDictionary = englishDictionary; - this.keywordExtractor = new KeywordExtractor(); - this.sentenceExtractor = ThreadLocal.withInitial(() -> new SentenceExtractor(lm)); - this.dict = dict; - } - - - - public QueryVariantSet getQueryVariants(List query) { - final JoinedQueryAndNonLiteralTokens joinedQuery = joinQuery(query); - - final TreeMap> byStart = new TreeMap<>(); - - var se = sentenceExtractor.get(); - var sentence = se.extractSentence(joinedQuery.joinedQuery); - - for (int i = 0; i < sentence.posTags.length; i++) { - if (sentence.posTags[i].startsWith("N") || sentence.posTags[i].startsWith("V")) { - sentence.posTags[i] = "NNP"; - } - else if ("JJ".equals(sentence.posTags[i]) || "CD".equals(sentence.posTags[i]) || sentence.posTags[i].startsWith("P")) { - sentence.posTags[i] = "NNP"; - sentence.setIsStopWord(i, false); - } - } - - for (var kw : keywordExtractor.getKeywordsFromSentence(sentence)) { - byStart.computeIfAbsent(kw.start, k -> new 
ArrayList<>()).add(kw); - } - - final List> livingSpans = new ArrayList<>(); - - var first = byStart.firstEntry(); - if (first == null) { - var span = new WordSpan(0, sentence.length()); - byStart.put(0, List.of(span)); - } - else if (first.getKey() > 0) { - List elongatedFirstWords = new ArrayList<>(first.getValue().size()); - - first.getValue().forEach(span -> { - elongatedFirstWords.add(new WordSpan(0, span.start)); - elongatedFirstWords.add(new WordSpan(0, span.end)); - }); - - byStart.put(0, elongatedFirstWords); - } - - final List> goodSpans = getWordSpans(byStart, sentence, livingSpans); - - List> faithfulQueries = new ArrayList<>(); - List> alternativeQueries = new ArrayList<>(); - - for (var ls : goodSpans) { - var last = ls.get(ls.size() - 1); - - if (!last.wordOriginal.isBlank() && !Character.isUpperCase(last.wordOriginal.charAt(0))) { - var altLast = englishDictionary.getWordVariants(last.word); - for (String s : altLast) { - List newList = new ArrayList<>(ls.size()); - for (int i = 0; i < ls.size() - 1; i++) { - newList.add(ls.get(i).word); - } - newList.add(s); - alternativeQueries.add(newList); - } - } - - } - - QueryVariantSet returnValue = new QueryVariantSet(); - - returnValue.faithful.addAll(evaluateQueries(faithfulQueries)); - returnValue.alternative.addAll(evaluateQueries(alternativeQueries)); - - returnValue.faithful.sort(Comparator.comparing(QueryVariant::getValue)); - returnValue.alternative.sort(Comparator.comparing(QueryVariant::getValue)); - - returnValue.nonLiterals.addAll(joinedQuery.nonLiterals); - - return returnValue; - } - - final Pattern underscore = Pattern.compile("_"); - - private List evaluateQueries(List> queryStrings) { - Set variantsSet = new HashSet<>(); - List ret = new ArrayList<>(); - for (var lst : queryStrings) { - double q = 0; - for (var word : lst) { - String[] parts = underscore.split(word); - double qp = 0; - for (String part : parts) { - qp += 1./(1+ dict.getTermFreq(part)); - } - q += 1.0 / qp; - } - var qv = new QueryVariant(lst, q); - if (variantsSet.add(qv)) { - ret.add(qv); - } - } - return ret; - } - - private List> getWordSpans(TreeMap> byStart, DocumentSentence sentence, List> livingSpans) { - List> goodSpans = new ArrayList<>(); - for (int i = 0; i < 1; i++) { - var spans = byStart.get(i); - - - if (spans == null ) - continue; - - for (var span : spans) { - ArrayList fragment = new ArrayList<>(); - fragment.add(span); - livingSpans.add(fragment); - } - - if (sentence.posTags[i].startsWith("N") || sentence.posTags[i].startsWith("V")) break; - } - - - while (!livingSpans.isEmpty()) { - - final List> newLivingSpans = new ArrayList<>(livingSpans.size()); - - for (var span : livingSpans) { - int end = span.get(span.size()-1).end; - - if (end == sentence.length()) { - var gs = new ArrayList(span.size()); - for (var s : span) { - gs.add(new QueryWord(sentence.constructStemmedWordFromSpan(s), sentence.constructWordFromSpan(s), - s.size() == 1 ? 
sentence.words[s.start] : "")); - } - goodSpans.add(gs); - } - var nextWordsKey = byStart.ceilingKey(end); - - if (null == nextWordsKey) - continue; - - for (var next : byStart.get(nextWordsKey)) { - var newSpan = new ArrayList(span.size() + 1); - newSpan.addAll(span); - newSpan.add(next); - newLivingSpans.add(newSpan); - } - } - - livingSpans.clear(); - livingSpans.addAll(newLivingSpans); - } - - return goodSpans; - } - - - private JoinedQueryAndNonLiteralTokens joinQuery(List query) { - StringJoiner s = new StringJoiner(" "); - List leftovers = new ArrayList<>(5); - - for (var t : query) { - if (t.type == TokenType.LITERAL_TERM) { - s.add(t.displayStr); - } - else { - leftovers.add(t); - } - } - - return new JoinedQueryAndNonLiteralTokens(s.toString(), leftovers); - } - - record JoinedQueryAndNonLiteralTokens(String joinedQuery, List nonLiterals) {} -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWord.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWord.java similarity index 94% rename from code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWord.java rename to code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWord.java index 07f65c95..b7c4e594 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWord.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWord.java @@ -1,4 +1,4 @@ -package nu.marginalia.functions.searchquery.query_parser.variant.model; +package nu.marginalia.functions.searchquery.query_parser.model; import ca.rmen.porterstemmer.PorterStemmer; diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraph.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java similarity index 82% rename from code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraph.java rename to code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java index f9902733..474c4788 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraph.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java @@ -1,4 +1,4 @@ -package nu.marginalia.functions.searchquery.query_parser.variant.model; +package nu.marginalia.functions.searchquery.query_parser.model; import org.jetbrains.annotations.NotNull; @@ -15,8 +15,7 @@ import java.util.stream.Stream; public class QWordGraph implements Iterable { - public record QWordGraphLink(QWord from, QWord to) { - } + public record QWordGraphLink(QWord from, QWord to) {} private final List links = new ArrayList<>(); private final Map> fromTo = new HashMap<>(); @@ -121,8 +120,6 @@ public class QWordGraph implements Iterable { // understanding which vertexes can be re-ordered without changing // the semantics of the encoded query. 
public boolean isBypassed(QWord word, QWord begin, QWord end) { - assert word.isOriginal() : "Can only bypass original words"; - Set edge = new HashSet<>(); Set visited = new HashSet<>(); @@ -163,6 +160,7 @@ public class QWordGraph implements Iterable { List edge = new ArrayList<>(); List visited = new ArrayList<>(); + visited.add(begin); edge.add(begin); while (!edge.isEmpty()) { @@ -172,7 +170,9 @@ public class QWordGraph implements Iterable { if (Objects.equals(w, end)) continue; - assert (!w.isEnd() && end.isEnd()) : "Graph has a path beyond the specified end vertex"; + if (w.isEnd()) { + assert end.isEnd() : "Graph has a path beyond the specified end vertex " + end; + } next.addAll(getNext(w)); } @@ -182,7 +182,7 @@ public class QWordGraph implements Iterable { edge = next; } - return visited; + return visited.stream().distinct().toList(); } /** Returns a list of subgraphs that are connected on the path from @@ -201,7 +201,7 @@ public class QWordGraph implements Iterable { List points = nodesBetween(begin, end) .stream() - .filter(w -> isBypassed(w, begin, end)) + .filter(w -> !isBypassed(w, begin, end)) .toList(); for (int i = 0; i < points.size() - 1; i++) { @@ -214,6 +214,36 @@ public class QWordGraph implements Iterable { return subgraphs; } + public String compileToQuery() { + return compileToQuery(QWord.beg(), QWord.end()); + } + + public String compileToQuery(QWord begin, QWord end) { + StringJoiner sj = new StringJoiner(" "); + + for (var subgraph : getSubgraphs(begin, end)) { + if (getNext(subgraph.from).equals(List.of(subgraph.to))) { + if (subgraph.from.isBeg()) + continue; + + sj.add(subgraph.from.word()); + } + else { + StringJoiner branchJoiner = new StringJoiner(" | ", "( ", " )"); + if (Objects.equals(subgraph.from, begin)) { + for (QWord path : getNext(subgraph.from)) { + branchJoiner.add(compileToQuery(path, subgraph.to)); + } + } + else { + branchJoiner.add(compileToQuery(subgraph.from, subgraph.to)); + } + sj.add(branchJoiner.toString()); + } + } + + return sj.toString(); + } @NotNull @Override diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/ExpansionStrategy.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/ExpansionStrategy.java deleted file mode 100644 index 18987aea..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/ExpansionStrategy.java +++ /dev/null @@ -1,7 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.variant; - -import nu.marginalia.functions.searchquery.query_parser.variant.model.QWordGraph; - -public interface ExpansionStrategy { - void expand(QWordGraph graph); -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariant.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariant.java deleted file mode 100644 index 8d24387b..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariant.java +++ /dev/null @@ -1,17 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.variant; - -import lombok.AllArgsConstructor; -import lombok.EqualsAndHashCode; -import lombok.Getter; -import lombok.ToString; - -import java.util.List; - -@AllArgsConstructor -@Getter -@ToString -@EqualsAndHashCode -public class QueryVariant { - public final List terms; - public final double value; -} diff --git 
a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariantSet.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariantSet.java deleted file mode 100644 index b01fbd5e..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariantSet.java +++ /dev/null @@ -1,21 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.variant; - -import lombok.Getter; -import lombok.ToString; -import nu.marginalia.functions.searchquery.query_parser.token.Token; - -import java.util.ArrayList; -import java.util.List; - -@Getter -@ToString -public class QueryVariantSet { - public final List faithful = new ArrayList<>(); - public final List alternative = new ArrayList<>(); - - public final List nonLiterals = new ArrayList<>(); - - public boolean isEmpty() { - return faithful.isEmpty() && alternative.isEmpty() && nonLiterals.isEmpty(); - } -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryWord.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryWord.java deleted file mode 100644 index 9c158a43..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryWord.java +++ /dev/null @@ -1,10 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.variant; - -import lombok.AllArgsConstructor; - -@AllArgsConstructor -public class QueryWord { - public final String stemmed; - public final String word; - public final String wordOriginal; -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/VariantStrategy.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/VariantStrategy.java deleted file mode 100644 index 2c1a5bfb..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/VariantStrategy.java +++ /dev/null @@ -1,8 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.variant; - -import java.util.Collection; -import java.util.List; - -public interface VariantStrategy { - Collection> constructVariants(List ls); -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java index 9ac7c795..3c0e5219 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java @@ -11,8 +11,6 @@ import nu.marginalia.language.WordPatterns; import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.query.ProcessedQuery; import nu.marginalia.functions.searchquery.query_parser.QueryParser; -import nu.marginalia.functions.searchquery.query_parser.QueryPermutation; -import nu.marginalia.functions.searchquery.query_parser.QueryVariants; import nu.marginalia.functions.searchquery.query_parser.token.Token; import nu.marginalia.functions.searchquery.query_parser.token.TokenType; import nu.marginalia.term_frequency_dict.TermFrequencyDict; @@ -29,43 +27,19 @@ public class QueryFactory { private final Logger logger = LoggerFactory.getLogger(getClass()); private static final int RETAIN_QUERY_VARIANT_COUNT = 5; - private final ThreadLocal queryVariants; 
private final QueryParser queryParser = new QueryParser(); @Inject public QueryFactory(LanguageModels lm, TermFrequencyDict dict, - EnglishDictionary englishDictionary) { - this.queryVariants = ThreadLocal.withInitial(() -> new QueryVariants(lm ,dict, englishDictionary)); + EnglishDictionary englishDictionary) + { } - public QueryPermutation getQueryPermutation() { - return new QueryPermutation(queryVariants.get()); - } public ProcessedQuery createQuery(QueryParams params) { - final var processedQuery = createQuery(getQueryPermutation(), params); - final List subqueries = processedQuery.specs.subqueries; - - // There used to be a piece of logic here that would try to figure out which one of these subqueries were the "best", - // it's gone for the moment, but it would be neat if it resurrected somehow - - trimArray(subqueries, RETAIN_QUERY_VARIANT_COUNT); - - return processedQuery; - } - - private void trimArray(List arr, int maxSize) { - if (arr.size() > maxSize) { - arr.subList(0, arr.size() - maxSize).clear(); - } - } - - public ProcessedQuery createQuery(QueryPermutation queryPermutation, - QueryParams params) - { final var query = params.humanQuery(); if (query.length() > 1000) { @@ -100,17 +74,19 @@ public class QueryFactory { t.visit(qualityLimits); } - var queryPermutations = queryPermutation.permuteQueriesNew(basicQuery); +// var queryPermutations = queryPermutation.permuteQueriesNew(basicQuery); List subqueries = new ArrayList<>(); + QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(basicQuery); + domain = termsAccumulator.domain; - for (var parts : queryPermutations) { - QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(parts); - - domain = termsAccumulator.domain; - - SearchSubquery subquery = termsAccumulator.createSubquery(); - subqueries.add(subquery); - } +// for (var parts : queryPermutations) { +// QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(basicQuery); +// +// domain = termsAccumulator.domain; +// +// SearchSubquery subquery = termsAccumulator.createSubquery(); +// subqueries.add(subquery); +// } List domainIds = params.domainIds(); diff --git a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraphTest.java b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java similarity index 83% rename from code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraphTest.java rename to code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java index a88e4d63..bd16b3cb 100644 --- a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraphTest.java +++ b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java @@ -1,4 +1,4 @@ -package nu.marginalia.functions.searchquery.query_parser.variant.model; +package nu.marginalia.functions.searchquery.query_parser.model; import org.junit.jupiter.api.Test; @@ -10,11 +10,13 @@ class QWordGraphTest { System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); + System.out.println(graph.compileToQuery()); graph.links().forEach(System.out::println); System.out.println("--"); graph.nodes().forEach(System.out::println); System.out.println("--"); 
graph.addVariant(graph.nodes().get(1), "sup"); + System.out.println(graph.compileToQuery()); System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); System.out.println("--"); @@ -23,6 +25,8 @@ class QWordGraphTest { graph.nodes().forEach(System.out::println); graph.addVariantForSpan(graph.nodes().get(1), graph.nodes().get(2), "heyall"); + graph.addVariant(graph.nodes().get(2), "globe"); + System.out.println(graph.compileToQuery()); System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); System.out.println("--"); From 411b3f3138355f3d35151fa2f4313cc197da80e3 Mon Sep 17 00:00:00 2001 From: Your Name Date: Sun, 17 Mar 2024 15:32:04 -0400 Subject: [PATCH 08/47] (run/install.sh) fix docker compose file I was following the release demo video for v2024.01.0 https://www.youtube.com/watch?v=PNwMkenQQ24 and when I did 'docker compose up' the containers couldn't resolve the DNS name for 'zookeeper' I realized this was because the zookeeper container was using the default docker network, so I specified the wmsa network explicitly. --- run/install/docker-compose-barebones-1.yml.template | 2 ++ run/install/docker-compose-barebones-2.yml.template | 2 ++ 2 files changed, 4 insertions(+) diff --git a/run/install/docker-compose-barebones-1.yml.template b/run/install/docker-compose-barebones-1.yml.template index 8b36d64e..27a13a0a 100644 --- a/run/install/docker-compose-barebones-1.yml.template +++ b/run/install/docker-compose-barebones-1.yml.template @@ -94,6 +94,8 @@ services: restart: always ports: - "127.0.0.1:2181:2181" + networks: + - wmsa traefik: image: "traefik:v2.10" container_name: "traefik" diff --git a/run/install/docker-compose-barebones-2.yml.template b/run/install/docker-compose-barebones-2.yml.template index 6232cf01..b47d5b00 100644 --- a/run/install/docker-compose-barebones-2.yml.template +++ b/run/install/docker-compose-barebones-2.yml.template @@ -122,6 +122,8 @@ services: restart: always ports: - "127.0.0.1:2181:2181" + networks: + - wmsa traefik: image: "traefik:v2.10" container_name: "traefik" From 002afca1c597454ec1ffb11f8f14e8f5e353aca4 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 21 Mar 2024 12:09:24 +0100 Subject: [PATCH 09/47] (sys) Upgrade to JDK22 This also entails upgrading JIB to 3.4.1 and Lombok to 1.18.32. 
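For reference, the version bump itself is mechanical: every module pins its Java toolchain in its own build.gradle, so the change is the same block repeated across the tree. Illustrative excerpt of the toolchain block as it reads after this commit (taken from the per-module build files below):

    java {
        toolchain {
            // previously JavaLanguageVersion.of(21); bumped to 22 in every module
            languageVersion.set(JavaLanguageVersion.of(22))
        }
    }

The root build.gradle additionally bumps the jib plugin to 3.4.1 and points dockerImageBase at the graalvm/jdk:22 image.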
--- build.gradle | 6 +++--- code/common/config/build.gradle | 2 +- code/common/db/build.gradle | 2 +- code/common/linkdb/build.gradle | 2 +- code/common/model/build.gradle | 2 +- code/common/process/build.gradle | 2 +- code/common/renderer/build.gradle | 2 +- code/common/service/build.gradle | 2 +- code/execution/api/build.gradle | 2 +- code/execution/build.gradle | 2 +- code/features-convert/adblock/build.gradle | 2 +- code/features-convert/anchor-keywords/build.gradle | 2 +- code/features-convert/data-extractors/build.gradle | 2 +- code/features-convert/keyword-extraction/build.gradle | 2 +- code/features-convert/pubdate/build.gradle | 2 +- code/features-convert/reddit-json/build.gradle | 2 +- code/features-convert/stackexchange-xml/build.gradle | 2 +- code/features-convert/summary-extraction/build.gradle | 2 +- code/features-convert/topic-detection/build.gradle | 2 +- code/features-crawl/content-type/build.gradle | 2 +- code/features-crawl/crawl-blocklist/build.gradle | 2 +- code/features-crawl/link-parser/build.gradle | 2 +- code/features-search/feedlot-client/build.gradle | 2 +- code/features-search/random-websites/build.gradle | 2 +- code/features-search/screenshots/build.gradle | 2 +- code/functions/domain-info/api/build.gradle | 2 +- code/functions/domain-info/build.gradle | 2 +- code/functions/link-graph/aggregate/build.gradle | 2 +- code/functions/link-graph/api/build.gradle | 2 +- code/functions/link-graph/partition/build.gradle | 2 +- code/functions/math/api/build.gradle | 2 +- code/functions/math/build.gradle | 2 +- code/functions/search-query/api/build.gradle | 2 +- code/functions/search-query/build.gradle | 2 +- code/index/api/build.gradle | 2 +- code/index/build.gradle | 2 +- code/index/index-forward/build.gradle | 2 +- code/index/index-journal/build.gradle | 2 +- code/index/index-reverse/build.gradle | 2 +- code/index/query/build.gradle | 2 +- code/libraries/array/build.gradle | 4 ++-- code/libraries/big-string/build.gradle | 2 +- code/libraries/blocking-thread-pool/build.gradle | 2 +- code/libraries/braille-block-punch-cards/build.gradle | 2 +- code/libraries/btree/build.gradle | 2 +- code/libraries/easy-lsh/build.gradle | 2 +- code/libraries/geo-ip/build.gradle | 2 +- code/libraries/guarded-regex/build.gradle | 2 +- code/libraries/language-processing/build.gradle | 2 +- code/libraries/message-queue/build.gradle | 2 +- code/libraries/next-prime/build.gradle | 2 +- code/libraries/random-write-funnel/build.gradle | 2 +- code/libraries/term-frequency-dict/build.gradle | 2 +- code/libraries/test-helpers/build.gradle | 2 +- code/process-models/crawl-spec/build.gradle | 2 +- code/process-models/crawling-model/build.gradle | 2 +- code/process-models/processed-data/build.gradle | 2 +- code/process-models/work-log/build.gradle | 2 +- code/process-mqapi/build.gradle | 2 +- code/processes/converting-process/build.gradle | 2 +- code/processes/crawling-process/build.gradle | 2 +- code/processes/index-constructor-process/build.gradle | 2 +- code/processes/loading-process/build.gradle | 2 +- code/processes/test-data/build.gradle | 2 +- code/processes/website-adjacencies-calculator/build.gradle | 2 +- code/services-application/api-service/build.gradle | 4 ++-- code/services-application/dating-service/build.gradle | 4 ++-- code/services-application/explorer-service/build.gradle | 4 ++-- code/services-application/search-service/build.gradle | 4 ++-- code/services-core/assistant-service/build.gradle | 4 ++-- code/services-core/control-service/build.gradle | 4 ++-- 
code/services-core/executor-service/build.gradle | 4 ++-- code/services-core/index-service/build.gradle | 4 ++-- code/services-core/query-service/build.gradle | 4 ++-- code/tools/crawl-data-unfcker/build.gradle | 2 +- code/tools/experiment-runner/build.gradle | 2 +- code/tools/load-test/build.gradle | 2 +- code/tools/screenshot-capture-tool/build.gradle | 4 ++-- settings.gradle | 2 +- third-party/commons-codec/build.gradle | 2 +- third-party/count-min-sketch/build.gradle | 2 +- third-party/encyclopedia-marginalia-nu/build.gradle | 2 +- third-party/monkey-patch-opennlp/build.gradle | 2 +- third-party/openzim/build.gradle | 2 +- third-party/parquet-floor/build.gradle | 2 +- third-party/porterstemmer/build.gradle | 2 +- third-party/rdrpostagger/build.gradle | 2 +- third-party/symspell/build.gradle | 2 +- 88 files changed, 101 insertions(+), 101 deletions(-) diff --git a/build.gradle b/build.gradle index 8b76efda..9559cfc2 100644 --- a/build.gradle +++ b/build.gradle @@ -6,7 +6,7 @@ plugins { // This is a workaround for a bug in the Jib plugin that causes it to stall randomly // https://github.com/GoogleContainerTools/jib/issues/3347 - id 'com.google.cloud.tools.jib' version '3.4.0' apply(false) + id 'com.google.cloud.tools.jib' version '3.4.1' apply(false) } group 'marginalia' @@ -43,7 +43,7 @@ subprojects.forEach {it -> } ext { - dockerImageBase='container-registry.oracle.com/graalvm/jdk:21@sha256:1fd33d4d4eba3a9e1a41a728e39ea217178d257694eea1214fec68d2ed4d3d9b' + dockerImageBase='container-registry.oracle.com/graalvm/jdk:22@sha256:22d2ca0d4fb378f50306ec2fda3178cce4523c4fe64e869108571c3c6e7026c8\n' dockerImageTag='latest' dockerImageRegistry='marginalia' } @@ -66,7 +66,7 @@ idea { } java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/common/config/build.gradle b/code/common/config/build.gradle index 0ceb00ae..74fdf702 100644 --- a/code/common/config/build.gradle +++ b/code/common/config/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/common/db/build.gradle b/code/common/db/build.gradle index 85ab7dba..7e85c3de 100644 --- a/code/common/db/build.gradle +++ b/code/common/db/build.gradle @@ -17,7 +17,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/common/linkdb/build.gradle b/code/common/linkdb/build.gradle index 811aa577..14c7f056 100644 --- a/code/common/linkdb/build.gradle +++ b/code/common/linkdb/build.gradle @@ -6,7 +6,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/common/model/build.gradle b/code/common/model/build.gradle index 6d27b375..1e6c7566 100644 --- a/code/common/model/build.gradle +++ b/code/common/model/build.gradle @@ -6,7 +6,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/common/process/build.gradle b/code/common/process/build.gradle index 7fcff1ab..908bfae1 100644 --- a/code/common/process/build.gradle +++ b/code/common/process/build.gradle @@ -6,7 +6,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/common/renderer/build.gradle 
b/code/common/renderer/build.gradle index 83957546..fa79e153 100644 --- a/code/common/renderer/build.gradle +++ b/code/common/renderer/build.gradle @@ -8,7 +8,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } apply from: "$rootProject.projectDir/srcsets.gradle" diff --git a/code/common/service/build.gradle b/code/common/service/build.gradle index fa109b0e..57342fa1 100644 --- a/code/common/service/build.gradle +++ b/code/common/service/build.gradle @@ -5,7 +5,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/execution/api/build.gradle b/code/execution/api/build.gradle index 5102b613..02e8100e 100644 --- a/code/execution/api/build.gradle +++ b/code/execution/api/build.gradle @@ -8,7 +8,7 @@ jar.archiveBaseName = 'execution-api' java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/execution/build.gradle b/code/execution/build.gradle index 57de7320..3824a8c1 100644 --- a/code/execution/build.gradle +++ b/code/execution/build.gradle @@ -5,7 +5,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/features-convert/adblock/build.gradle b/code/features-convert/adblock/build.gradle index f28fe998..11a7c5ea 100644 --- a/code/features-convert/adblock/build.gradle +++ b/code/features-convert/adblock/build.gradle @@ -9,7 +9,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/features-convert/anchor-keywords/build.gradle b/code/features-convert/anchor-keywords/build.gradle index 3541b5ec..bc5f44d8 100644 --- a/code/features-convert/anchor-keywords/build.gradle +++ b/code/features-convert/anchor-keywords/build.gradle @@ -6,7 +6,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/features-convert/data-extractors/build.gradle b/code/features-convert/data-extractors/build.gradle index 6fd5671d..73aebd49 100644 --- a/code/features-convert/data-extractors/build.gradle +++ b/code/features-convert/data-extractors/build.gradle @@ -9,7 +9,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/features-convert/keyword-extraction/build.gradle b/code/features-convert/keyword-extraction/build.gradle index 7912e246..c63fc263 100644 --- a/code/features-convert/keyword-extraction/build.gradle +++ b/code/features-convert/keyword-extraction/build.gradle @@ -6,7 +6,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/features-convert/pubdate/build.gradle b/code/features-convert/pubdate/build.gradle index bebd3d8e..1a33a4a7 100644 --- a/code/features-convert/pubdate/build.gradle +++ b/code/features-convert/pubdate/build.gradle @@ -9,7 +9,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/features-convert/reddit-json/build.gradle b/code/features-convert/reddit-json/build.gradle index a9aa3bd6..afbc6961 100644 --- 
a/code/features-convert/reddit-json/build.gradle +++ b/code/features-convert/reddit-json/build.gradle @@ -6,7 +6,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/features-convert/stackexchange-xml/build.gradle b/code/features-convert/stackexchange-xml/build.gradle index d693047b..bda05817 100644 --- a/code/features-convert/stackexchange-xml/build.gradle +++ b/code/features-convert/stackexchange-xml/build.gradle @@ -6,7 +6,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/features-convert/summary-extraction/build.gradle b/code/features-convert/summary-extraction/build.gradle index bf6a87c5..189b317b 100644 --- a/code/features-convert/summary-extraction/build.gradle +++ b/code/features-convert/summary-extraction/build.gradle @@ -6,7 +6,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/features-convert/topic-detection/build.gradle b/code/features-convert/topic-detection/build.gradle index 49661c25..622d422b 100644 --- a/code/features-convert/topic-detection/build.gradle +++ b/code/features-convert/topic-detection/build.gradle @@ -9,7 +9,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/features-crawl/content-type/build.gradle b/code/features-crawl/content-type/build.gradle index c807b86e..16ecddd1 100644 --- a/code/features-crawl/content-type/build.gradle +++ b/code/features-crawl/content-type/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/features-crawl/crawl-blocklist/build.gradle b/code/features-crawl/crawl-blocklist/build.gradle index a6fc4f91..98741b80 100644 --- a/code/features-crawl/crawl-blocklist/build.gradle +++ b/code/features-crawl/crawl-blocklist/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/features-crawl/link-parser/build.gradle b/code/features-crawl/link-parser/build.gradle index 2dd04a5c..f69a255b 100644 --- a/code/features-crawl/link-parser/build.gradle +++ b/code/features-crawl/link-parser/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/features-search/feedlot-client/build.gradle b/code/features-search/feedlot-client/build.gradle index c62182fe..ef42210b 100644 --- a/code/features-search/feedlot-client/build.gradle +++ b/code/features-search/feedlot-client/build.gradle @@ -5,7 +5,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/features-search/random-websites/build.gradle b/code/features-search/random-websites/build.gradle index ec4201ec..fb0dd3ed 100644 --- a/code/features-search/random-websites/build.gradle +++ b/code/features-search/random-websites/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git 
a/code/features-search/screenshots/build.gradle b/code/features-search/screenshots/build.gradle index 52572e91..54eb6542 100644 --- a/code/features-search/screenshots/build.gradle +++ b/code/features-search/screenshots/build.gradle @@ -6,7 +6,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/functions/domain-info/api/build.gradle b/code/functions/domain-info/api/build.gradle index 0c4264ec..3ac3428e 100644 --- a/code/functions/domain-info/api/build.gradle +++ b/code/functions/domain-info/api/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/functions/domain-info/build.gradle b/code/functions/domain-info/build.gradle index afb2b358..c968b0ed 100644 --- a/code/functions/domain-info/build.gradle +++ b/code/functions/domain-info/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/functions/link-graph/aggregate/build.gradle b/code/functions/link-graph/aggregate/build.gradle index 41f89bef..213790b9 100644 --- a/code/functions/link-graph/aggregate/build.gradle +++ b/code/functions/link-graph/aggregate/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/functions/link-graph/api/build.gradle b/code/functions/link-graph/api/build.gradle index 99dd4a36..a16163b2 100644 --- a/code/functions/link-graph/api/build.gradle +++ b/code/functions/link-graph/api/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/functions/link-graph/partition/build.gradle b/code/functions/link-graph/partition/build.gradle index faca528f..766ed56c 100644 --- a/code/functions/link-graph/partition/build.gradle +++ b/code/functions/link-graph/partition/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/functions/math/api/build.gradle b/code/functions/math/api/build.gradle index 6811a06d..90c536b0 100644 --- a/code/functions/math/api/build.gradle +++ b/code/functions/math/api/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/functions/math/build.gradle b/code/functions/math/build.gradle index c1fe528d..814f57bc 100644 --- a/code/functions/math/build.gradle +++ b/code/functions/math/build.gradle @@ -5,7 +5,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/functions/search-query/api/build.gradle b/code/functions/search-query/api/build.gradle index 5339cbf4..727b5b86 100644 --- a/code/functions/search-query/api/build.gradle +++ b/code/functions/search-query/api/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/functions/search-query/build.gradle b/code/functions/search-query/build.gradle index 76c520fb..7b792b48 100644 --- 
a/code/functions/search-query/build.gradle +++ b/code/functions/search-query/build.gradle @@ -5,7 +5,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/index/api/build.gradle b/code/index/api/build.gradle index 50b3d726..d07a24eb 100644 --- a/code/index/api/build.gradle +++ b/code/index/api/build.gradle @@ -5,7 +5,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/index/build.gradle b/code/index/build.gradle index 7d34bab4..37275b0a 100644 --- a/code/index/build.gradle +++ b/code/index/build.gradle @@ -5,7 +5,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/index/index-forward/build.gradle b/code/index/index-forward/build.gradle index daf9bc0b..96526205 100644 --- a/code/index/index-forward/build.gradle +++ b/code/index/index-forward/build.gradle @@ -6,7 +6,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/index/index-journal/build.gradle b/code/index/index-journal/build.gradle index c6186cf7..988ce618 100644 --- a/code/index/index-journal/build.gradle +++ b/code/index/index-journal/build.gradle @@ -6,7 +6,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/index/index-reverse/build.gradle b/code/index/index-reverse/build.gradle index a9523656..36367546 100644 --- a/code/index/index-reverse/build.gradle +++ b/code/index/index-reverse/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/index/query/build.gradle b/code/index/query/build.gradle index 7977ad73..615d9fb7 100644 --- a/code/index/query/build.gradle +++ b/code/index/query/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/libraries/array/build.gradle b/code/libraries/array/build.gradle index 88e27107..d7858a21 100644 --- a/code/libraries/array/build.gradle +++ b/code/libraries/array/build.gradle @@ -5,7 +5,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } @@ -30,7 +30,7 @@ jmh { } tasks.withType(me.champeau.jmh.WithJavaToolchain).configureEach { javaLauncher.set(javaToolchains.launcherFor { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) }) } tasks.withType(me.champeau.jmh.JmhBytecodeGeneratorTask).configureEach { diff --git a/code/libraries/big-string/build.gradle b/code/libraries/big-string/build.gradle index 2f67aa7f..c6d4c00f 100644 --- a/code/libraries/big-string/build.gradle +++ b/code/libraries/big-string/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/libraries/blocking-thread-pool/build.gradle b/code/libraries/blocking-thread-pool/build.gradle index 8c5609a5..0a513f92 100644 --- a/code/libraries/blocking-thread-pool/build.gradle +++ b/code/libraries/blocking-thread-pool/build.gradle @@ 
-4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/libraries/braille-block-punch-cards/build.gradle b/code/libraries/braille-block-punch-cards/build.gradle index 673d944f..d6b8c6e6 100644 --- a/code/libraries/braille-block-punch-cards/build.gradle +++ b/code/libraries/braille-block-punch-cards/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/libraries/btree/build.gradle b/code/libraries/btree/build.gradle index 99917255..37060dd5 100644 --- a/code/libraries/btree/build.gradle +++ b/code/libraries/btree/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/libraries/easy-lsh/build.gradle b/code/libraries/easy-lsh/build.gradle index 74fd976e..6c66bdde 100644 --- a/code/libraries/easy-lsh/build.gradle +++ b/code/libraries/easy-lsh/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/libraries/geo-ip/build.gradle b/code/libraries/geo-ip/build.gradle index a47f64a5..4fd467aa 100644 --- a/code/libraries/geo-ip/build.gradle +++ b/code/libraries/geo-ip/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/libraries/guarded-regex/build.gradle b/code/libraries/guarded-regex/build.gradle index 6faa6f3c..f310116e 100644 --- a/code/libraries/guarded-regex/build.gradle +++ b/code/libraries/guarded-regex/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/libraries/language-processing/build.gradle b/code/libraries/language-processing/build.gradle index 56abe731..cc745397 100644 --- a/code/libraries/language-processing/build.gradle +++ b/code/libraries/language-processing/build.gradle @@ -5,7 +5,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/libraries/message-queue/build.gradle b/code/libraries/message-queue/build.gradle index 5931a76e..d2618d95 100644 --- a/code/libraries/message-queue/build.gradle +++ b/code/libraries/message-queue/build.gradle @@ -5,7 +5,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/libraries/next-prime/build.gradle b/code/libraries/next-prime/build.gradle index 8c5609a5..0a513f92 100644 --- a/code/libraries/next-prime/build.gradle +++ b/code/libraries/next-prime/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/libraries/random-write-funnel/build.gradle b/code/libraries/random-write-funnel/build.gradle index 9d23af16..a7acb1fa 100644 --- a/code/libraries/random-write-funnel/build.gradle +++ b/code/libraries/random-write-funnel/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git 
a/code/libraries/term-frequency-dict/build.gradle b/code/libraries/term-frequency-dict/build.gradle index 4d7e42c5..67fb44ae 100644 --- a/code/libraries/term-frequency-dict/build.gradle +++ b/code/libraries/term-frequency-dict/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/libraries/test-helpers/build.gradle b/code/libraries/test-helpers/build.gradle index 0066220f..875e636d 100644 --- a/code/libraries/test-helpers/build.gradle +++ b/code/libraries/test-helpers/build.gradle @@ -6,7 +6,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/process-models/crawl-spec/build.gradle b/code/process-models/crawl-spec/build.gradle index 551f2c6e..70850445 100644 --- a/code/process-models/crawl-spec/build.gradle +++ b/code/process-models/crawl-spec/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } apply from: "$rootProject.projectDir/srcsets.gradle" diff --git a/code/process-models/crawling-model/build.gradle b/code/process-models/crawling-model/build.gradle index 9b846502..5926e03d 100644 --- a/code/process-models/crawling-model/build.gradle +++ b/code/process-models/crawling-model/build.gradle @@ -8,7 +8,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/process-models/processed-data/build.gradle b/code/process-models/processed-data/build.gradle index 910cecc6..9668d0b8 100644 --- a/code/process-models/processed-data/build.gradle +++ b/code/process-models/processed-data/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } apply from: "$rootProject.projectDir/srcsets.gradle" diff --git a/code/process-models/work-log/build.gradle b/code/process-models/work-log/build.gradle index fbade272..c56174ca 100644 --- a/code/process-models/work-log/build.gradle +++ b/code/process-models/work-log/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } apply from: "$rootProject.projectDir/srcsets.gradle" diff --git a/code/process-mqapi/build.gradle b/code/process-mqapi/build.gradle index b1313c62..514ca034 100644 --- a/code/process-mqapi/build.gradle +++ b/code/process-mqapi/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index 77b0b025..20532994 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/processes/crawling-process/build.gradle b/code/processes/crawling-process/build.gradle index 0bc95a4c..6ed789c6 100644 --- a/code/processes/crawling-process/build.gradle +++ b/code/processes/crawling-process/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + 
languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/processes/index-constructor-process/build.gradle b/code/processes/index-constructor-process/build.gradle index 5e48deea..ccec9b30 100644 --- a/code/processes/index-constructor-process/build.gradle +++ b/code/processes/index-constructor-process/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/processes/loading-process/build.gradle b/code/processes/loading-process/build.gradle index b44c27fa..c396c52f 100644 --- a/code/processes/loading-process/build.gradle +++ b/code/processes/loading-process/build.gradle @@ -6,7 +6,7 @@ plugins { } java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/processes/test-data/build.gradle b/code/processes/test-data/build.gradle index 4f184c80..4c2fef49 100644 --- a/code/processes/test-data/build.gradle +++ b/code/processes/test-data/build.gradle @@ -5,7 +5,7 @@ plugins { } java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/processes/website-adjacencies-calculator/build.gradle b/code/processes/website-adjacencies-calculator/build.gradle index a6e63d1f..63e56286 100644 --- a/code/processes/website-adjacencies-calculator/build.gradle +++ b/code/processes/website-adjacencies-calculator/build.gradle @@ -6,7 +6,7 @@ plugins { } java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/services-application/api-service/build.gradle b/code/services-application/api-service/build.gradle index 726f703d..9fa51a9f 100644 --- a/code/services-application/api-service/build.gradle +++ b/code/services-application/api-service/build.gradle @@ -3,12 +3,12 @@ plugins { id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.0' + id 'com.google.cloud.tools.jib' version '3.4.1' } java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/services-application/dating-service/build.gradle b/code/services-application/dating-service/build.gradle index 2435383c..a8cfd6e1 100644 --- a/code/services-application/dating-service/build.gradle +++ b/code/services-application/dating-service/build.gradle @@ -3,7 +3,7 @@ plugins { id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.0' + id 'com.google.cloud.tools.jib' version '3.4.1' } application { @@ -18,7 +18,7 @@ apply from: "$rootProject.projectDir/docker.gradle" java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } apply from: "$rootProject.projectDir/srcsets.gradle" diff --git a/code/services-application/explorer-service/build.gradle b/code/services-application/explorer-service/build.gradle index 0cac3ad4..da7e8a2e 100644 --- a/code/services-application/explorer-service/build.gradle +++ b/code/services-application/explorer-service/build.gradle @@ -3,7 +3,7 @@ plugins { id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.0' + id 'com.google.cloud.tools.jib' version '3.4.1' } application { @@ -18,7 +18,7 @@ apply from: "$rootProject.projectDir/docker.gradle" java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + 
languageVersion.set(JavaLanguageVersion.of(22)) } } apply from: "$rootProject.projectDir/srcsets.gradle" diff --git a/code/services-application/search-service/build.gradle b/code/services-application/search-service/build.gradle index 561073d0..d1a64e2e 100644 --- a/code/services-application/search-service/build.gradle +++ b/code/services-application/search-service/build.gradle @@ -5,7 +5,7 @@ plugins { id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.0' + id 'com.google.cloud.tools.jib' version '3.4.1' } application { @@ -18,7 +18,7 @@ tasks.distZip.enabled = false java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } sass { diff --git a/code/services-core/assistant-service/build.gradle b/code/services-core/assistant-service/build.gradle index f1a25b0f..3f51937f 100644 --- a/code/services-core/assistant-service/build.gradle +++ b/code/services-core/assistant-service/build.gradle @@ -3,7 +3,7 @@ plugins { id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.0' + id 'com.google.cloud.tools.jib' version '3.4.1' } application { @@ -15,7 +15,7 @@ tasks.distZip.enabled = false java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/services-core/control-service/build.gradle b/code/services-core/control-service/build.gradle index 2f34648f..787e3740 100644 --- a/code/services-core/control-service/build.gradle +++ b/code/services-core/control-service/build.gradle @@ -2,12 +2,12 @@ plugins { id 'java' id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.0' + id 'com.google.cloud.tools.jib' version '3.4.1' } java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/services-core/executor-service/build.gradle b/code/services-core/executor-service/build.gradle index 435e5ec6..08d80ff5 100644 --- a/code/services-core/executor-service/build.gradle +++ b/code/services-core/executor-service/build.gradle @@ -3,7 +3,7 @@ plugins { id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.0' + id 'com.google.cloud.tools.jib' version '3.4.1' } application { @@ -17,7 +17,7 @@ tasks.distZip.enabled = false java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/services-core/index-service/build.gradle b/code/services-core/index-service/build.gradle index 73913d45..63b2ca5a 100644 --- a/code/services-core/index-service/build.gradle +++ b/code/services-core/index-service/build.gradle @@ -3,7 +3,7 @@ plugins { id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.0' + id 'com.google.cloud.tools.jib' version '3.4.1' } application { @@ -18,7 +18,7 @@ apply from: "$rootProject.projectDir/docker.gradle" java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } apply from: "$rootProject.projectDir/srcsets.gradle" diff --git a/code/services-core/query-service/build.gradle b/code/services-core/query-service/build.gradle index 6857e5e8..7e80ac81 100644 --- a/code/services-core/query-service/build.gradle +++ b/code/services-core/query-service/build.gradle @@ -3,7 +3,7 @@ plugins { id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.0' + id 
'com.google.cloud.tools.jib' version '3.4.1' } application { @@ -18,7 +18,7 @@ apply from: "$rootProject.projectDir/docker.gradle" java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/tools/crawl-data-unfcker/build.gradle b/code/tools/crawl-data-unfcker/build.gradle index 40ec3bcb..755fba5e 100644 --- a/code/tools/crawl-data-unfcker/build.gradle +++ b/code/tools/crawl-data-unfcker/build.gradle @@ -8,7 +8,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/tools/experiment-runner/build.gradle b/code/tools/experiment-runner/build.gradle index 1923d4dd..d2cbc29b 100644 --- a/code/tools/experiment-runner/build.gradle +++ b/code/tools/experiment-runner/build.gradle @@ -8,7 +8,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/tools/load-test/build.gradle b/code/tools/load-test/build.gradle index 744333c8..ca14347e 100644 --- a/code/tools/load-test/build.gradle +++ b/code/tools/load-test/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/code/tools/screenshot-capture-tool/build.gradle b/code/tools/screenshot-capture-tool/build.gradle index 75aef932..27b7ee89 100644 --- a/code/tools/screenshot-capture-tool/build.gradle +++ b/code/tools/screenshot-capture-tool/build.gradle @@ -3,12 +3,12 @@ plugins { id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.0' + id 'com.google.cloud.tools.jib' version '3.4.1' } java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/settings.gradle b/settings.gradle index 06dc71e2..cfee1a8b 100644 --- a/settings.gradle +++ b/settings.gradle @@ -128,7 +128,7 @@ dependencyResolutionManagement { versionCatalogs { libs { - library('lombok', 'org.projectlombok', 'lombok').version('1.18.30') + library('lombok', 'org.projectlombok', 'lombok').version('1.18.32') library('mariadb-client', 'org.mariadb.jdbc', 'mariadb-java-client').version('3.0.6') library('hikaricp', 'com.zaxxer:HikariCP:5.0.1') diff --git a/third-party/commons-codec/build.gradle b/third-party/commons-codec/build.gradle index 9fbe9a5d..d5974fb9 100644 --- a/third-party/commons-codec/build.gradle +++ b/third-party/commons-codec/build.gradle @@ -5,7 +5,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/third-party/count-min-sketch/build.gradle b/third-party/count-min-sketch/build.gradle index b49a1ccd..52becdd0 100644 --- a/third-party/count-min-sketch/build.gradle +++ b/third-party/count-min-sketch/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/third-party/encyclopedia-marginalia-nu/build.gradle b/third-party/encyclopedia-marginalia-nu/build.gradle index faf794ec..f104b712 100644 --- a/third-party/encyclopedia-marginalia-nu/build.gradle +++ b/third-party/encyclopedia-marginalia-nu/build.gradle @@ -5,7 +5,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git 
a/third-party/monkey-patch-opennlp/build.gradle b/third-party/monkey-patch-opennlp/build.gradle index 1e63117b..a8aa4366 100644 --- a/third-party/monkey-patch-opennlp/build.gradle +++ b/third-party/monkey-patch-opennlp/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/third-party/openzim/build.gradle b/third-party/openzim/build.gradle index 2a1c9da0..12a35aa1 100644 --- a/third-party/openzim/build.gradle +++ b/third-party/openzim/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/third-party/parquet-floor/build.gradle b/third-party/parquet-floor/build.gradle index c7cafc81..0e9ed00e 100644 --- a/third-party/parquet-floor/build.gradle +++ b/third-party/parquet-floor/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/third-party/porterstemmer/build.gradle b/third-party/porterstemmer/build.gradle index b49a1ccd..52becdd0 100644 --- a/third-party/porterstemmer/build.gradle +++ b/third-party/porterstemmer/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/third-party/rdrpostagger/build.gradle b/third-party/rdrpostagger/build.gradle index 39308e53..1b076c14 100644 --- a/third-party/rdrpostagger/build.gradle +++ b/third-party/rdrpostagger/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } diff --git a/third-party/symspell/build.gradle b/third-party/symspell/build.gradle index b49a1ccd..52becdd0 100644 --- a/third-party/symspell/build.gradle +++ b/third-party/symspell/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(21)) + languageVersion.set(JavaLanguageVersion.of(22)) } } From 19684858818bcbb50d59b6ec1640735407b65207 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 21 Mar 2024 12:12:05 +0100 Subject: [PATCH 10/47] (docs) Upgrade to JDK22 --- run/readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run/readme.md b/run/readme.md index 0a890feb..5d87e93f 100644 --- a/run/readme.md +++ b/run/readme.md @@ -11,7 +11,7 @@ documentation. **Docker** - It is a bit of a pain to install, but if you follow [this guide](https://docs.docker.com/engine/install/ubuntu/#install-using-the-repository) you're on the right track for ubuntu-like systems. -**JDK 21** - The code uses Java 21 preview features. +**JDK 22** - The code uses Java 22 preview features. The civilized way of installing this is to use [SDKMAN](https://sdkman.io/); graalce is a good distribution choice but it doesn't matter too much. From bd0704d5a438ff0452bbad42fe0c5e86d916c113 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 21 Mar 2024 14:24:25 +0100 Subject: [PATCH 11/47] (*) Fix JDK22 migration issues A few bizarre build errors cropped up when migrating to JDK22. Not at all sure what caused them, but they were easy to mitigate. 
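For context on the converter hunk below: ForkJoinPool.commonPool().invokeAll(...) declares the checked InterruptedException, and writeProcessedDomain() appears to override an interface method that declares no checked exceptions, so the mitigation leans on Lombok's @SneakyThrows to let the exception propagate unchecked. A minimal, self-contained sketch of that idiom follows; the class and method names are hypothetical and are not the project's own code.

    // Sketch of the Lombok @SneakyThrows idiom, assuming Lombok is on the
    // annotation processor path. Names below are illustrative only.
    import lombok.SneakyThrows;

    import java.util.List;
    import java.util.concurrent.Callable;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.ForkJoinPool;

    class SneakyThrowsSketch {
        // An interface method that declares no checked exceptions
        interface Writer {
            void write(String value);
        }

        static class PoolWriter implements Writer {
            private final ExecutorService pool = ForkJoinPool.commonPool();

            @Override
            @SneakyThrows // invokeAll() declares InterruptedException, which this override cannot
            public void write(String value) {
                List<Callable<Object>> tasks = List.of(() -> value.toUpperCase());
                pool.invokeAll(tasks); // the checked exception is rethrown unchecked by Lombok
            }
        }
    }

An explicit try/catch that wraps InterruptedException in a RuntimeException would work just as well; @SneakyThrows merely keeps the override signature untouched.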
--- code/features-convert/anchor-keywords/build.gradle | 1 + .../nu/marginalia/converting/writer/ConverterBatchWriter.java | 2 ++ 2 files changed, 3 insertions(+) diff --git a/code/features-convert/anchor-keywords/build.gradle b/code/features-convert/anchor-keywords/build.gradle index bc5f44d8..ae92b066 100644 --- a/code/features-convert/anchor-keywords/build.gradle +++ b/code/features-convert/anchor-keywords/build.gradle @@ -24,6 +24,7 @@ dependencies { implementation libs.bundles.slf4j implementation libs.guice + implementation libs.trove implementation libs.bundles.mariadb implementation libs.duckdb implementation libs.notnull diff --git a/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java b/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java index 73333320..14972693 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java @@ -2,6 +2,7 @@ package nu.marginalia.converting.writer; import gnu.trove.list.TLongList; import gnu.trove.list.array.TLongArrayList; +import lombok.SneakyThrows; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.sideload.SideloadSource; @@ -61,6 +62,7 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter } @Override + @SneakyThrows public void writeProcessedDomain(ProcessedDomain domain) { var results = ForkJoinPool.commonPool().invokeAll( writeTasks(domain) From f82ebd7716522f45f58871916af2c3cd5149bd76 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 28 Mar 2024 13:01:21 +0100 Subject: [PATCH 12/47] (WIP) Query rendering finally beginning to look like it works --- .../query_parser/model/QWordGraph.java | 365 ++++++++++++++---- .../query_parser/model/QWordGraphTest.java | 111 ++++++ 2 files changed, 408 insertions(+), 68 deletions(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java index 474c4788..1d8fcd70 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java @@ -49,9 +49,9 @@ public class QWordGraph implements Iterable { var newWord = new QWord(wordId++, original, word); - for (var prev : getPrev(original)) + for (var prev : getPrevOriginal(original)) addLink(prev, newWord); - for (var next : getNext(original)) + for (var next : getNextOriginal(original)) addLink(newWord, next); } @@ -93,6 +93,12 @@ public class QWordGraph implements Iterable { .collect(Collectors.toList()); } + public QWord node(String word) { + return nodes().stream() + .filter(n -> n.word().equals(word)) + .findFirst() + .orElseThrow(); + } public List getNext(QWord word) { return fromTo.getOrDefault(word, List.of()); @@ -147,34 +153,25 @@ public class QWordGraph implements Iterable { return false; } - /** Returns a set of all nodes that are between 'begin' and 'end' in the graph, - * including the terminal nodes. This is useful for breaking up the graph into - * smaller components that can be evaluated in any order. - *

    - * It is assumed that there is a path from 'begin' to 'end' in the graph, and no - * other paths that bypass 'end'. - *

    - * The nodes are returned in the order they are encountered in a breadth-first search. - */ - public List nodesBetween(QWord begin, QWord end) { - List edge = new ArrayList<>(); - List visited = new ArrayList<>(); + public Map> forwardReachability() { + Map> ret = new HashMap<>(); - visited.add(begin); - edge.add(begin); + Set edge = Set.of(QWord.beg()); + Set visited = new HashSet<>(); while (!edge.isEmpty()) { - List next = new ArrayList<>(); + Set next = new LinkedHashSet<>(); for (var w : edge) { - if (Objects.equals(w, end)) - continue; - if (w.isEnd()) { - assert end.isEnd() : "Graph has a path beyond the specified end vertex " + end; + for (var n : getNext(w)) { + var set = ret.computeIfAbsent(n, k -> new HashSet<>()); + + set.add(w); + set.addAll(ret.getOrDefault(w, Set.of())); + + next.add(n); } - - next.addAll(getNext(w)); } next.removeAll(visited); @@ -182,67 +179,299 @@ public class QWordGraph implements Iterable { edge = next; } - return visited.stream().distinct().toList(); + return ret; } - /** Returns a list of subgraphs that are connected on the path from - * 'begin' to 'end'. This is useful for breaking up the graph into - * smaller components that can be evaluated in any order. - *

    - * The subgraphs are specified by their predecessor and successor nodes, - * - */ - public List getSubgraphs(QWord begin, QWord end) { - // Short-circuit for the common and simple case - if (getNext(begin).equals(List.of(end))) - return List.of(new QWordGraphLink(begin, end)); + public Map> reverseReachability() { + Map> ret = new HashMap<>(); - List subgraphs = new ArrayList<>(); + Set edge = Set.of(QWord.end()); + Set visited = new HashSet<>(); - List points = nodesBetween(begin, end) - .stream() - .filter(w -> !isBypassed(w, begin, end)) - .toList(); + while (!edge.isEmpty()) { + Set prev = new LinkedHashSet<>(); - for (int i = 0; i < points.size() - 1; i++) { - var a = points.get(i); - var b = points.get(i+1); + for (var w : edge) { - subgraphs.add(new QWordGraphLink(a, b)); + for (var p : getPrev(w)) { + var set = ret.computeIfAbsent(p, k -> new HashSet<>()); + + set.add(w); + set.addAll(ret.getOrDefault(w, Set.of())); + + prev.add(p); + } + } + + prev.removeAll(visited); + visited.addAll(prev); + edge = prev; } - return subgraphs; + return ret; + } + + public record ReachabilityData(List sortedNodes, + Map sortOrder, + + Map> forward, + Map> reverse) + { + public Set forward(QWord node) { + return forward.getOrDefault(node, Set.of()); + } + public Set reverse(QWord node) { + return reverse.getOrDefault(node, Set.of()); + } + + public Comparator topologicalComparator() { + return Comparator.comparing(sortOrder::get); + } + + } + + /** Gather data about graph reachability, including the topological order of nodes */ + public ReachabilityData reachability() { + var forwardReachability = forwardReachability(); + var reverseReachability = reverseReachability(); + + List nodes = new ArrayList<>(nodes()); + nodes.sort(new SetMembershipComparator<>(forwardReachability)); + + Map topologicalOrder = new HashMap<>(); + for (int i = 0; i < nodes.size(); i++) { + topologicalOrder.put(nodes.get(i), i); + } + + return new ReachabilityData(nodes, topologicalOrder, forwardReachability, reverseReachability); + } + + static class SetMembershipComparator implements Comparator { + private final Map> membership; + + SetMembershipComparator(Map> membership) { + this.membership = membership; + } + + @Override + public int compare(T o1, T o2) { + return Boolean.compare(isIn(o1, o2), isIn(o2, o1)); + } + + private boolean isIn(T a, T b) { + return membership.getOrDefault(a, Set.of()).contains(b); + } } public String compileToQuery() { - return compileToQuery(QWord.beg(), QWord.end()); + var wp = new WordPaths(QWord.beg(), QWord.end()); + return wp.render(reachability()); } - public String compileToQuery(QWord begin, QWord end) { - StringJoiner sj = new StringJoiner(" "); - for (var subgraph : getSubgraphs(begin, end)) { - if (getNext(subgraph.from).equals(List.of(subgraph.to))) { - if (subgraph.from.isBeg()) - continue; + class WordPaths { + private final Set paths; - sj.add(subgraph.from.word()); - } - else { - StringJoiner branchJoiner = new StringJoiner(" | ", "( ", " )"); - if (Objects.equals(subgraph.from, begin)) { - for (QWord path : getNext(subgraph.from)) { - branchJoiner.add(compileToQuery(path, subgraph.to)); - } - } - else { - branchJoiner.add(compileToQuery(subgraph.from, subgraph.to)); - } - sj.add(branchJoiner.toString()); - } + public final QWord begin; + public final QWord end; + + public WordPaths(Collection paths) { + this.paths = Collections.unmodifiableSet(new HashSet<>(paths)); + + begin = null; + end = null; } - return sj.toString(); + public WordPaths(QWord begin, QWord end) { + 
this.begin = begin; + this.end = end; + + this.paths = Collections.unmodifiableSet(listPaths()); + } + + public String render(ReachabilityData reachability) { + if (paths.size() == 1) { + return paths.iterator().next().stream().map(QWord::word).collect(Collectors.joining(" ")); + } + + Map commonality = paths.stream().flatMap(WordPath::stream) + .collect(Collectors.groupingBy(w -> w, Collectors.summingInt(w -> 1))); + + Set commonToAll = new HashSet<>(); + Set notCommonToAll = new HashSet<>(); + + commonality.forEach((k, v) -> { + if (v == paths.size()) { + commonToAll.add(k); + } + else { + notCommonToAll.add(k); + } + }); + + StringJoiner concat = new StringJoiner(" "); + if (!commonToAll.isEmpty()) { // Case where one or more words are common to all paths + + commonToAll.stream() + .sorted(reachability.topologicalComparator()) + .map(QWord::word) + .forEach(concat::add); + + // Deal portion of the paths that do not all share a common word + if (!notCommonToAll.isEmpty()) { + + List nonOverlappingPortions = new ArrayList<>(); + + for (var path : paths) { + // Project the path onto the divergent nodes (i.e. remove common nodes) + var np = path.project(notCommonToAll); + if (np.isEmpty()) + continue; + nonOverlappingPortions.add(np); + } + + if (nonOverlappingPortions.size() > 1) { + var wp = new WordPaths(nonOverlappingPortions); + concat.add(wp.render(reachability)); + } + else if (!nonOverlappingPortions.isEmpty()) { + var wp = new WordPaths(nonOverlappingPortions); + concat.add(wp.render(reachability)); + } + } + } + else if (commonality.size() > 1) { // The case where no words are common to all paths + + // Sort the words by commonality, so that we can consider the most common words first + List byCommonality = commonality.entrySet().stream().sorted(Map.Entry.comparingByValue()).map(Map.Entry::getKey).collect(Collectors.toList()).reversed(); + + Map> pathsByCommonWord = new HashMap<>(); + + // Mutable copy of the paths + List allDivergentPaths = new ArrayList<>(paths); + + for (var qw : byCommonality) { + if (allDivergentPaths.isEmpty()) + break; + + var iter = allDivergentPaths.iterator(); + while (iter.hasNext()) { + var path = iter.next(); + + if (!path.contains(qw)) { + continue; + } + + pathsByCommonWord + .computeIfAbsent(qw, k -> new ArrayList<>()) + .add(path.without(qw)); // Remove the common word from the path + + iter.remove(); + } + } + + var branches = pathsByCommonWord.entrySet().stream().map(e -> { + String commonWord = e.getKey().word(); + String branchPart = new WordPaths(e.getValue()).render(reachability); + return STR."\{commonWord} \{branchPart}"; + }) + .collect(Collectors.joining(" | ", " ( ", " ) ")); + + concat.add(branches); + + } + + // Remove any double spaces that may have been introduced + return concat.toString().replaceAll("\\s+", " "); + } + + + public Set listPaths() { + assert begin != null; + assert end != null; + + Set paths = new HashSet<>(); + listPaths(paths, new LinkedList<>(), begin, end); + return paths; + } + + private void listPaths(Set acc, + LinkedList stack, + QWord start, + QWord end) + { + stack.addLast(start); + + if (Objects.equals(start, end)) { + var nodes = new HashSet<>(stack); + + nodes.remove(this.begin); + nodes.remove(this.end); + + acc.add(new WordPath(nodes)); + } + else { + for (var next : getNext(start)) { + listPaths(acc, stack, next, end); + } + } + + stack.removeLast(); + } + } + + public static class WordPath { + private final Set nodes; + + WordPath(Collection nodes) { + this.nodes = new HashSet<>(nodes); + } + + 
public boolean contains(QWord node) { + return nodes.contains(node); + } + + public WordPath without(QWord word) { + Set newNodes = new HashSet<>(nodes); + newNodes.remove(word); + return new WordPath(newNodes); + } + + public Stream stream() { + return nodes.stream(); + } + + public WordPath project(Set nodes) { + return new WordPath(this.nodes.stream().filter(nodes::contains).collect(Collectors.toSet())); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + WordPath wordPath = (WordPath) o; + + return nodes.equals(wordPath.nodes); + } + + public boolean isEmpty() { + return nodes.isEmpty(); + } + + public int size() { + return nodes.size(); + } + + @Override + public int hashCode() { + return nodes.hashCode(); + } + + @Override + public String toString() { + return STR."WordPath{nodes=\{nodes}\{'}'}"; + } } @NotNull @@ -258,7 +487,7 @@ public class QWordGraph implements Iterable { @Override public QWord next() { - pos = getNextOriginal(pos).get(0); + pos = getNextOriginal(pos).getFirst(); return pos; } }; diff --git a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java index bd16b3cb..276d8697 100644 --- a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java +++ b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java @@ -2,6 +2,11 @@ package nu.marginalia.functions.searchquery.query_parser.model; import org.junit.jupiter.api.Test; +import java.util.Comparator; +import java.util.Set; + +import static org.junit.jupiter.api.Assertions.assertEquals; + class QWordGraphTest { @Test @@ -11,12 +16,14 @@ class QWordGraphTest { System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); System.out.println(graph.compileToQuery()); + graph.forwardReachability().entrySet().stream().sorted(Comparator.comparing(e -> e.getKey().ord())).forEach(System.out::println); graph.links().forEach(System.out::println); System.out.println("--"); graph.nodes().forEach(System.out::println); System.out.println("--"); graph.addVariant(graph.nodes().get(1), "sup"); System.out.println(graph.compileToQuery()); + graph.forwardReachability().entrySet().stream().sorted(Comparator.comparing(e -> e.getKey().ord())).forEach(System.out::println); System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); System.out.println("--"); @@ -33,5 +40,109 @@ class QWordGraphTest { graph.links().forEach(System.out::println); System.out.println("--"); graph.nodes().forEach(System.out::println); + graph.forwardReachability().entrySet().stream().sorted(Comparator.comparing(e -> e.getKey().ord())).forEach(System.out::println); + } + + @Test + void forwardReachability() { + // Construct a graph like + + // ^ - a - b - c - $ + // \- d -/ + QWordGraph graph = new QWordGraph("a", "b", "c"); + graph.addVariant(graph.node("b"), "d"); + + var reachability = graph.forwardReachability(); + + System.out.println(reachability.get(graph.node("a"))); + System.out.println(reachability.get(graph.node("b"))); + System.out.println(reachability.get(graph.node("c"))); + 
System.out.println(reachability.get(graph.node("d"))); + + assertEquals(Set.of(graph.node(" ^ ")), reachability.get(graph.node("a"))); + assertEquals(Set.of(graph.node(" ^ "), graph.node("a")), reachability.get(graph.node("b"))); + assertEquals(Set.of(graph.node(" ^ "), graph.node("a")), reachability.get(graph.node("d"))); + assertEquals(Set.of(graph.node(" ^ "), graph.node("a"), graph.node("b"), graph.node("d")), reachability.get(graph.node("c"))); + assertEquals(Set.of(graph.node(" ^ "), graph.node("a"), graph.node("b"), graph.node("d"), graph.node("c")), reachability.get(graph.node(" $ "))); + } + + + @Test + void reverseReachability() { + // Construct a graph like + + // ^ - a - b - c - $ + // \- d -/ + QWordGraph graph = new QWordGraph("a", "b", "c"); + graph.addVariant(graph.node("b"), "d"); + + var reachability = graph.reverseReachability(); + + System.out.println(reachability.get(graph.node("a"))); + System.out.println(reachability.get(graph.node("b"))); + System.out.println(reachability.get(graph.node("c"))); + System.out.println(reachability.get(graph.node("d"))); + + assertEquals(Set.of(graph.node(" $ ")), reachability.get(graph.node("c"))); + assertEquals(Set.of(graph.node(" $ "), graph.node("c")), reachability.get(graph.node("b"))); + assertEquals(Set.of(graph.node(" $ "), graph.node("c")), reachability.get(graph.node("d"))); + assertEquals(Set.of(graph.node(" $ "), graph.node("c"), graph.node("b"), graph.node("d")), reachability.get(graph.node("a"))); + assertEquals(Set.of(graph.node(" $ "), graph.node("c"), graph.node("b"), graph.node("d"), graph.node("a")), reachability.get(graph.node(" ^ "))); + } + + @Test + void testCompile1() { + // Construct a graph like + + // ^ - a - b - c - $ + // \- d -/ + QWordGraph graph = new QWordGraph("a", "b", "c"); + graph.addVariant(graph.node("b"), "d"); + + assertEquals(" ^ a(b|d)c $ ", graph.compileToQuery()); + } + @Test + void testCompile2() { + // Construct a graph like + + // ^ - a - b - c - $ + QWordGraph graph = new QWordGraph("a", "b", "c"); + + assertEquals(" ^ abc $ ", graph.compileToQuery()); + } + + @Test + void testCompile3() { + // Construct a graph like + + // ^ - a - b - c - $ + // \- d -/ + QWordGraph graph = new QWordGraph("a", "b", "c"); + graph.addVariant(graph.node("a"), "d"); + assertEquals(" ^ (a|d)bc $ ", graph.compileToQuery()); + } + + @Test + void testCompile4() { + // Construct a graph like + + // ^ - a - b - c - $ + // \- d -/ + QWordGraph graph = new QWordGraph("a", "b", "c"); + graph.addVariant(graph.node("c"), "d"); + assertEquals(" ^ ab(c|d) $ ", graph.compileToQuery()); + } + + @Test + void testCompile5() { + // Construct a graph like + + // /- e -\ + // ^ - a - b - c - $ + // \- d -/ + QWordGraph graph = new QWordGraph("a", "b", "c"); + graph.addVariant(graph.node("c"), "d"); + graph.addVariant(graph.node("b"), "e"); + assertEquals(" ^ a(b|e)(c|d) $ ", graph.compileToQuery()); } } \ No newline at end of file From 4cc11e183cb2526f80dda547206eb3de1e31c91e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 28 Mar 2024 13:11:26 +0100 Subject: [PATCH 13/47] (qs, WIP) Fix output determinism, fix tests --- .../searchquery/query_parser/model/QWordGraph.java | 6 ++++-- .../query_parser/model/QWordGraphTest.java | 11 ++++++----- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java index 
1d8fcd70..10aae867 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java @@ -370,7 +370,9 @@ public class QWordGraph implements Iterable { } } - var branches = pathsByCommonWord.entrySet().stream().map(e -> { + var branches = pathsByCommonWord.entrySet().stream() + .sorted(Map.Entry.comparingByKey(reachability.topologicalComparator())) + .map(e -> { String commonWord = e.getKey().word(); String branchPart = new WordPaths(e.getValue()).render(reachability); return STR."\{commonWord} \{branchPart}"; @@ -382,7 +384,7 @@ public class QWordGraph implements Iterable { } // Remove any double spaces that may have been introduced - return concat.toString().replaceAll("\\s+", " "); + return concat.toString().replaceAll("\\s+", " ").trim(); } diff --git a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java index 276d8697..f3201b9d 100644 --- a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java +++ b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java @@ -99,8 +99,9 @@ class QWordGraphTest { QWordGraph graph = new QWordGraph("a", "b", "c"); graph.addVariant(graph.node("b"), "d"); - assertEquals(" ^ a(b|d)c $ ", graph.compileToQuery()); + assertEquals("a c ( b | d )", graph.compileToQuery()); } + @Test void testCompile2() { // Construct a graph like @@ -108,7 +109,7 @@ class QWordGraphTest { // ^ - a - b - c - $ QWordGraph graph = new QWordGraph("a", "b", "c"); - assertEquals(" ^ abc $ ", graph.compileToQuery()); + assertEquals("a b c", graph.compileToQuery()); } @Test @@ -119,7 +120,7 @@ class QWordGraphTest { // \- d -/ QWordGraph graph = new QWordGraph("a", "b", "c"); graph.addVariant(graph.node("a"), "d"); - assertEquals(" ^ (a|d)bc $ ", graph.compileToQuery()); + assertEquals("b c ( a | d )", graph.compileToQuery()); } @Test @@ -130,7 +131,7 @@ class QWordGraphTest { // \- d -/ QWordGraph graph = new QWordGraph("a", "b", "c"); graph.addVariant(graph.node("c"), "d"); - assertEquals(" ^ ab(c|d) $ ", graph.compileToQuery()); + assertEquals("a b ( c | d )", graph.compileToQuery()); } @Test @@ -143,6 +144,6 @@ class QWordGraphTest { QWordGraph graph = new QWordGraph("a", "b", "c"); graph.addVariant(graph.node("c"), "d"); graph.addVariant(graph.node("b"), "e"); - assertEquals(" ^ a(b|e)(c|d) $ ", graph.compileToQuery()); + assertEquals("a ( b ( c | d ) | c e )", graph.compileToQuery()); } } \ No newline at end of file From fe62593286832ba096287a19a4c709c8bd8ec67c Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 28 Mar 2024 13:26:54 +0100 Subject: [PATCH 14/47] (qs, WIP) Break up code and tidy it up a bit --- .../query_parser/model/QWordGraph.java | 210 +----------------- .../model/QWordGraphPathLister.java | 57 +++++ .../query_parser/model/QWordPath.java | 66 ++++++ .../model/QWordPathsRenderer.java | 119 ++++++++++ 4 files changed, 243 insertions(+), 209 deletions(-) create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphPathLister.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPath.java create mode 100644 
code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java index 10aae867..20e4320d 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java @@ -264,218 +264,10 @@ public class QWordGraph implements Iterable { } public String compileToQuery() { - var wp = new WordPaths(QWord.beg(), QWord.end()); - return wp.render(reachability()); + return QWordPathsRenderer.render(this); } - class WordPaths { - private final Set paths; - - public final QWord begin; - public final QWord end; - - public WordPaths(Collection paths) { - this.paths = Collections.unmodifiableSet(new HashSet<>(paths)); - - begin = null; - end = null; - } - - public WordPaths(QWord begin, QWord end) { - this.begin = begin; - this.end = end; - - this.paths = Collections.unmodifiableSet(listPaths()); - } - - public String render(ReachabilityData reachability) { - if (paths.size() == 1) { - return paths.iterator().next().stream().map(QWord::word).collect(Collectors.joining(" ")); - } - - Map commonality = paths.stream().flatMap(WordPath::stream) - .collect(Collectors.groupingBy(w -> w, Collectors.summingInt(w -> 1))); - - Set commonToAll = new HashSet<>(); - Set notCommonToAll = new HashSet<>(); - - commonality.forEach((k, v) -> { - if (v == paths.size()) { - commonToAll.add(k); - } - else { - notCommonToAll.add(k); - } - }); - - StringJoiner concat = new StringJoiner(" "); - if (!commonToAll.isEmpty()) { // Case where one or more words are common to all paths - - commonToAll.stream() - .sorted(reachability.topologicalComparator()) - .map(QWord::word) - .forEach(concat::add); - - // Deal portion of the paths that do not all share a common word - if (!notCommonToAll.isEmpty()) { - - List nonOverlappingPortions = new ArrayList<>(); - - for (var path : paths) { - // Project the path onto the divergent nodes (i.e. 
remove common nodes) - var np = path.project(notCommonToAll); - if (np.isEmpty()) - continue; - nonOverlappingPortions.add(np); - } - - if (nonOverlappingPortions.size() > 1) { - var wp = new WordPaths(nonOverlappingPortions); - concat.add(wp.render(reachability)); - } - else if (!nonOverlappingPortions.isEmpty()) { - var wp = new WordPaths(nonOverlappingPortions); - concat.add(wp.render(reachability)); - } - } - } - else if (commonality.size() > 1) { // The case where no words are common to all paths - - // Sort the words by commonality, so that we can consider the most common words first - List byCommonality = commonality.entrySet().stream().sorted(Map.Entry.comparingByValue()).map(Map.Entry::getKey).collect(Collectors.toList()).reversed(); - - Map> pathsByCommonWord = new HashMap<>(); - - // Mutable copy of the paths - List allDivergentPaths = new ArrayList<>(paths); - - for (var qw : byCommonality) { - if (allDivergentPaths.isEmpty()) - break; - - var iter = allDivergentPaths.iterator(); - while (iter.hasNext()) { - var path = iter.next(); - - if (!path.contains(qw)) { - continue; - } - - pathsByCommonWord - .computeIfAbsent(qw, k -> new ArrayList<>()) - .add(path.without(qw)); // Remove the common word from the path - - iter.remove(); - } - } - - var branches = pathsByCommonWord.entrySet().stream() - .sorted(Map.Entry.comparingByKey(reachability.topologicalComparator())) - .map(e -> { - String commonWord = e.getKey().word(); - String branchPart = new WordPaths(e.getValue()).render(reachability); - return STR."\{commonWord} \{branchPart}"; - }) - .collect(Collectors.joining(" | ", " ( ", " ) ")); - - concat.add(branches); - - } - - // Remove any double spaces that may have been introduced - return concat.toString().replaceAll("\\s+", " ").trim(); - } - - - public Set listPaths() { - assert begin != null; - assert end != null; - - Set paths = new HashSet<>(); - listPaths(paths, new LinkedList<>(), begin, end); - return paths; - } - - private void listPaths(Set acc, - LinkedList stack, - QWord start, - QWord end) - { - stack.addLast(start); - - if (Objects.equals(start, end)) { - var nodes = new HashSet<>(stack); - - nodes.remove(this.begin); - nodes.remove(this.end); - - acc.add(new WordPath(nodes)); - } - else { - for (var next : getNext(start)) { - listPaths(acc, stack, next, end); - } - } - - stack.removeLast(); - } - } - - public static class WordPath { - private final Set nodes; - - WordPath(Collection nodes) { - this.nodes = new HashSet<>(nodes); - } - - public boolean contains(QWord node) { - return nodes.contains(node); - } - - public WordPath without(QWord word) { - Set newNodes = new HashSet<>(nodes); - newNodes.remove(word); - return new WordPath(newNodes); - } - - public Stream stream() { - return nodes.stream(); - } - - public WordPath project(Set nodes) { - return new WordPath(this.nodes.stream().filter(nodes::contains).collect(Collectors.toSet())); - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - WordPath wordPath = (WordPath) o; - - return nodes.equals(wordPath.nodes); - } - - public boolean isEmpty() { - return nodes.isEmpty(); - } - - public int size() { - return nodes.size(); - } - - @Override - public int hashCode() { - return nodes.hashCode(); - } - - @Override - public String toString() { - return STR."WordPath{nodes=\{nodes}\{'}'}"; - } - } - @NotNull @Override public Iterator iterator() { diff --git 
a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphPathLister.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphPathLister.java new file mode 100644 index 00000000..979a419b --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphPathLister.java @@ -0,0 +1,57 @@ +package nu.marginalia.functions.searchquery.query_parser.model; + +import java.util.HashSet; +import java.util.LinkedList; +import java.util.Objects; +import java.util.Set; + +/** Utility class for listing each path in a {@link QWordGraph}, from the beginning node to the end. + * Normally this would be a risk for combinatorial explosion, but in practice the graph will be constructed + * in a way that avoids this risk. + * */ +public class QWordGraphPathLister { + private final QWordGraph graph; + + public QWordGraphPathLister(QWordGraph graph) { + this.graph = graph; + } + + static Set listPaths(QWordGraph graph) { + return new QWordGraphPathLister(graph).listPaths(); + } + + Set listPaths() { + + Set paths = new HashSet<>(); + listPaths(paths, new LinkedList<>(), QWord.beg(), QWord.end()); + return paths; + } + + void listPaths(Set acc, + LinkedList stack, + QWord start, + QWord end) + { + stack.addLast(start); + + if (Objects.equals(start, end)) { + var nodes = new HashSet<>(stack); + + // Remove the start and end nodes from the path, as these are + // not part of the query but merely used to simplify the construction + // of the graph + + nodes.remove(QWord.beg()); + nodes.remove(QWord.end()); + + acc.add(new QWordPath(nodes)); + } + else { + for (var next : graph.getNext(start)) { + listPaths(acc, stack, next, end); + } + } + + stack.removeLast(); + } +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPath.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPath.java new file mode 100644 index 00000000..f8e859e3 --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPath.java @@ -0,0 +1,66 @@ +package nu.marginalia.functions.searchquery.query_parser.model; + +import java.util.Collection; +import java.util.HashSet; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** Represents a path of QWords in a QWordGraph. Since the order of operations when + * evaluating a query does not affect its semantics, only performance, the order of the + * nodes in the path is not significant; thus the path is represented with a set. + */ +public class QWordPath { + private final Set nodes; + + QWordPath(Collection nodes) { + this.nodes = new HashSet<>(nodes); + } + + public boolean contains(QWord node) { + return nodes.contains(node); + } + + /** Construct a new path by removing a word from the path. 
*/ + public QWordPath without(QWord word) { + Set newNodes = new HashSet<>(nodes); + newNodes.remove(word); + return new QWordPath(newNodes); + } + + public Stream stream() { + return nodes.stream(); + } + + public QWordPath project(Set nodes) { + return new QWordPath(this.nodes.stream().filter(nodes::contains).collect(Collectors.toSet())); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + QWordPath wordPath = (QWordPath) o; + + return nodes.equals(wordPath.nodes); + } + + public boolean isEmpty() { + return nodes.isEmpty(); + } + + public int size() { + return nodes.size(); + } + + @Override + public int hashCode() { + return nodes.hashCode(); + } + + @Override + public String toString() { + return STR."WordPath{nodes=\{nodes}\{'}'}"; + } +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java new file mode 100644 index 00000000..bc55d03b --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java @@ -0,0 +1,119 @@ +package nu.marginalia.functions.searchquery.query_parser.model; + +import java.util.*; +import java.util.stream.Collectors; + +/** Renders a set of QWordPaths into a human-readable infix-style expression. It's not guaranteed to find + * the globally optimal expression, but rather uses a greedy algorithm as a tradeoff in effort to outcome. + */ +class QWordPathsRenderer { + private final Set paths; + + private QWordPathsRenderer(Collection paths) { + this.paths = Collections.unmodifiableSet(new HashSet<>(paths)); + } + + private QWordPathsRenderer(QWordGraph graph) { + this.paths = Collections.unmodifiableSet(QWordGraphPathLister.listPaths(graph)); + } + + public static String render(QWordGraph graph) { + return new QWordPathsRenderer(graph).render(graph.reachability()); + } + + String render(QWordGraph.ReachabilityData reachability) { + if (paths.size() == 1) { + return paths.iterator().next().stream().map(QWord::word).collect(Collectors.joining(" ")); + } + + Map commonality = paths.stream().flatMap(QWordPath::stream) + .collect(Collectors.groupingBy(w -> w, Collectors.summingInt(w -> 1))); + + Set commonToAll = new HashSet<>(); + Set notCommonToAll = new HashSet<>(); + + commonality.forEach((k, v) -> { + if (v == paths.size()) { + commonToAll.add(k); + } else { + notCommonToAll.add(k); + } + }); + + StringJoiner concat = new StringJoiner(" "); + if (!commonToAll.isEmpty()) { // Case where one or more words are common to all paths + + commonToAll.stream() + .sorted(reachability.topologicalComparator()) + .map(QWord::word) + .forEach(concat::add); + + // Deal portion of the paths that do not all share a common word + if (!notCommonToAll.isEmpty()) { + + List nonOverlappingPortions = new ArrayList<>(); + + for (var path : paths) { + // Project the path onto the divergent nodes (i.e. 
remove common nodes) + var np = path.project(notCommonToAll); + if (np.isEmpty()) + continue; + nonOverlappingPortions.add(np); + } + + if (nonOverlappingPortions.size() > 1) { + var wp = new QWordPathsRenderer(nonOverlappingPortions); + concat.add(wp.render(reachability)); + } else if (!nonOverlappingPortions.isEmpty()) { + var wp = new QWordPathsRenderer(nonOverlappingPortions); + concat.add(wp.render(reachability)); + } + } + } else if (commonality.size() > 1) { // The case where no words are common to all paths + + // Sort the words by commonality, so that we can consider the most common words first + List byCommonality = commonality.entrySet().stream().sorted(Map.Entry.comparingByValue()).map(Map.Entry::getKey).collect(Collectors.toList()).reversed(); + + Map> pathsByCommonWord = new HashMap<>(); + + // Mutable copy of the paths + List allDivergentPaths = new ArrayList<>(paths); + + for (var commonWord : byCommonality) { + if (allDivergentPaths.isEmpty()) + break; + + var iter = allDivergentPaths.iterator(); + while (iter.hasNext()) { + var path = iter.next(); + + if (!path.contains(commonWord)) { + continue; + } + + pathsByCommonWord + .computeIfAbsent(commonWord, k -> new ArrayList<>()) + .add(path.without(commonWord)); // Remove the common word from the path + + iter.remove(); + } + } + + var branches = pathsByCommonWord.entrySet().stream() + .sorted(Map.Entry.comparingByKey(reachability.topologicalComparator())) // Sort by topological order to ensure consistent output + .map(e -> { + String commonWord = e.getKey().word(); + String branchPart = new QWordPathsRenderer(e.getValue()).render(reachability); + return STR."\{commonWord} \{branchPart}"; + }) + .collect(Collectors.joining(" | ", " ( ", " ) ")); + + concat.add(branches); + + } + + // Remove any double spaces that may have been introduced + return concat.toString().replaceAll("\\s+", " ").trim(); + } + +} From 15391c7a88a6e209578706418947f47ea3600b38 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 28 Mar 2024 13:54:30 +0100 Subject: [PATCH 15/47] (qs, WIP) Tidy it up a bit --- .../searchquery/query_parser/model/QWordPathsRenderer.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java index bc55d03b..ff4dd60c 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java @@ -10,11 +10,11 @@ class QWordPathsRenderer { private final Set paths; private QWordPathsRenderer(Collection paths) { - this.paths = Collections.unmodifiableSet(new HashSet<>(paths)); + this.paths = Set.copyOf(paths); } private QWordPathsRenderer(QWordGraph graph) { - this.paths = Collections.unmodifiableSet(QWordGraphPathLister.listPaths(graph)); + this.paths = Set.copyOf(QWordGraphPathLister.listPaths(graph)); } public static String render(QWordGraph graph) { From 51b0d6c0d3a25a2c0a66a11839eed9c5fd4815b2 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 28 Mar 2024 14:09:17 +0100 Subject: [PATCH 16/47] (qs, WIP) Tidy it up a bit --- .../query_parser/model/QWordPath.java | 2 + .../model/QWordPathsRenderer.java | 50 ++++++++++++------- 2 files changed, 34 insertions(+), 18 deletions(-) diff --git 
a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPath.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPath.java index f8e859e3..daa2a1f1 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPath.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPath.java @@ -32,6 +32,8 @@ public class QWordPath { return nodes.stream(); } + /** Construct a new path by projecting the path onto a set of nodes, such that + * the nodes in the new set is a strict subset of the provided nodes */ public QWordPath project(Set nodes) { return new QWordPath(this.nodes.stream().filter(nodes::contains).collect(Collectors.toSet())); } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java index ff4dd60c..a8e96837 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java @@ -21,17 +21,26 @@ class QWordPathsRenderer { return new QWordPathsRenderer(graph).render(graph.reachability()); } + /** Render the paths into a human-readable infix-style expression. + *
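+     * For example, the two paths "a b" and "a c" render as "a ( b | c )".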

    + * This method is recursive, but the recursion depth is limited by the + * maximum length of the paths, which is hard limited to a value typically around 10, + * so we don't need to worry about stack overflows here... + */ String render(QWordGraph.ReachabilityData reachability) { if (paths.size() == 1) { return paths.iterator().next().stream().map(QWord::word).collect(Collectors.joining(" ")); } + // Find the commonality of words in the paths + Map commonality = paths.stream().flatMap(QWordPath::stream) .collect(Collectors.groupingBy(w -> w, Collectors.summingInt(w -> 1))); - Set commonToAll = new HashSet<>(); - Set notCommonToAll = new HashSet<>(); + // Break the words into two categories: those that are common to all paths, and those that are not + List commonToAll = new ArrayList<>(); + Set notCommonToAll = new HashSet<>(); commonality.forEach((k, v) -> { if (v == paths.size()) { commonToAll.add(k); @@ -40,33 +49,32 @@ class QWordPathsRenderer { } }); - StringJoiner concat = new StringJoiner(" "); - if (!commonToAll.isEmpty()) { // Case where one or more words are common to all paths + StringJoiner resultJoiner = new StringJoiner(" "); - commonToAll.stream() - .sorted(reachability.topologicalComparator()) - .map(QWord::word) - .forEach(concat::add); + if (!commonToAll.isEmpty()) { // Case where one or more words are common to all paths + commonToAll.sort(reachability.topologicalComparator()); + + for (var word : commonToAll) { + resultJoiner.add(word.word()); + } // Deal portion of the paths that do not all share a common word if (!notCommonToAll.isEmpty()) { List nonOverlappingPortions = new ArrayList<>(); + // Create a new path for each path that does not contain the common words we just printed for (var path : paths) { - // Project the path onto the divergent nodes (i.e. 
remove common nodes) var np = path.project(notCommonToAll); if (np.isEmpty()) continue; nonOverlappingPortions.add(np); } - if (nonOverlappingPortions.size() > 1) { + // Recurse into the non-overlapping portions + if (!nonOverlappingPortions.isEmpty()) { var wp = new QWordPathsRenderer(nonOverlappingPortions); - concat.add(wp.render(reachability)); - } else if (!nonOverlappingPortions.isEmpty()) { - var wp = new QWordPathsRenderer(nonOverlappingPortions); - concat.add(wp.render(reachability)); + resultJoiner.add(wp.render(reachability)); } } } else if (commonality.size() > 1) { // The case where no words are common to all paths @@ -79,6 +87,7 @@ class QWordPathsRenderer { // Mutable copy of the paths List allDivergentPaths = new ArrayList<>(paths); + // Break the paths into branches by the first common word they contain, in order of decreasing commonality for (var commonWord : byCommonality) { if (allDivergentPaths.isEmpty()) break; @@ -91,10 +100,15 @@ class QWordPathsRenderer { continue; } + // Remove the common word from the path + var newPath = path.without(commonWord); + pathsByCommonWord .computeIfAbsent(commonWord, k -> new ArrayList<>()) - .add(path.without(commonWord)); // Remove the common word from the path + .add(newPath); + // Remove the path from the list of divergent paths since we've now accounted for it and + // we don't want redundant branches: iter.remove(); } } @@ -103,17 +117,17 @@ class QWordPathsRenderer { .sorted(Map.Entry.comparingByKey(reachability.topologicalComparator())) // Sort by topological order to ensure consistent output .map(e -> { String commonWord = e.getKey().word(); + // Recurse into the branches: String branchPart = new QWordPathsRenderer(e.getValue()).render(reachability); return STR."\{commonWord} \{branchPart}"; }) .collect(Collectors.joining(" | ", " ( ", " ) ")); - concat.add(branches); - + resultJoiner.add(branches); } // Remove any double spaces that may have been introduced - return concat.toString().replaceAll("\\s+", " ").trim(); + return resultJoiner.toString().replaceAll("\\s+", " ").trim(); } } From 9852b0e6091f10492914f0d3e6122011b16d9f2d Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 28 Mar 2024 14:18:26 +0100 Subject: [PATCH 17/47] (qs, WIP) Tidy it up a bit --- .../query_parser/model/QWordGraph.java | 3 ++- .../model/QWordPathsRenderer.java | 25 +++++++++++++------ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java index 20e4320d..272b7b35 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java @@ -225,7 +225,8 @@ public class QWordGraph implements Iterable { } public Comparator topologicalComparator() { - return Comparator.comparing(sortOrder::get); + Comparator comp = Comparator.comparing(sortOrder::get); + return comp.thenComparing(QWord::ord); } } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java index a8e96837..762a7d1b 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java +++ 
b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java @@ -21,6 +21,13 @@ class QWordPathsRenderer { return new QWordPathsRenderer(graph).render(graph.reachability()); } + + private static String render(Collection paths, + QWordGraph.ReachabilityData reachability) + { + return new QWordPathsRenderer(paths).render(reachability); + } + /** Render the paths into a human-readable infix-style expression. *

    * This method is recursive, but the recursion depth is limited by the @@ -34,8 +41,7 @@ class QWordPathsRenderer { // Find the commonality of words in the paths - Map commonality = paths.stream().flatMap(QWordPath::stream) - .collect(Collectors.groupingBy(w -> w, Collectors.summingInt(w -> 1))); + Map commonality = nodeCommonality(); // Break the words into two categories: those that are common to all paths, and those that are not @@ -72,10 +78,7 @@ class QWordPathsRenderer { } // Recurse into the non-overlapping portions - if (!nonOverlappingPortions.isEmpty()) { - var wp = new QWordPathsRenderer(nonOverlappingPortions); - resultJoiner.add(wp.render(reachability)); - } + resultJoiner.add(render(nonOverlappingPortions, reachability)); } } else if (commonality.size() > 1) { // The case where no words are common to all paths @@ -117,8 +120,10 @@ class QWordPathsRenderer { .sorted(Map.Entry.comparingByKey(reachability.topologicalComparator())) // Sort by topological order to ensure consistent output .map(e -> { String commonWord = e.getKey().word(); + // Recurse into the branches: - String branchPart = new QWordPathsRenderer(e.getValue()).render(reachability); + String branchPart = render(e.getValue(), reachability); + return STR."\{commonWord} \{branchPart}"; }) .collect(Collectors.joining(" | ", " ( ", " ) ")); @@ -130,4 +135,10 @@ class QWordPathsRenderer { return resultJoiner.toString().replaceAll("\\s+", " ").trim(); } + /** Compute how many paths each word is part of */ + private Map nodeCommonality() { + return paths.stream().flatMap(QWordPath::stream) + .collect(Collectors.groupingBy(w -> w, Collectors.summingInt(w -> 1))); + } + } From e596c929ac01a172b22cc373f458151f4fb07a29 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 28 Mar 2024 16:37:23 +0100 Subject: [PATCH 18/47] (qs, WIP) Clean up dead code --- .../query_parser/model/QWordGraph.java | 37 ++----------------- .../query_parser/model/QWordGraphTest.java | 34 ----------------- 2 files changed, 3 insertions(+), 68 deletions(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java index 272b7b35..4da9a6d1 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java @@ -10,7 +10,8 @@ import java.util.stream.Stream; * with a single start node and a single end node, denoted by QWord.beg() and QWord.end() respectively. *

    * Naively, every path from the start to the end node should represent a valid query variant, although in - * practice it is desirable to be clever about how to evaluate the paths, to avoid combinatorial explosion. + * practice it is desirable to be clever about how to evaluate the paths, to avoid a large number of queries + * being generated. */ public class QWordGraph implements Iterable { @@ -85,6 +86,7 @@ public class QWordGraph implements Iterable { public List links() { return Collections.unmodifiableList(links); } + public List nodes() { return links.stream() .flatMap(l -> Stream.of(l.from(), l.to())) @@ -120,39 +122,6 @@ public class QWordGraph implements Iterable { .toList(); } - // Returns true if removing the word would disconnect the graph - // so that there is no path from 'begin' to 'end'. This is useful - // in breaking up the graph into smaller component subgraphs, and - // understanding which vertexes can be re-ordered without changing - // the semantics of the encoded query. - public boolean isBypassed(QWord word, QWord begin, QWord end) { - Set edge = new HashSet<>(); - Set visited = new HashSet<>(); - - edge.add(begin); - - while (!edge.isEmpty()) { - Set next = new HashSet<>(); - - for (var w : edge) { - // Skip the word we're trying find a bypassing route for - if (w.ord() == word.ord()) - continue; - - if (Objects.equals(w, end)) - return true; - - next.addAll(getNext(w)); - } - - next.removeAll(visited); - visited.addAll(next); - edge = next; - } - - return false; - } - public Map> forwardReachability() { Map> ret = new HashMap<>(); diff --git a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java index f3201b9d..9c47e980 100644 --- a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java +++ b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java @@ -9,40 +9,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals; class QWordGraphTest { - @Test - public void testAddConstructor() { - QWordGraph graph = new QWordGraph("hello", "world"); - - System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); - System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); - System.out.println(graph.compileToQuery()); - graph.forwardReachability().entrySet().stream().sorted(Comparator.comparing(e -> e.getKey().ord())).forEach(System.out::println); - graph.links().forEach(System.out::println); - System.out.println("--"); - graph.nodes().forEach(System.out::println); - System.out.println("--"); - graph.addVariant(graph.nodes().get(1), "sup"); - System.out.println(graph.compileToQuery()); - graph.forwardReachability().entrySet().stream().sorted(Comparator.comparing(e -> e.getKey().ord())).forEach(System.out::println); - System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); - System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); - System.out.println("--"); - graph.links().forEach(System.out::println); - System.out.println("--"); - graph.nodes().forEach(System.out::println); - - graph.addVariantForSpan(graph.nodes().get(1), graph.nodes().get(2), "heyall"); - graph.addVariant(graph.nodes().get(2), "globe"); - System.out.println(graph.compileToQuery()); - 
System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); - System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); - System.out.println("--"); - graph.links().forEach(System.out::println); - System.out.println("--"); - graph.nodes().forEach(System.out::println); - graph.forwardReachability().entrySet().stream().sorted(Comparator.comparing(e -> e.getKey().ord())).forEach(System.out::println); - } - @Test void forwardReachability() { // Construct a graph like From 87bb93e1d4991188f831ca777a971e953d924b39 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 29 Mar 2024 12:40:27 +0100 Subject: [PATCH 19/47] (qs, WIP) Fix edge cases in query compilation This addresses the relatively common case where the graph consists of two segments, such as x y, z w; in this case we want an output like (x_y) (z w | z_w) | x y (z_w). The generated output does somewhat pessimize a few other cases, but this one is arguably more important. --- .../query_parser/QueryExpansion.java | 13 +++++---- .../searchquery/query_parser/model/QWord.java | 4 +++ .../query_parser/model/QWordGraph.java | 14 +++++++--- .../model/QWordGraphPathLister.java | 2 +- .../model/QWordPathsRenderer.java | 27 +++++++++++-------- .../query_parser/model/QWordGraphTest.java | 2 +- 6 files changed, 41 insertions(+), 21 deletions(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java index c216918e..6415751b 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java @@ -4,6 +4,7 @@ import ca.rmen.porterstemmer.PorterStemmer; import com.google.inject.Inject; import nu.marginalia.functions.searchquery.query_parser.model.QWord; import nu.marginalia.functions.searchquery.query_parser.model.QWordGraph; +import nu.marginalia.functions.searchquery.query_parser.model.QWordPathsRenderer; import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.apache.commons.lang3.StringUtils; @@ -11,6 +12,8 @@ import org.apache.commons.lang3.StringUtils; import java.util.ArrayList; import java.util.List; import java.util.regex.Pattern; +import java.util.stream.Collectors; +import java.util.stream.IntStream; public class QueryExpansion { private static final PorterStemmer ps = new PorterStemmer(); @@ -32,7 +35,7 @@ public class QueryExpansion { this.lexicon = lexicon; } - public QWordGraph expandQuery(List words) { + public String expandQuery(List words) { QWordGraph graph = new QWordGraph(words); @@ -40,7 +43,7 @@ public class QueryExpansion { strategy.expand(graph); } - return graph; + return QWordPathsRenderer.render(graph); } private static final Pattern dashPattern = Pattern.compile("-"); @@ -98,16 +101,16 @@ public class QueryExpansion { nodes.add(qw); } - String[] words = nodes.stream().map(QWord::word).toArray(String[]::new); + String[] words = nodes.stream().map(QWord::stemmed).toArray(String[]::new); // Look for known segments within the query for (int length = 2; length < Math.min(10, words.length); length++) { for (var segment : lexicon.findSegments(length, words)) { int start = segment.start(); int end = segment.start() + segment.length(); - var word = StringUtils.join(words, "_", start, end); + var word = 
IntStream.range(start, end).mapToObj(nodes::get).map(QWord::word).collect(Collectors.joining("_")); - graph.addVariantForSpan(nodes.get(start), nodes.get(end), word); + graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word); } } } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWord.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWord.java index b7c4e594..eac2e68b 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWord.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWord.java @@ -44,4 +44,8 @@ public record QWord( public QWord(int ord, QWord original, String word) { this(ord, true, ps.stemWord(word), word, original.original); } + + public String toString() { + return STR."q{\{word}}"; + } } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java index 4da9a6d1..a8b1a768 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java @@ -50,9 +50,9 @@ public class QWordGraph implements Iterable { var newWord = new QWord(wordId++, original, word); - for (var prev : getPrevOriginal(original)) + for (var prev : getPrev(original)) addLink(prev, newWord); - for (var next : getNextOriginal(original)) + for (var next : getNext(original)) addLink(newWord, next); } @@ -236,7 +236,15 @@ public class QWordGraph implements Iterable { public String compileToQuery() { return QWordPathsRenderer.render(this); } - + public String compileToDot() { + StringBuilder sb = new StringBuilder(); + sb.append("digraph {\n"); + for (var link : links) { + sb.append(STR."\"\{link.from().word()}\" -> \"\{link.to.word()}\";\n"); + } + sb.append("}\n"); + return sb.toString(); + } @NotNull @Override diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphPathLister.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphPathLister.java index 979a419b..f26c01f7 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphPathLister.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphPathLister.java @@ -16,7 +16,7 @@ public class QWordGraphPathLister { this.graph = graph; } - static Set listPaths(QWordGraph graph) { + public static Set listPaths(QWordGraph graph) { return new QWordGraphPathLister(graph).listPaths(); } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java index 762a7d1b..b1ee7956 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java @@ -6,7 +6,7 @@ import java.util.stream.Collectors; /** Renders a set of QWordPaths into a human-readable infix-style expression. 
It's not guaranteed to find * the globally optimal expression, but rather uses a greedy algorithm as a tradeoff in effort to outcome. */ -class QWordPathsRenderer { +public class QWordPathsRenderer { private final Set paths; private QWordPathsRenderer(Collection paths) { @@ -41,7 +41,7 @@ class QWordPathsRenderer { // Find the commonality of words in the paths - Map commonality = nodeCommonality(); + Map commonality = nodeCommonality(paths); // Break the words into two categories: those that are common to all paths, and those that are not @@ -82,32 +82,30 @@ class QWordPathsRenderer { } } else if (commonality.size() > 1) { // The case where no words are common to all paths - // Sort the words by commonality, so that we can consider the most common words first - List byCommonality = commonality.entrySet().stream().sorted(Map.Entry.comparingByValue()).map(Map.Entry::getKey).collect(Collectors.toList()).reversed(); + // Sort the words by commonality, so that we can consider the most common words first Map> pathsByCommonWord = new HashMap<>(); // Mutable copy of the paths List allDivergentPaths = new ArrayList<>(paths); // Break the paths into branches by the first common word they contain, in order of decreasing commonality - for (var commonWord : byCommonality) { - if (allDivergentPaths.isEmpty()) - break; + while (!allDivergentPaths.isEmpty()) { + QWord mostCommon = mostCommonQWord(allDivergentPaths); var iter = allDivergentPaths.iterator(); while (iter.hasNext()) { var path = iter.next(); - if (!path.contains(commonWord)) { + if (!path.contains(mostCommon)) { continue; } // Remove the common word from the path - var newPath = path.without(commonWord); + var newPath = path.without(mostCommon); pathsByCommonWord - .computeIfAbsent(commonWord, k -> new ArrayList<>()) + .computeIfAbsent(mostCommon, k -> new ArrayList<>()) .add(newPath); // Remove the path from the list of divergent paths since we've now accounted for it and @@ -136,9 +134,16 @@ class QWordPathsRenderer { } /** Compute how many paths each word is part of */ - private Map nodeCommonality() { + private static Map nodeCommonality(Collection paths) { return paths.stream().flatMap(QWordPath::stream) .collect(Collectors.groupingBy(w -> w, Collectors.summingInt(w -> 1))); } + private static QWord mostCommonQWord(Collection paths) { + assert !paths.isEmpty(); + return nodeCommonality(paths).entrySet().stream() + .max(Map.Entry.comparingByValue()) + .map(Map.Entry::getKey) + .orElseThrow(); + } } diff --git a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java index 9c47e980..f985cd13 100644 --- a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java +++ b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java @@ -110,6 +110,6 @@ class QWordGraphTest { QWordGraph graph = new QWordGraph("a", "b", "c"); graph.addVariant(graph.node("c"), "d"); graph.addVariant(graph.node("b"), "e"); - assertEquals("a ( b ( c | d ) | c e )", graph.compileToQuery()); + assertEquals("a ( c ( b | e ) | d ( b | e ) )", graph.compileToQuery()); } } \ No newline at end of file From 81815f3e0a9695e7cd8ec6d243a8311369ae2bbf Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 4 Apr 2024 20:17:58 +0200 Subject: [PATCH 20/47] (qs, index) New query model integrated with index service. 
Seems to work, tests are green and initial testing finds no errors. Still a bit untested, committing WIP as-is because it would suck to lose weeks of work due to a drive failure or something. --- code/functions/search-query/api/build.gradle | 1 + .../api/searchquery/IndexProtobufCodec.java | 37 ++-- .../api/searchquery/QueryProtobufCodec.java | 21 +-- .../model/compiled/CompiledQuery.java | 76 ++++++++ .../model/compiled/CompiledQueryLong.java | 42 +++++ .../model/compiled/CompiledQueryParser.java | 113 ++++++++++++ .../searchquery/model/compiled/CqData.java | 51 ++++++ .../model/compiled/CqDataLong.java | 27 +++ .../model/compiled/CqExpression.java | 170 ++++++++++++++++++ .../aggregate/CompiledQueryAggregates.java | 46 +++++ .../aggregate/CqBooleanAggregate.java | 40 +++++ .../aggregate/CqDoubleSumOperator.java | 40 +++++ .../aggregate/CqIntMaxMinOperator.java | 41 +++++ .../aggregate/CqLongBitmaskOperator.java | 40 +++++ .../aggregate/CqQueryPathsOperator.java | 75 ++++++++ .../model/query/QueryResponse.java | 6 +- .../{SearchSubquery.java => SearchQuery.java} | 26 +-- .../model/query/SearchSpecification.java | 2 +- .../model/results/SearchResultItem.java | 4 +- .../results/SearchResultKeywordScore.java | 9 +- .../api/src/main/protobuf/query-api.proto | 16 +- .../compiled/CompiledQueryParserTest.java | 79 ++++++++ .../CompiledQueryAggregatesTest.java | 35 ++++ .../index/client/IndexProtobufCodecTest.java | 7 +- .../searchquery/svc/QueryFactory.java | 50 +++--- .../svc/QuerySearchTermsAccumulator.java | 8 +- .../query/svc/QueryFactoryTest.java | 33 ++-- .../index/ReverseIndexEntrySource.java | 2 +- .../nu/marginalia/index/IndexGrpcService.java | 32 ++-- .../index/index/CombinedIndexReader.java | 7 + .../index/index/IndexQueryBuilderImpl.java | 16 ++ .../index/index/QueryBranchWalker.java | 78 ++++++++ .../marginalia/index/index/StatefulIndex.java | 105 +++++++---- .../index/model/SearchParameters.java | 27 +-- .../marginalia/index/model/SearchTerms.java | 26 ++- .../index/model/SearchTermsUtil.java | 20 --- .../index/results/IndexMetadataService.java | 43 ++--- .../results/IndexResultValuationContext.java | 109 +++++------ .../results/IndexResultValuatorService.java | 28 +-- .../ranking/results/ResultValuator.java | 36 ++-- .../ranking/results/factors/Bm25Factor.java | 29 ++- .../results/factors/TermCoherenceFactor.java | 17 +- .../index/query/IndexQueryBuilder.java | 3 + .../index/query/filter/QueryFilterAllOf.java | 57 ++++++ .../index/query/filter/QueryFilterAnyOf.java | 35 ++-- .../query/filter/QueryFilterLetThrough.java | 2 +- .../index/query/filter/QueryFilterNoPass.java | 2 +- .../QueryFilterStepExcludeFromPredicate.java | 2 +- .../filter/QueryFilterStepFromPredicate.java | 2 +- .../query/filter/QueryFilterStepIfTest.java | 26 +++ ...IndexQueryServiceIntegrationSmokeTest.java | 22 ++- .../IndexQueryServiceIntegrationTest.java | 106 ++++++----- .../index/index/QueryBranchWalkerTest.java | 59 ++++++ .../IndexResultDomainDeduplicatorTest.java | 5 +- .../ranking/results/ResultValuatorTest.java | 18 +- .../factors/TermCoherenceFactorTest.java | 19 +- .../marginalia/array/algo/LongArrayBase.java | 8 + .../array/buffer/LongQueryBuffer.java | 43 +++-- .../array/algo/LongArraySearchTest.java | 4 +- .../java/nu/marginalia/btree/BTreeReader.java | 4 +- .../BTreeReaderRejectRetainWithIndexTest.java | 6 +- ...reeReaderRejectRetainWithoutIndexTest.java | 7 +- .../search/SearchQueryParamFactory.java | 4 +- .../search/command/SearchAdtechParameter.java | 4 +- 
.../search/command/SearchJsParameter.java | 4 +- .../search/model/SearchProfile.java | 4 +- 66 files changed, 1613 insertions(+), 503 deletions(-) create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParser.java create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataLong.java create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqExpression.java create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqBooleanAggregate.java create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqDoubleSumOperator.java create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqLongBitmaskOperator.java create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqQueryPathsOperator.java rename code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/{SearchSubquery.java => SearchQuery.java} (76%) create mode 100644 code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParserTest.java create mode 100644 code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregatesTest.java create mode 100644 code/index/java/nu/marginalia/index/index/QueryBranchWalker.java create mode 100644 code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAllOf.java create mode 100644 code/index/test/nu/marginalia/index/index/QueryBranchWalkerTest.java diff --git a/code/functions/search-query/api/build.gradle b/code/functions/search-query/api/build.gradle index 727b5b86..1a8d55d2 100644 --- a/code/functions/search-query/api/build.gradle +++ b/code/functions/search-query/api/build.gradle @@ -30,6 +30,7 @@ dependencies { implementation libs.notnull implementation libs.guice implementation libs.gson + implementation libs.commons.lang3 implementation libs.bundles.protobuf implementation libs.bundles.grpc implementation libs.fastutil diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java index 4b2f0032..4d2cf7a6 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java @@ -1,7 +1,6 @@ package nu.marginalia.api.searchquery; -import nu.marginalia.api.searchquery.*; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import 
nu.marginalia.api.searchquery.model.results.Bm25Parameters; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.index.query.limit.QueryLimits; @@ -45,33 +44,37 @@ public class IndexProtobufCodec { .build(); } - public static SearchSubquery convertSearchSubquery(RpcSubquery subquery) { + public static SearchQuery convertRpcQuery(RpcQuery query) { List> coherences = new ArrayList<>(); - for (int j = 0; j < subquery.getCoherencesCount(); j++) { - var coh = subquery.getCoherences(j); + for (int j = 0; j < query.getCoherencesCount(); j++) { + var coh = query.getCoherences(j); coherences.add(new ArrayList<>(coh.getCoherencesList())); } - return new SearchSubquery( - subquery.getIncludeList(), - subquery.getExcludeList(), - subquery.getAdviceList(), - subquery.getPriorityList(), + return new SearchQuery( + query.getCompiledQuery(), + query.getIncludeList(), + query.getExcludeList(), + query.getAdviceList(), + query.getPriorityList(), coherences ); } - public static RpcSubquery convertSearchSubquery(SearchSubquery searchSubquery) { + public static RpcQuery convertRpcQuery(SearchQuery searchQuery) { var subqueryBuilder = - RpcSubquery.newBuilder() - .addAllAdvice(searchSubquery.getSearchTermsAdvice()) - .addAllExclude(searchSubquery.getSearchTermsExclude()) - .addAllInclude(searchSubquery.getSearchTermsInclude()) - .addAllPriority(searchSubquery.getSearchTermsPriority()); - for (var coherences : searchSubquery.searchTermCoherences) { + RpcQuery.newBuilder() + .setCompiledQuery(searchQuery.compiledQuery) + .addAllInclude(searchQuery.getSearchTermsInclude()) + .addAllAdvice(searchQuery.getSearchTermsAdvice()) + .addAllExclude(searchQuery.getSearchTermsExclude()) + .addAllPriority(searchQuery.getSearchTermsPriority()); + + for (var coherences : searchQuery.searchTermCoherences) { subqueryBuilder.addCoherencesBuilder().addAllCoherences(coherences); } + return subqueryBuilder.build(); } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java index 28d14c82..f0113870 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java @@ -2,7 +2,6 @@ package nu.marginalia.api.searchquery; import lombok.SneakyThrows; import nu.marginalia.api.searchquery.model.query.SearchSpecification; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.SearchResultItem; @@ -14,7 +13,6 @@ import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.query.QueryResponse; import java.util.ArrayList; -import java.util.List; public class QueryProtobufCodec { @@ -23,9 +21,7 @@ public class QueryProtobufCodec { builder.addAllDomains(request.getDomainIdsList()); - for (var subquery : query.specs.subqueries) { - builder.addSubqueries(IndexProtobufCodec.convertSearchSubquery(subquery)); - } + builder.setQuery(IndexProtobufCodec.convertRpcQuery(query.specs.query)); builder.setSearchSetIdentifier(query.specs.searchSetIdentifier); builder.setHumanQuery(request.getHumanQuery()); @@ -51,9 +47,7 @@ public class QueryProtobufCodec { public static RpcIndexQuery 
convertQuery(String humanQuery, ProcessedQuery query) { var builder = RpcIndexQuery.newBuilder(); - for (var subquery : query.specs.subqueries) { - builder.addSubqueries(IndexProtobufCodec.convertSearchSubquery(subquery)); - } + builder.setQuery(IndexProtobufCodec.convertRpcQuery(query.specs.query)); builder.setSearchSetIdentifier(query.specs.searchSetIdentifier); builder.setHumanQuery(humanQuery); @@ -147,8 +141,8 @@ public class QueryProtobufCodec { private static SearchResultKeywordScore convertKeywordScore(RpcResultKeywordScore keywordScores) { return new SearchResultKeywordScore( - keywordScores.getSubquery(), keywordScores.getKeyword(), + -1, // termId is internal to index service keywordScores.getEncodedWordMetadata(), keywordScores.getEncodedDocMetadata(), keywordScores.getHtmlFeatures() @@ -156,14 +150,8 @@ public class QueryProtobufCodec { } private static SearchSpecification convertSearchSpecification(RpcIndexQuery specs) { - List subqueries = new ArrayList<>(specs.getSubqueriesCount()); - - for (int i = 0; i < specs.getSubqueriesCount(); i++) { - subqueries.add(IndexProtobufCodec.convertSearchSubquery(specs.getSubqueries(i))); - } - return new SearchSpecification( - subqueries, + IndexProtobufCodec.convertRpcQuery(specs.getQuery()), specs.getDomainsList(), specs.getSearchSetIdentifier(), specs.getHumanQuery(), @@ -182,7 +170,6 @@ public class QueryProtobufCodec { .addAllDomainIds(params.domainIds()) .addAllTacitAdvice(params.tacitAdvice()) .addAllTacitExcludes(params.tacitExcludes()) - .addAllTacitIncludes(params.tacitIncludes()) .addAllTacitPriority(params.tacitPriority()) .setHumanQuery(params.humanQuery()) .setQueryLimits(IndexProtobufCodec.convertQueryLimits(params.limits())) diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java new file mode 100644 index 00000000..3ae850a3 --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java @@ -0,0 +1,76 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import org.jetbrains.annotations.NotNull; + +import java.util.Iterator; +import java.util.function.*; +import java.util.stream.IntStream; +import java.util.stream.Stream; + + +/** A compiled index service query. The class separates the topology of the query from the data, + * and it's possible to create new queries supplanting the data */ +public class CompiledQuery implements Iterable { + + /** The root expression, conveys the topology of the query */ + public final CqExpression root; + + private final CqData data; + + public CompiledQuery(CqExpression root, CqData data) { + this.root = root; + this.data = data; + } + + public CompiledQuery(CqExpression root, T[] data) { + this.root = root; + this.data = new CqData<>(data); + } + + /** Exists for testing, creates a simple query that ANDs all the provided items */ + public static CompiledQuery just(T... 
item) { + return new CompiledQuery<>(new CqExpression.And( + IntStream.range(0, item.length).mapToObj(CqExpression.Word::new).toList() + ), item); + } + + /** Create a new CompiledQuery mapping the leaf nodes using the provided mapper */ + public CompiledQuery map(Class clazz, Function mapper) { + return new CompiledQuery<>( + root, + data.map(clazz, mapper) + ); + } + + public CompiledQueryLong mapToLong(ToLongFunction mapper) { + return new CompiledQueryLong(root, data.mapToLong(mapper)); + } + + public CqExpression root() { + return root; + } + + public Stream stream() { + return data.stream(); + } + + public IntStream indices() { + return IntStream.range(0, data.size()); + } + + public T at(int index) { + return data.get(index); + } + + @NotNull + @Override + public Iterator iterator() { + return stream().iterator(); + } + + public int size() { + return data.size(); + } + + +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java new file mode 100644 index 00000000..639778dc --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java @@ -0,0 +1,42 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import org.jetbrains.annotations.NotNull; + +import java.util.Iterator; +import java.util.stream.IntStream; +import java.util.stream.LongStream; + + +/** A compiled index service query */ +public class CompiledQueryLong implements Iterable { + private final CqExpression root; + private final CqDataLong data; + + public CompiledQueryLong(CqExpression root, CqDataLong data) { + this.root = root; + this.data = data; + } + + + public CqExpression root() { + return root; + } + + public LongStream stream() { + return data.stream(); + } + + public IntStream indices() { + return IntStream.range(0, data.size()); + } + + public long at(int index) { + return data.get(index); + } + + @NotNull + @Override + public Iterator iterator() { + return stream().iterator(); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParser.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParser.java new file mode 100644 index 00000000..ae197fb9 --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParser.java @@ -0,0 +1,113 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import org.apache.commons.lang3.StringUtils; + +import java.util.*; + +/** Parser for a compiled index query */ +public class CompiledQueryParser { + + public static CompiledQuery parse(String query) { + List parts = tokenize(query); + + if (parts.isEmpty()) { + return new CompiledQuery<>( + CqExpression.empty(), + new CqData<>(new String[0]) + ); + } + + // We aren't interested in a binary tree representation, but an n-ary tree one, + // so a somewhat unusual parsing technique is used to avoid having an additional + // flattening step at the end. 
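+        // For example, parsing "a ( b c | d ) e" yields And[ a, Or[ And[ b, c ], d ], e ]
+        // in a single pass, with each distinct word assigned a stable index in wordIds.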
+ + // This is only possible due to the trivial and unambiguous grammar of the compiled queries + + List parenState = new ArrayList<>(); + parenState.add(new AndOrState()); + + Map wordIds = new HashMap<>(); + + for (var part : parts) { + var head = parenState.getLast(); + + if (part.equals("|")) { + head.or(); + } + else if (part.equals("(")) { + parenState.addLast(new AndOrState()); + } + else if (part.equals(")")) { + if (parenState.size() < 2) { + throw new IllegalStateException("Mismatched parentheses in expression: " + query); + } + parenState.removeLast(); + parenState.getLast().and(head.closeOr()); + } + else { + head.and( + new CqExpression.Word( + wordIds.computeIfAbsent(part, p -> wordIds.size()) + ) + ); + } + } + + if (parenState.size() != 1) + throw new IllegalStateException("Mismatched parentheses in expression: " + query); + + // Construct the CompiledQuery object with String:s as leaves + var root = parenState.getLast().closeOr(); + + String[] cqData = new String[wordIds.size()]; + wordIds.forEach((w, i) -> cqData[i] = w); + return new CompiledQuery<>(root, new CqData<>(cqData)); + + } + + private static class AndOrState { + private List andState = new ArrayList<>(); + private List orState = new ArrayList<>(); + + /** Add a new item to the and-list */ + public void and(CqExpression e) { + andState.add(e); + } + + /** Turn the and-list into an expression on the or-list, and then start a new and-list */ + public void or() { + closeAnd(); + + andState = new ArrayList<>(); + } + + /** Turn the and-list into an And-expression in the or-list */ + private void closeAnd() { + if (andState.size() == 1) + orState.add(andState.getFirst()); + else if (!andState.isEmpty()) + orState.add(new CqExpression.And(andState)); + } + + /** Finalize the current and-list, then turn the or-list into an Or-expression */ + public CqExpression closeOr() { + closeAnd(); + + if (orState.isEmpty()) + return CqExpression.empty(); + if (orState.size() == 1) + return orState.getFirst(); + + return new CqExpression.Or(orState); + } + } + + private static List tokenize(String query) { + // Each token is guaranteed to be separated by one or more space characters + + return Arrays.stream(StringUtils.split(query, ' ')) + .filter(StringUtils::isNotBlank) + .toList(); + } + +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java new file mode 100644 index 00000000..b1565dc0 --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java @@ -0,0 +1,51 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import java.lang.reflect.Array; +import java.util.Arrays; +import java.util.function.Function; +import java.util.function.ToDoubleFunction; +import java.util.function.ToLongFunction; +import java.util.stream.Stream; + +public class CqData { + private final T[] data; + + public CqData(T[] data) { + this.data = data; + } + + @SuppressWarnings("unchecked") + public CqData map(Class clazz, Function mapper) { + T2[] newData = (T2[]) Array.newInstance(clazz, data.length); + for (int i = 0; i < data.length; i++) { + newData[i] = mapper.apply((T) data[i]); + } + + return new CqData<>(newData); + } + + public CqDataLong mapToLong(ToLongFunction mapper) { + long[] newData = new long[data.length]; + for (int i = 0; i < data.length; i++) { + newData[i] = mapper.applyAsLong((T) data[i]); + } + + return new 
CqDataLong(newData); + } + + public T get(int i) { + return data[i]; + } + + public T get(CqExpression.Word w) { + return data[w.idx()]; + } + + public Stream stream() { + return Arrays.stream(data); + } + + public int size() { + return data.length; + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataLong.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataLong.java new file mode 100644 index 00000000..8049631e --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataLong.java @@ -0,0 +1,27 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import java.util.Arrays; +import java.util.stream.LongStream; + +public class CqDataLong { + private final long[] data; + + public CqDataLong(long[] data) { + this.data = data; + } + + public long get(int i) { + return data[i]; + } + public long get(CqExpression.Word w) { + return data[w.idx()]; + } + + public LongStream stream() { + return Arrays.stream(data); + } + + public int size() { + return data.length; + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqExpression.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqExpression.java new file mode 100644 index 00000000..e9972526 --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqExpression.java @@ -0,0 +1,170 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import java.util.List; +import java.util.StringJoiner; +import java.util.stream.Stream; + +/** Expression in a parsed index service query + * + */ +public sealed interface CqExpression { + + Stream stream(); + + /** @see nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates */ + long visit(LongVisitor visitor); + /** @see nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates */ + double visit(DoubleVisitor visitor); + /** @see nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates */ + int visit(IntVisitor visitor); + /** @see nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates */ + boolean visit(BoolVisitor visitor); + + T visit(ObjectVisitor visitor); + + static CqExpression empty() { + return new Or(List.of()); + } + + + record And(List parts) implements CqExpression { + @Override + public Stream stream() { + return parts.stream().flatMap(CqExpression::stream); + } + + @Override + public long visit(LongVisitor visitor) { + return visitor.onAnd(parts); + } + + @Override + public double visit(DoubleVisitor visitor) { + return visitor.onAnd(parts); + } + + @Override + public int visit(IntVisitor visitor) { + return visitor.onAnd(parts); + } + + @Override + public boolean visit(BoolVisitor visitor) { + return visitor.onAnd(parts); + } + + @Override + public T visit(ObjectVisitor visitor) { return visitor.onAnd(parts); } + + public String toString() { + StringJoiner sj = new StringJoiner(", ", "And[ ", "]"); + parts.forEach(part -> sj.add(part.toString())); + return sj.toString(); + } + + } + + record Or(List parts) implements CqExpression { + @Override + public Stream stream() { + return parts.stream().flatMap(CqExpression::stream); + } + + @Override + public long visit(LongVisitor visitor) { + return visitor.onOr(parts); + } + + @Override + public double visit(DoubleVisitor visitor) { + return visitor.onOr(parts); + } + + @Override + 
public int visit(IntVisitor visitor) { + return visitor.onOr(parts); + } + + @Override + public boolean visit(BoolVisitor visitor) { + return visitor.onOr(parts); + } + + @Override + public T visit(ObjectVisitor visitor) { return visitor.onOr(parts); } + + public String toString() { + StringJoiner sj = new StringJoiner(", ", "Or[ ", "]"); + parts.forEach(part -> sj.add(part.toString())); + return sj.toString(); + } + + + } + + record Word(int idx) implements CqExpression { + @Override + public Stream stream() { + return Stream.of(this); + } + + @Override + public long visit(LongVisitor visitor) { + return visitor.onLeaf(idx); + } + + @Override + public double visit(DoubleVisitor visitor) { + return visitor.onLeaf(idx); + } + + @Override + public int visit(IntVisitor visitor) { + return visitor.onLeaf(idx); + } + + @Override + public boolean visit(BoolVisitor visitor) { + return visitor.onLeaf(idx); + } + + @Override + public T visit(ObjectVisitor visitor) { return visitor.onLeaf(idx); } + + @Override + public String toString() { + return Integer.toString(idx); + } + } + + interface LongVisitor { + long onAnd(List parts); + long onOr(List parts); + long onLeaf(int idx); + } + + interface IntVisitor { + int onAnd(List parts); + int onOr(List parts); + int onLeaf(int idx); + } + + interface BoolVisitor { + boolean onAnd(List parts); + boolean onOr(List parts); + boolean onLeaf(int idx); + } + + interface DoubleVisitor { + double onAnd(List parts); + double onOr(List parts); + double onLeaf(int idx); + } + + interface ObjectVisitor { + T onAnd(List parts); + T onOr(List parts); + T onLeaf(int idx); + } + +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java new file mode 100644 index 00000000..209acbee --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java @@ -0,0 +1,46 @@ +package nu.marginalia.api.searchquery.model.compiled.aggregate; + +import it.unimi.dsi.fastutil.longs.LongSet; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; + +import java.util.ArrayList; +import java.util.List; +import java.util.function.*; + +public class CompiledQueryAggregates { + /** Compiled query aggregate that for a single boolean that treats or-branches as logical OR, + * and and-branches as logical AND operations. Will return true if there exists a path through + * the query where the provided predicate returns true for each item. + */ + static public boolean booleanAggregate(CompiledQuery query, Predicate predicate) { + return query.root.visit(new CqBooleanAggregate(query, predicate)); + } + + + /** Compiled query aggregate that for a 64b bitmask that treats or-branches as logical OR, + * and and-branches as logical AND operations. 
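+     * For example, for the compiled expression "( a | b ) c" this yields
+     * (operator(a) | operator(b)) & operator(c).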
+ */ + public static long longBitmaskAggregate(CompiledQuery query, ToLongFunction operator) { + return query.root.visit(new CqLongBitmaskOperator(query, operator)); + } + + + /** Apply the operator to each leaf node, then return the highest minimum value found along any path */ + public static int intMaxMinAggregate(CompiledQuery query, ToIntFunction operator) { + return query.root.visit(new CqIntMaxMinOperator(query, operator)); + } + + /** Apply the operator to each leaf node, and then return the highest sum of values possible + * through each branch in the compiled query. + * + */ + public static double doubleSumAggregate(CompiledQuery query, ToDoubleFunction operator) { + return query.root.visit(new CqDoubleSumOperator(query, operator)); + } + + /** Enumerate all possible paths through the compiled query */ + public static List queriesAggregate(CompiledQueryLong query) { + return new ArrayList<>(query.root().visit(new CqQueryPathsOperator(query))); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqBooleanAggregate.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqBooleanAggregate.java new file mode 100644 index 00000000..05ebf4c7 --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqBooleanAggregate.java @@ -0,0 +1,40 @@ +package nu.marginalia.api.searchquery.model.compiled.aggregate; + +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CqExpression; + +import java.util.List; +import java.util.function.IntPredicate; +import java.util.function.Predicate; + +public class CqBooleanAggregate implements CqExpression.BoolVisitor { + + private final IntPredicate predicate; + + public CqBooleanAggregate(CompiledQuery query, Predicate objPred) { + this.predicate = idx -> objPred.test(query.at(idx)); + } + + @Override + public boolean onAnd(List parts) { + for (var part : parts) { + if (!part.visit(this)) // short-circuit + return false; + } + return true; + } + + @Override + public boolean onOr(List parts) { + for (var part : parts) { + if (part.visit(this)) // short-circuit + return true; + } + return false; + } + + @Override + public boolean onLeaf(int idx) { + return predicate.test(idx); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqDoubleSumOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqDoubleSumOperator.java new file mode 100644 index 00000000..23d1904e --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqDoubleSumOperator.java @@ -0,0 +1,40 @@ +package nu.marginalia.api.searchquery.model.compiled.aggregate; + +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CqExpression; + +import java.util.List; +import java.util.function.IntToDoubleFunction; +import java.util.function.ToDoubleFunction; + +public class CqDoubleSumOperator implements CqExpression.DoubleVisitor { + + private final IntToDoubleFunction operator; + + public CqDoubleSumOperator(CompiledQuery query, ToDoubleFunction operator) { + this.operator = idx -> operator.applyAsDouble(query.at(idx)); + } + + @Override + public double onAnd(List parts) { + double value = 0; + for (var part : parts) { + value += part.visit(this); + } + 
return value; + } + + @Override + public double onOr(List parts) { + double value = parts.getFirst().visit(this); + for (int i = 1; i < parts.size(); i++) { + value = Math.max(value, parts.get(i).visit(this)); + } + return value; + } + + @Override + public double onLeaf(int idx) { + return operator.applyAsDouble(idx); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java new file mode 100644 index 00000000..b3ec86bb --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java @@ -0,0 +1,41 @@ +package nu.marginalia.api.searchquery.model.compiled.aggregate; + +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CqExpression; + +import java.util.List; +import java.util.function.IntUnaryOperator; +import java.util.function.ToIntFunction; + +public class CqIntMaxMinOperator implements CqExpression.IntVisitor { + + private final IntUnaryOperator operator; + + + public CqIntMaxMinOperator(CompiledQuery query, ToIntFunction operator) { + this.operator = idx -> operator.applyAsInt(query.at(idx)); + } + + @Override + public int onAnd(List parts) { + int value = parts.getFirst().visit(this); + for (int i = 1; i < parts.size(); i++) { + value = Math.min(value, parts.get(i).visit(this)); + } + return value; + } + + @Override + public int onOr(List parts) { + int value = parts.getFirst().visit(this); + for (int i = 1; i < parts.size(); i++) { + value = Math.max(value, parts.get(i).visit(this)); + } + return value; + } + + @Override + public int onLeaf(int idx) { + return operator.applyAsInt(idx); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqLongBitmaskOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqLongBitmaskOperator.java new file mode 100644 index 00000000..d9a4804b --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqLongBitmaskOperator.java @@ -0,0 +1,40 @@ +package nu.marginalia.api.searchquery.model.compiled.aggregate; + +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CqExpression; + +import java.util.List; +import java.util.function.IntToLongFunction; +import java.util.function.ToLongFunction; + +public class CqLongBitmaskOperator implements CqExpression.LongVisitor { + + private final IntToLongFunction operator; + + public CqLongBitmaskOperator(CompiledQuery query, ToLongFunction operator) { + this.operator = idx-> operator.applyAsLong(query.at(idx)); + } + + @Override + public long onAnd(List parts) { + long value = ~0L; + for (var part : parts) { + value &= part.visit(this); + } + return value; + } + + @Override + public long onOr(List parts) { + long value = 0L; + for (var part : parts) { + value |= part.visit(this); + } + return value; + } + + @Override + public long onLeaf(int idx) { + return operator.applyAsLong(idx); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqQueryPathsOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqQueryPathsOperator.java new file mode 
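For orientation, a minimal sketch of the aggregate semantics described above, mirroring the CompiledQueryAggregatesTest added later in this patch; it is not part of the change itself and assumes only the parse() entry point and the public aggregate methods shown here.

import static nu.marginalia.api.searchquery.model.compiled.CompiledQueryParser.parse;
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.*;

class AggregateSemanticsSketch {
    public static void main(String[] args) {
        // booleanAggregate: true if some path through the query satisfies the predicate at every leaf;
        // "( false ) | ( true )" has a path consisting of just "true", so the result is true.
        System.out.println(booleanAggregate(parse("( false ) | ( true )"), Boolean::parseBoolean));

        // intMaxMinAggregate: the highest minimum value found along any path;
        // the paths are {5, 3} and {6, 7}, so max(min(5,3), min(6,7)) = 6.
        System.out.println(intMaxMinAggregate(parse("5 3 | 6 7"), Integer::parseInt));

        // doubleSumAggregate: the highest sum achievable through any branch;
        // 1 + max(5 + 3, 2 + 10) = 13.
        System.out.println(doubleSumAggregate(parse("1 ( 5 3 | 2 10 )"), Double::parseDouble));
    }
}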
100644 index 00000000..2339104e --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqQueryPathsOperator.java @@ -0,0 +1,75 @@ +package nu.marginalia.api.searchquery.model.compiled.aggregate; + +import it.unimi.dsi.fastutil.longs.LongArraySet; +import it.unimi.dsi.fastutil.longs.LongOpenHashSet; +import it.unimi.dsi.fastutil.longs.LongSet; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.CqExpression; + +import java.util.ArrayList; +import java.util.List; + +public class CqQueryPathsOperator implements CqExpression.ObjectVisitor> { + private final CompiledQueryLong query; + + public CqQueryPathsOperator(CompiledQueryLong query) { + this.query = query; + } + + @Override + public List onAnd(List parts) { + return parts.stream() + .map(expr -> expr.visit(this)) + .reduce(List.of(), this::combineAnd); + } + + private List combineAnd(List a, List b) { + // No-op cases + if (a.isEmpty()) + return b; + if (b.isEmpty()) + return a; + + // Simple cases + if (a.size() == 1) { + b.forEach(set -> set.addAll(a.getFirst())); + return b; + } + else if (b.size() == 1) { + a.forEach(set -> set.addAll(b.getFirst())); + return a; + } + + // Case where we AND two ORs + List ret = new ArrayList<>(); + + for (var aPart : a) { + for (var bPart : b) { + LongSet set = new LongOpenHashSet(aPart.size() + bPart.size()); + set.addAll(aPart); + set.addAll(bPart); + ret.add(set); + } + } + + return ret; + } + + @Override + public List onOr(List parts) { + List ret = new ArrayList<>(); + + for (var part : parts) { + ret.addAll(part.visit(this)); + } + + return ret; + } + + @Override + public List onLeaf(int idx) { + var set = new LongArraySet(1); + set.add(query.at(idx)); + return List.of(set); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/QueryResponse.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/QueryResponse.java index 80e5b61a..1834c08f 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/QueryResponse.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/QueryResponse.java @@ -13,10 +13,6 @@ public record QueryResponse(SearchSpecification specs, String domain) { public Set getAllKeywords() { - Set keywords = new HashSet<>(100); - for (var sq : specs.subqueries) { - keywords.addAll(sq.searchTermsInclude); - } - return keywords; + return new HashSet<>(specs.query.searchTermsInclude); } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSubquery.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java similarity index 76% rename from code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSubquery.java rename to code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java index 3798ae89..9dd10396 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSubquery.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java @@ -13,9 +13,12 @@ import java.util.stream.Collectors; @AllArgsConstructor @With @EqualsAndHashCode -public class SearchSubquery { +public class SearchQuery { - /** These terms must be present in the document and are used in ranking*/ + /** An infix style 
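As an aside (not part of the patch), a sketch of what the path enumeration in CqQueryPathsOperator above produces; the term ids are invented here for readable output, whereas the index code derives them with SearchTermsUtil::getWordId via mapToLong.

import nu.marginalia.api.searchquery.model.compiled.CompiledQueryParser;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;

import java.util.Map;

class QueryPathsSketch {
    public static void main(String[] args) {
        var query = CompiledQueryParser.parse("foo ( bar | baz )");

        // Stand-in term ids, purely for illustration.
        Map<String, Long> fakeIds = Map.of("foo", 1L, "bar", 2L, "baz", 3L);
        var queryIds = query.mapToLong(fakeIds::get);

        // Each element is one way of satisfying the expression:
        // expected {1, 2} (foo AND bar) and {1, 3} (foo AND baz).
        var paths = CompiledQueryAggregates.queriesAggregate(queryIds);
        paths.forEach(System.out::println);
    }
}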
expression that encodes the required terms in the query */ + public final String compiledQuery; + + /** All terms that appear in {@see compiledQuery} */ public final List searchTermsInclude; /** These terms must be absent from the document */ @@ -33,7 +36,8 @@ public class SearchSubquery { @Deprecated // why does this exist? private double value = 0; - public SearchSubquery() { + public SearchQuery() { + this.compiledQuery = ""; this.searchTermsInclude = new ArrayList<>(); this.searchTermsExclude = new ArrayList<>(); this.searchTermsAdvice = new ArrayList<>(); @@ -41,11 +45,13 @@ public class SearchSubquery { this.searchTermCoherences = new ArrayList<>(); } - public SearchSubquery(List searchTermsInclude, - List searchTermsExclude, - List searchTermsAdvice, - List searchTermsPriority, - List> searchTermCoherences) { + public SearchQuery(String compiledQuery, + List searchTermsInclude, + List searchTermsExclude, + List searchTermsAdvice, + List searchTermsPriority, + List> searchTermCoherences) { + this.compiledQuery = compiledQuery; this.searchTermsInclude = searchTermsInclude; this.searchTermsExclude = searchTermsExclude; this.searchTermsAdvice = searchTermsAdvice; @@ -54,7 +60,7 @@ public class SearchSubquery { } @Deprecated // why does this exist? - public SearchSubquery setValue(double value) { + public SearchQuery setValue(double value) { if (Double.isInfinite(value) || Double.isNaN(value)) { this.value = Double.MAX_VALUE; } else { @@ -66,7 +72,7 @@ public class SearchSubquery { @Override public String toString() { StringBuilder sb = new StringBuilder(); - if (!searchTermsInclude.isEmpty()) sb.append("include=").append(searchTermsInclude.stream().collect(Collectors.joining(",", "[", "] "))); + if (!compiledQuery.isEmpty()) sb.append("compiledQuery=").append(compiledQuery); if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] "))); if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] "))); if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] "))); diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSpecification.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSpecification.java index be2a6895..bbb5b7ae 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSpecification.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSpecification.java @@ -10,7 +10,7 @@ import java.util.List; @ToString @Getter @Builder @With @AllArgsConstructor public class SearchSpecification { - public List subqueries; + public SearchQuery query; /** If present and not empty, limit the search to these domain IDs */ public List domains; diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java index cc02ae28..8f50c9fb 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java @@ -21,9 +21,9 @@ public class SearchResultItem implements Comparable { /** How many other potential results 
existed in the same domain */ public int resultsFromDomain; - public SearchResultItem(long combinedId, int scoresCount) { + public SearchResultItem(long combinedId) { this.combinedId = combinedId; - this.keywordScores = new ArrayList<>(scoresCount); + this.keywordScores = new ArrayList<>(); } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java index b84dad0b..f5a9fc02 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java @@ -7,19 +7,22 @@ import nu.marginalia.model.idx.DocumentMetadata; import java.util.Objects; public final class SearchResultKeywordScore { + @Deprecated public final int subquery; + public final long termId; public final String keyword; private final long encodedWordMetadata; private final long encodedDocMetadata; private final int htmlFeatures; - public SearchResultKeywordScore(int subquery, - String keyword, + public SearchResultKeywordScore(String keyword, + long termId, long encodedWordMetadata, long encodedDocMetadata, int htmlFeatures) { - this.subquery = subquery; + this.termId = termId; + this.subquery = -1; // FIXME, deprecated this.keyword = keyword; this.encodedWordMetadata = encodedWordMetadata; this.encodedDocMetadata = encodedDocMetadata; diff --git a/code/functions/search-query/api/src/main/protobuf/query-api.proto b/code/functions/search-query/api/src/main/protobuf/query-api.proto index f5ec5e8d..606b18f8 100644 --- a/code/functions/search-query/api/src/main/protobuf/query-api.proto +++ b/code/functions/search-query/api/src/main/protobuf/query-api.proto @@ -52,7 +52,7 @@ message RpcTemporalBias { /* Index service query request */ message RpcIndexQuery { - repeated RpcSubquery subqueries = 1; + RpcQuery query = 1; repeated int32 domains = 2; // (optional) A list of domain IDs to consider string searchSetIdentifier = 3; // (optional) A named set of domains to consider string humanQuery = 4; // The search query as the user entered it @@ -102,12 +102,11 @@ message RpcRawResultItem { /* Information about how well a keyword matches a query */ message RpcResultKeywordScore { - int32 subquery = 1; // index of the subquery this keyword relates to - string keyword = 2; // the keyword - int64 encodedWordMetadata = 3; // bit encoded word metadata - int64 encodedDocMetadata = 4; // bit encoded document metadata - bool hasPriorityTerms = 5; // true if this word is important to the document - int32 htmlFeatures = 6; // bit encoded document features + string keyword = 1; // the keyword + int64 encodedWordMetadata = 2; // bit encoded word metadata + int64 encodedDocMetadata = 3; // bit encoded document metadata + bool hasPriorityTerms = 4; // true if this word is important to the document + int32 htmlFeatures = 5; // bit encoded document features } /* Query execution parameters */ @@ -137,12 +136,13 @@ message RpcResultRankingParameters { } /* Defines a single subquery */ -message RpcSubquery { +message RpcQuery { repeated string include = 1; // These terms must be present repeated string exclude = 2; // These terms must be absent repeated string advice = 3; // These terms must be present, but do not affect ranking repeated string priority = 4; // These terms are not mandatory, but affect ranking positively 
if they are present repeated RpcCoherences coherences = 5; // Groups of terms that must exist in proximity of each other + string compiledQuery = 6; // Compiled query in infix notation } /* Defines a group of search terms that must exist in close proximity within the document */ diff --git a/code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParserTest.java b/code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParserTest.java new file mode 100644 index 00000000..47983820 --- /dev/null +++ b/code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParserTest.java @@ -0,0 +1,79 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.*; + +class CompiledQueryParserTest { + + @Test + public void testEmpty() { + assertEquals(CqExpression.empty(), CompiledQueryParser.parse("").root); + assertEquals(CqExpression.empty(), CompiledQueryParser.parse("( )").root); + assertEquals(CqExpression.empty(), CompiledQueryParser.parse("( | )").root); + assertEquals(CqExpression.empty(), CompiledQueryParser.parse("| ( | ) |").root); + } + + @Test + public void testSingleWord() { + CompiledQuery q = CompiledQueryParser.parse("foo"); + assertEquals(w(q, "foo"), q.root); + } + + @Test + public void testAndTwoWords() { + CompiledQuery q = CompiledQueryParser.parse("foo bar"); + assertEquals(and(w(q, "foo"), w(q,"bar")), q.root); + } + + @Test + public void testOrTwoWords() { + CompiledQuery q = CompiledQueryParser.parse("foo | bar"); + assertEquals(or(w(q, "foo"), w(q,"bar")), q.root); + } + + @Test + public void testOrAndWords() { + CompiledQuery q = CompiledQueryParser.parse("foo | bar baz"); + assertEquals(or(w(q,"foo"), and(w(q,"bar"), w(q,"baz"))), q.root); + } + + @Test + public void testAndAndOrAndAndWords() { + CompiledQuery q = CompiledQueryParser.parse("foo foobar | bar baz"); + assertEquals(or( + and(w(q, "foo"), w(q, "foobar")), + and(w(q, "bar"), w(q, "baz"))) + , q.root); + } + @Test + public void testComplex1() { + CompiledQuery q = CompiledQueryParser.parse("foo ( bar | baz ) quux"); + assertEquals(and(w(q,"foo"), or(w(q, "bar"), w(q, "baz")), w(q, "quux")), q.root); + } + @Test + public void testComplex2() { + CompiledQuery q = CompiledQueryParser.parse("( ( ( a ) b ) c ) d"); + assertEquals(and(and(and(w(q, "a"), w(q, "b")), w(q, "c")), w(q, "d")), q.root); + } + + @Test + public void testNested() { + CompiledQuery q = CompiledQueryParser.parse("( ( ( a ) ) )"); + assertEquals(w(q,"a"), q.root); + } + + private CqExpression.Word w(CompiledQuery query, String word) { + return new CqExpression.Word(query.indices().filter(idx -> word.equals(query.at(idx))).findAny().orElseThrow()); + } + + private CqExpression and(CqExpression... parts) { + return new CqExpression.And(List.of(parts)); + } + + private CqExpression or(CqExpression... 
parts) { + return new CqExpression.Or(List.of(parts)); + } +} \ No newline at end of file diff --git a/code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregatesTest.java b/code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregatesTest.java new file mode 100644 index 00000000..c3e36180 --- /dev/null +++ b/code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregatesTest.java @@ -0,0 +1,35 @@ +package nu.marginalia.api.searchquery.model.compiled.aggregate; + +import static nu.marginalia.api.searchquery.model.compiled.CompiledQueryParser.parse; +import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.*; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +class CompiledQueryAggregatesTest { + + @Test + void booleanAggregates() { + assertFalse(booleanAggregate(parse("false"), Boolean::parseBoolean)); + assertTrue(booleanAggregate(parse("true"), Boolean::parseBoolean)); + assertFalse(booleanAggregate(parse("false true"), Boolean::parseBoolean)); + assertTrue(booleanAggregate(parse("( true ) | ( true false )"), Boolean::parseBoolean)); + assertTrue(booleanAggregate(parse("( false ) | ( true )"), Boolean::parseBoolean)); + assertTrue(booleanAggregate(parse("( true false ) | ( true true )"), Boolean::parseBoolean)); + assertFalse(booleanAggregate(parse("( true false ) | ( true false )"), Boolean::parseBoolean)); + } + + @Test + void intMaxMinAggregates() { + assertEquals(5, intMaxMinAggregate(parse("5"), Integer::parseInt)); + assertEquals(3, intMaxMinAggregate(parse("5 3"), Integer::parseInt)); + assertEquals(6, intMaxMinAggregate(parse("5 3 | 6 7"), Integer::parseInt)); + } + + @Test + void doubleSumAggregates() { + assertEquals(5, (int) doubleSumAggregate(parse("5"), Double::parseDouble)); + assertEquals(8, (int) doubleSumAggregate(parse("5 3"), Double::parseDouble)); + assertEquals(13, (int) doubleSumAggregate(parse("1 ( 5 3 | 2 10 )"), Double::parseDouble)); + } +} \ No newline at end of file diff --git a/code/functions/search-query/api/test/nu/marginalia/index/client/IndexProtobufCodecTest.java b/code/functions/search-query/api/test/nu/marginalia/index/client/IndexProtobufCodecTest.java index 1782765d..e93f715c 100644 --- a/code/functions/search-query/api/test/nu/marginalia/index/client/IndexProtobufCodecTest.java +++ b/code/functions/search-query/api/test/nu/marginalia/index/client/IndexProtobufCodecTest.java @@ -1,7 +1,7 @@ package nu.marginalia.index.client; import nu.marginalia.api.searchquery.IndexProtobufCodec; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.SpecificationLimit; @@ -35,14 +35,15 @@ class IndexProtobufCodecTest { } @Test public void testSubqery() { - verifyIsIdentityTransformation(new SearchSubquery( + verifyIsIdentityTransformation(new SearchQuery( + "qs", List.of("a", "b"), List.of("c", "d"), List.of("e", "f"), List.of("g", "h"), List.of(List.of("i", "j"), List.of("k")) ), - s -> IndexProtobufCodec.convertSearchSubquery(IndexProtobufCodec.convertSearchSubquery(s)) + s -> IndexProtobufCodec.convertRpcQuery(IndexProtobufCodec.convertRpcQuery(s)) ); } private void 
verifyIsIdentityTransformation(T val, Function transformation) { diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java index 3c0e5219..55467b4f 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java @@ -2,18 +2,16 @@ package nu.marginalia.functions.searchquery.svc; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.LanguageModels; import nu.marginalia.api.searchquery.model.query.SearchSpecification; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; -import nu.marginalia.util.language.EnglishDictionary; +import nu.marginalia.functions.searchquery.query_parser.QueryExpansion; import nu.marginalia.language.WordPatterns; import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.query.ProcessedQuery; import nu.marginalia.functions.searchquery.query_parser.QueryParser; import nu.marginalia.functions.searchquery.query_parser.token.Token; import nu.marginalia.functions.searchquery.query_parser.token.TokenType; -import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -26,15 +24,14 @@ import java.util.List; public class QueryFactory { private final Logger logger = LoggerFactory.getLogger(getClass()); - private static final int RETAIN_QUERY_VARIANT_COUNT = 5; private final QueryParser queryParser = new QueryParser(); + private final QueryExpansion queryExpansion; @Inject - public QueryFactory(LanguageModels lm, - TermFrequencyDict dict, - EnglishDictionary englishDictionary) + public QueryFactory(QueryExpansion queryExpansion) { + this.queryExpansion = queryExpansion; } @@ -49,8 +46,6 @@ public class QueryFactory { List searchTermsHuman = new ArrayList<>(); List problems = new ArrayList<>(); - String domain = null; - List basicQuery = queryParser.parse(query); if (basicQuery.size() >= 12) { @@ -74,19 +69,8 @@ public class QueryFactory { t.visit(qualityLimits); } -// var queryPermutations = queryPermutation.permuteQueriesNew(basicQuery); - List subqueries = new ArrayList<>(); QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(basicQuery); - domain = termsAccumulator.domain; - -// for (var parts : queryPermutations) { -// QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(basicQuery); -// -// domain = termsAccumulator.domain; -// -// SearchSubquery subquery = termsAccumulator.createSubquery(); -// subqueries.add(subquery); -// } + String domain = termsAccumulator.domain; List domainIds = params.domainIds(); @@ -97,7 +81,18 @@ public class QueryFactory { } var specsBuilder = SearchSpecification.builder() - .subqueries(subqueries) + .query( + new SearchQuery( + queryExpansion.expandQuery( + termsAccumulator.searchTermsInclude + ), + termsAccumulator.searchTermsInclude, + termsAccumulator.searchTermsExclude, + termsAccumulator.searchTermsAdvice, + termsAccumulator.searchTermsPriority, + termsAccumulator.searchTermCoherences + ) + ) .humanQuery(query) .quality(qualityLimits.qualityLimit) .year(qualityLimits.year) @@ -111,12 +106,9 @@ public class QueryFactory { 
SearchSpecification specs = specsBuilder.build(); - for (var sq : specs.subqueries) { - sq.searchTermsAdvice.addAll(params.tacitAdvice()); - sq.searchTermsPriority.addAll(params.tacitPriority()); - sq.searchTermsInclude.addAll(params.tacitIncludes()); - sq.searchTermsExclude.addAll(params.tacitExcludes()); - } + specs.query.searchTermsAdvice.addAll(params.tacitAdvice()); + specs.query.searchTermsPriority.addAll(params.tacitPriority()); + specs.query.searchTermsExclude.addAll(params.tacitExcludes()); return new ProcessedQuery(specs, searchTermsHuman, domain); } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java index e4def0d0..cc3a7e56 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java @@ -1,6 +1,6 @@ package nu.marginalia.functions.searchquery.svc; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.language.WordPatterns; import nu.marginalia.functions.searchquery.query_parser.token.Token; import nu.marginalia.functions.searchquery.query_parser.token.TokenVisitor; @@ -9,7 +9,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -/** @see SearchSubquery */ +/** @see SearchQuery */ public class QuerySearchTermsAccumulator implements TokenVisitor { public List searchTermsExclude = new ArrayList<>(); public List searchTermsInclude = new ArrayList<>(); @@ -19,10 +19,6 @@ public class QuerySearchTermsAccumulator implements TokenVisitor { public String domain; - public SearchSubquery createSubquery() { - return new SearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchTermCoherences); - } - public QuerySearchTermsAccumulator(List parts) { for (Token t : parts) { t.visit(this); diff --git a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java index 24131143..132944c4 100644 --- a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java +++ b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java @@ -3,12 +3,13 @@ package nu.marginalia.query.svc; import nu.marginalia.WmsaHome; import nu.marginalia.api.searchquery.model.query.SearchSpecification; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; +import nu.marginalia.functions.searchquery.query_parser.QueryExpansion; import nu.marginalia.functions.searchquery.svc.QueryFactory; import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.SpecificationLimit; import nu.marginalia.index.query.limit.SpecificationLimitType; -import nu.marginalia.util.language.EnglishDictionary; +import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.junit.jupiter.api.BeforeAll; @@ -27,11 +28,9 @@ public class QueryFactoryTest { public static void setUpAll() throws IOException { var lm = WmsaHome.getLanguageModels(); - var tfd = new TermFrequencyDict(lm); - queryFactory = 
new QueryFactory(lm, - tfd, - new EnglishDictionary(tfd) + queryFactory = new QueryFactory( + new QueryExpansion(new TermFrequencyDict(lm), new NgramLexicon(lm)) ); } @@ -112,17 +111,15 @@ public class QueryFactoryTest { { // the is a stopword, so it should generate an ngram search term var specs = parseAndGetSpecs("\"the shining\""); - assertEquals(List.of("the_shining"), specs.subqueries.iterator().next().searchTermsInclude); - assertEquals(List.of(), specs.subqueries.iterator().next().searchTermsAdvice); - assertEquals(List.of(), specs.subqueries.iterator().next().searchTermCoherences); + assertEquals("the_shining", specs.query.compiledQuery); } { // tde isn't a stopword, so we should get the normal behavior var specs = parseAndGetSpecs("\"tde shining\""); - assertEquals(List.of("tde", "shining"), specs.subqueries.iterator().next().searchTermsInclude); - assertEquals(List.of("tde_shining"), specs.subqueries.iterator().next().searchTermsAdvice); - assertEquals(List.of(List.of("tde", "shining")), specs.subqueries.iterator().next().searchTermCoherences); + assertEquals("tde shining", specs.query.compiledQuery); + assertEquals(List.of("tde_shining"), specs.query.searchTermsAdvice); + assertEquals(List.of(List.of("tde", "shining")), specs.query.searchTermCoherences); } } @@ -150,8 +147,18 @@ public class QueryFactoryTest { @Test public void testPriorityTerm() { - var subquery = parseAndGetSpecs("physics ?tld:edu").subqueries.iterator().next(); + var subquery = parseAndGetSpecs("physics ?tld:edu").query; assertEquals(List.of("tld:edu"), subquery.searchTermsPriority); - assertEquals(List.of("physics"), subquery.searchTermsInclude); + assertEquals("physics", subquery.compiledQuery); + } + + @Test + public void testExpansion() { + + long start = System.currentTimeMillis(); + var subquery = parseAndGetSpecs("elden ring mechanical keyboard slackware linux duke nukem 3d").query; + System.out.println("Time: " + (System.currentTimeMillis() - start)); + System.out.println(subquery.compiledQuery); + } } \ No newline at end of file diff --git a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexEntrySource.java b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexEntrySource.java index 37c79941..7c12563b 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexEntrySource.java +++ b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexEntrySource.java @@ -46,7 +46,7 @@ public class ReverseIndexEntrySource implements EntrySource { return; for (int ri = entrySize, wi=1; ri < buffer.end ; ri+=entrySize, wi++) { - buffer.data[wi] = buffer.data[ri]; + buffer.data.set(wi, buffer.data.get(ri)); } buffer.end /= entrySize; diff --git a/code/index/java/nu/marginalia/index/IndexGrpcService.java b/code/index/java/nu/marginalia/index/IndexGrpcService.java index a47c4684..b675f749 100644 --- a/code/index/java/nu/marginalia/index/IndexGrpcService.java +++ b/code/index/java/nu/marginalia/index/IndexGrpcService.java @@ -9,14 +9,14 @@ import io.prometheus.client.Histogram; import it.unimi.dsi.fastutil.longs.LongArrayList; import lombok.SneakyThrows; import nu.marginalia.api.searchquery.*; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.query.SearchSpecification; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; import nu.marginalia.api.searchquery.model.results.*; import nu.marginalia.array.buffer.LongQueryBuffer; import 
nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.model.SearchParameters; import nu.marginalia.index.model.SearchTerms; -import nu.marginalia.index.model.SearchTermsUtil; import nu.marginalia.index.query.IndexQuery; import nu.marginalia.index.query.IndexSearchBudget; import nu.marginalia.index.results.IndexResultValuatorService; @@ -143,7 +143,6 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { .setEncodedWordMetadata(score.encodedWordMetadata()) .setKeyword(score.keyword) .setHtmlFeatures(score.htmlFeatures()) - .setSubquery(score.subquery) ); } @@ -203,7 +202,9 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { return new SearchResultSet(List.of()); } - ResultRankingContext rankingContext = createRankingContext(params.rankingParams, params.subqueries); + ResultRankingContext rankingContext = createRankingContext(params.rankingParams, + params.compiledQuery, + params.compiledQueryIds); var queryExecution = new QueryExecution(rankingContext, params.fetchSize); @@ -255,14 +256,10 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { /** Execute a search query */ public SearchResultSet run(SearchParameters parameters) throws SQLException, InterruptedException { - for (var subquery : parameters.subqueries) { - var terms = new SearchTerms(subquery); - if (terms.isEmpty()) - continue; + var terms = new SearchTerms(parameters.query, parameters.compiledQueryIds); - for (var indexQuery : index.createQueries(terms, parameters.queryParams)) { - workerPool.execute(new IndexLookup(indexQuery, parameters.budget)); - } + for (var indexQuery : index.createQueries(terms, parameters.queryParams)) { + workerPool.execute(new IndexLookup(indexQuery, parameters.budget)); } for (int i = 0; i < indexValuationThreads; i++) { @@ -327,7 +324,9 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { buffer.reset(); query.getMoreResults(buffer); - results.addElements(0, buffer.data, 0, buffer.end); + for (int i = 0; i < buffer.end; i++) { + results.add(buffer.data.get(i)); + } if (results.size() < 512) { enqueueResults(new CombinedDocIdList(results)); @@ -413,8 +412,13 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { } - private ResultRankingContext createRankingContext(ResultRankingParameters rankingParams, List subqueries) { - final var termToId = SearchTermsUtil.getAllIncludeTerms(subqueries); + private ResultRankingContext createRankingContext(ResultRankingParameters rankingParams, + CompiledQuery query, + CompiledQueryLong compiledQueryIds) + { + Map termToId = new HashMap<>(query.size()); + query.indices().forEach(id -> termToId.put(query.at(id), compiledQueryIds.at(id))); + final Map termFrequencies = new HashMap<>(termToId.size()); final Map prioFrequencies = new HashMap<>(termToId.size()); diff --git a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java index ea78739c..3846bad8 100644 --- a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java +++ b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java @@ -38,6 +38,13 @@ public class CombinedIndexReader { return new IndexQueryBuilderImpl(reverseIndexFullReader, reverseIndexPriorityReader, query); } + public QueryFilterStepIf hasWordFull(long termId) { + return reverseIndexFullReader.also(termId); + } + public QueryFilterStepIf hasWordPrio(long termId) { + return reverseIndexPriorityReader.also(termId); + } + /** Creates a query 
builder for terms in the priority index */ public IndexQueryBuilder findPriorityWord(long wordId) { diff --git a/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java b/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java index 825728ae..33ca033e 100644 --- a/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java +++ b/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java @@ -1,9 +1,11 @@ package nu.marginalia.index.index; +import java.util.List; import gnu.trove.set.hash.TLongHashSet; import nu.marginalia.index.ReverseIndexReader; import nu.marginalia.index.query.IndexQuery; import nu.marginalia.index.query.IndexQueryBuilder; +import nu.marginalia.index.query.filter.QueryFilterAnyOf; import nu.marginalia.index.query.filter.QueryFilterStepIf; public class IndexQueryBuilderImpl implements IndexQueryBuilder { @@ -66,6 +68,20 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder { return this; } + public IndexQueryBuilder addInclusionFilterAny(List filterSteps) { + if (filterSteps.isEmpty()) + return this; + + if (filterSteps.size() == 1) { + query.addInclusionFilter(filterSteps.getFirst()); + } + else { + query.addInclusionFilter(new QueryFilterAnyOf(filterSteps)); + } + + return this; + } + public IndexQuery build() { return query; } diff --git a/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java b/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java new file mode 100644 index 00000000..a465bd86 --- /dev/null +++ b/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java @@ -0,0 +1,78 @@ +package nu.marginalia.index.index; + +import it.unimi.dsi.fastutil.longs.LongArrayList; +import it.unimi.dsi.fastutil.longs.LongSet; + +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; + +class QueryBranchWalker { + public final long[] priorityOrder; + public final List paths; + public final long termId; + + private QueryBranchWalker(long[] priorityOrder, List paths, long termId) { + this.priorityOrder = priorityOrder; + this.paths = paths; + this.termId = termId; + } + + public boolean atEnd() { + return priorityOrder.length == 0; + } + + public static List create(long[] priorityOrder, List paths) { + + List ret = new ArrayList<>(); + List remainingPaths = new LinkedList<>(paths); + + remainingPaths.removeIf(LongSet::isEmpty); + + for (int i = 0; i < priorityOrder.length; i++) { + long prio = priorityOrder[i]; + + var it = remainingPaths.iterator(); + List pathsForPrio = new ArrayList<>(); + + while (it.hasNext()) { + var path = it.next(); + + if (path.contains(prio)) { + path.remove(prio); + pathsForPrio.add(path); + it.remove(); + } + } + + if (!pathsForPrio.isEmpty()) { + LongArrayList remainingPrios = new LongArrayList(pathsForPrio.size()); + + for (var p : priorityOrder) { + for (var path : pathsForPrio) { + if (path.contains(p)) { + remainingPrios.add(p); + break; + } + } + } + + ret.add(new QueryBranchWalker(remainingPrios.elements(), pathsForPrio, prio)); + } + } + + if (!remainingPaths.isEmpty()) { + System.out.println("Dropping: " + remainingPaths); + } + + return ret; + } + + public List next() { + if (atEnd()) + return List.of(); + + return create(priorityOrder, paths); + } + +} diff --git a/code/index/java/nu/marginalia/index/index/StatefulIndex.java b/code/index/java/nu/marginalia/index/index/StatefulIndex.java index a49e740e..0f55c0c8 100644 --- a/code/index/java/nu/marginalia/index/index/StatefulIndex.java +++ 
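To make the grouping behaviour concrete, a small sketch (not part of the patch, with invented term ids) of QueryBranchWalker.create as defined above; since the class is package-private, the sketch assumes it lives in the same package.

package nu.marginalia.index.index;

import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongSet;

import java.util.List;

class QueryBranchWalkerSketch {
    public static void main(String[] args) {
        // Paths for a query like "a ( b | c )", with stand-in term ids a=1, b=2, c=3.
        // Note that create() consumes prioritized terms by mutating these sets.
        List<LongSet> paths = List.of(
                new LongOpenHashSet(new long[] {1, 2}),
                new LongOpenHashSet(new long[] {1, 3}));

        // Term ids in evaluation priority order.
        var walkers = QueryBranchWalker.create(new long[] {1, 2, 3}, paths);

        for (var walker : walkers) {
            // Expected: a single walker with termId = 1 and remaining paths {2} and {3};
            // walker.next() then yields one walker for term 2 and one for term 3, both at their end.
            System.out.println(walker.termId + " -> " + walker.paths);
        }
    }
}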
b/code/index/java/nu/marginalia/index/index/StatefulIndex.java @@ -2,6 +2,13 @@ package nu.marginalia.index.index; import com.google.inject.Inject; import com.google.inject.Singleton; +import it.unimi.dsi.fastutil.longs.LongOpenHashSet; +import it.unimi.dsi.fastutil.longs.LongSet; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; +import nu.marginalia.index.query.filter.QueryFilterAllOf; +import nu.marginalia.index.query.filter.QueryFilterAnyOf; +import nu.marginalia.index.query.filter.QueryFilterStepIf; import nu.marginalia.index.results.model.ids.CombinedDocIdList; import nu.marginalia.index.results.model.ids.DocMetadataList; import nu.marginalia.index.model.QueryParams; @@ -14,12 +21,13 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; +import java.util.*; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; +import java.util.function.LongFunction; +import java.util.function.Predicate; +import java.util.stream.Collectors; /** This class delegates SearchIndexReader and deals with the stateful nature of the index, * i.e. it may be possible to reconstruct the index and load a new set of data. @@ -105,6 +113,61 @@ public class StatefulIndex { return combinedIndexReader != null && combinedIndexReader.isLoaded(); } + private Predicate containsOnly(long[] permitted) { + LongSet permittedTerms = new LongOpenHashSet(permitted); + return permittedTerms::containsAll; + } + + private List createBuilders(CompiledQueryLong query, + LongFunction builderFactory, + long[] termPriority) { + List paths = CompiledQueryAggregates.queriesAggregate(query); + + // Remove any paths that do not contain all prioritized terms, as this means + // the term is missing from the index and can never be found + paths.removeIf(containsOnly(termPriority).negate()); + + List helpers = QueryBranchWalker.create(termPriority, paths); + List builders = new ArrayList<>(); + + for (var helper : helpers) { + var builder = builderFactory.apply(helper.termId); + + builders.add(builder); + + if (helper.atEnd()) + continue; + + var filters = helper.next().stream() + .map(this::createFilter) + .toList(); + + builder.addInclusionFilterAny(filters); + } + + return builders; + } + + private QueryFilterStepIf createFilter(QueryBranchWalker helper) { + var selfCondition = combinedIndexReader.hasWordFull(helper.termId); + if (helper.atEnd()) + return selfCondition; + + var nextSteps = helper.next(); + var nextFilters = nextSteps.stream() + .map(this::createFilter) + .map(filter -> new QueryFilterAllOf(List.of(selfCondition, filter))) + .collect(Collectors.toList()); + + if (nextFilters.isEmpty()) + return selfCondition; + + if (nextFilters.size() == 1) + return nextFilters.getFirst(); + + + return new QueryFilterAnyOf(nextFilters); + } public List createQueries(SearchTerms terms, QueryParams params) { @@ -117,40 +180,13 @@ public class StatefulIndex { final long[] orderedIncludesPrio = terms.sortedDistinctIncludes(this::compareKeywordsPrio); List queryHeads = new ArrayList<>(10); + + queryHeads.addAll(createBuilders(terms.compiledQuery(), combinedIndexReader::findFullWord, orderedIncludes)); + queryHeads.addAll(createBuilders(terms.compiledQuery(), combinedIndexReader::findPriorityWord, orderedIncludesPrio)); + List 
queries = new ArrayList<>(10); - // To ensure that good results are discovered, create separate query heads for the priority index that - // filter for terms that contain pairs of two search terms - if (orderedIncludesPrio.length > 1) { - for (int i = 0; i + 1 < orderedIncludesPrio.length; i++) { - for (int j = i + 1; j < orderedIncludesPrio.length; j++) { - var entrySource = combinedIndexReader - .findPriorityWord(orderedIncludesPrio[i]) - .alsoPrio(orderedIncludesPrio[j]); - queryHeads.add(entrySource); - } - } - } - - // Next consider entries that appear only once in the priority index - for (var wordId : orderedIncludesPrio) { - queryHeads.add(combinedIndexReader.findPriorityWord(wordId)); - } - - // Finally consider terms in the full index - queryHeads.add(combinedIndexReader.findFullWord(orderedIncludes[0])); - for (var query : queryHeads) { - if (query == null) { - return Collections.emptyList(); - } - - // Note that we can add all includes as filters, even though - // they may not be present in the query head, as the query builder - // will ignore redundant include filters: - for (long orderedInclude : orderedIncludes) { - query = query.alsoFull(orderedInclude); - } for (long term : terms.excludes()) { query = query.notFull(term); @@ -161,6 +197,7 @@ public class StatefulIndex { queries.add(query.addInclusionFilter(combinedIndexReader.filterForParams(params)).build()); } + return queries; } diff --git a/code/index/java/nu/marginalia/index/model/SearchParameters.java b/code/index/java/nu/marginalia/index/model/SearchParameters.java index 7db25341..f0e851e5 100644 --- a/code/index/java/nu/marginalia/index/model/SearchParameters.java +++ b/code/index/java/nu/marginalia/index/model/SearchParameters.java @@ -2,16 +2,16 @@ package nu.marginalia.index.model; import nu.marginalia.api.searchquery.IndexProtobufCodec; import nu.marginalia.api.searchquery.RpcIndexQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryParser; import nu.marginalia.api.searchquery.model.query.SearchSpecification; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.index.query.IndexSearchBudget; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.searchset.SearchSet; -import java.util.ArrayList; -import java.util.List; - import static nu.marginalia.api.searchquery.IndexProtobufCodec.convertSpecLimit; public class SearchParameters { @@ -21,13 +21,16 @@ public class SearchParameters { */ public final int fetchSize; public final IndexSearchBudget budget; - public final List subqueries; + public final SearchQuery query; public final QueryParams queryParams; public final ResultRankingParameters rankingParams; public final int limitByDomain; public final int limitTotal; + public final CompiledQuery compiledQuery; + public final CompiledQueryLong compiledQueryIds; + // mutable: /** @@ -40,7 +43,7 @@ public class SearchParameters { this.fetchSize = limits.fetchSize(); this.budget = new IndexSearchBudget(limits.timeoutMs()); - this.subqueries = specsSet.subqueries; + this.query = specsSet.query; this.limitByDomain = limits.resultsByDomain(); this.limitTotal = limits.resultsTotal(); @@ -52,6 +55,9 @@ public class SearchParameters { searchSet, specsSet.queryStrategy); + 
compiledQuery = CompiledQueryParser.parse(this.query.compiledQuery); + compiledQueryIds = compiledQuery.mapToLong(SearchTermsUtil::getWordId); + rankingParams = specsSet.rankingParams; } @@ -63,11 +69,8 @@ public class SearchParameters { // The time budget is halved because this is the point when we start to // wrap up the search and return the results. this.budget = new IndexSearchBudget(limits.timeoutMs() / 2); + this.query = IndexProtobufCodec.convertRpcQuery(request.getQuery()); - this.subqueries = new ArrayList<>(request.getSubqueriesCount()); - for (int i = 0; i < request.getSubqueriesCount(); i++) { - this.subqueries.add(IndexProtobufCodec.convertSearchSubquery(request.getSubqueries(i))); - } this.limitByDomain = limits.resultsByDomain(); this.limitTotal = limits.resultsTotal(); @@ -79,9 +82,13 @@ public class SearchParameters { searchSet, QueryStrategy.valueOf(request.getQueryStrategy())); + compiledQuery = CompiledQueryParser.parse(this.query.compiledQuery); + compiledQueryIds = compiledQuery.mapToLong(SearchTermsUtil::getWordId); + rankingParams = IndexProtobufCodec.convertRankingParameterss(request.getParameters()); } + public long getDataCost() { return dataCost; } diff --git a/code/index/java/nu/marginalia/index/model/SearchTerms.java b/code/index/java/nu/marginalia/index/model/SearchTerms.java index c32b1aa3..307e4179 100644 --- a/code/index/java/nu/marginalia/index/model/SearchTerms.java +++ b/code/index/java/nu/marginalia/index/model/SearchTerms.java @@ -4,7 +4,8 @@ import it.unimi.dsi.fastutil.longs.LongArrayList; import it.unimi.dsi.fastutil.longs.LongComparator; import it.unimi.dsi.fastutil.longs.LongList; import it.unimi.dsi.fastutil.longs.LongOpenHashSet; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import java.util.ArrayList; import java.util.List; @@ -18,34 +19,39 @@ public final class SearchTerms { private final LongList priority; private final List coherences; + private final CompiledQueryLong compiledQueryIds; + public SearchTerms( LongList includes, LongList excludes, LongList priority, - List coherences + List coherences, + CompiledQueryLong compiledQueryIds ) { this.includes = includes; this.excludes = excludes; this.priority = priority; this.coherences = coherences; + this.compiledQueryIds = compiledQueryIds; } - public SearchTerms(SearchSubquery subquery) { + public SearchTerms(SearchQuery query, CompiledQueryLong compiledQueryIds) { this(new LongArrayList(), new LongArrayList(), new LongArrayList(), - new ArrayList<>()); + new ArrayList<>(), + compiledQueryIds); - for (var word : subquery.searchTermsInclude) { + for (var word : query.searchTermsInclude) { includes.add(getWordId(word)); } - for (var word : subquery.searchTermsAdvice) { + for (var word : query.searchTermsAdvice) { // This looks like a bug, but it's not includes.add(getWordId(word)); } - for (var coherence : subquery.searchTermCoherences) { + for (var coherence : query.searchTermCoherences) { LongList parts = new LongArrayList(coherence.size()); for (var word : coherence) { @@ -55,10 +61,10 @@ public final class SearchTerms { coherences.add(parts); } - for (var word : subquery.searchTermsExclude) { + for (var word : query.searchTermsExclude) { excludes.add(getWordId(word)); } - for (var word : subquery.searchTermsPriority) { + for (var word : query.searchTermsPriority) { priority.add(getWordId(word)); } } @@ -96,6 +102,8 @@ public final 
class SearchTerms { return coherences; } + public CompiledQueryLong compiledQuery() { return compiledQueryIds; } + @Override public boolean equals(Object obj) { if (obj == this) return true; diff --git a/code/index/java/nu/marginalia/index/model/SearchTermsUtil.java b/code/index/java/nu/marginalia/index/model/SearchTermsUtil.java index 9797ca95..fa516565 100644 --- a/code/index/java/nu/marginalia/index/model/SearchTermsUtil.java +++ b/code/index/java/nu/marginalia/index/model/SearchTermsUtil.java @@ -1,29 +1,9 @@ package nu.marginalia.index.model; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; import nu.marginalia.hash.MurmurHash3_128; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - public class SearchTermsUtil { - /** Extract all include-terms from the specified subqueries, - * and a return a map of the terms and their termIds. - */ - public static Map getAllIncludeTerms(List subqueries) { - Map ret = new HashMap<>(); - - for (var subquery : subqueries) { - for (var include : subquery.searchTermsInclude) { - ret.computeIfAbsent(include, i -> getWordId(include)); - } - } - - return ret; - } - private static final MurmurHash3_128 hasher = new MurmurHash3_128(); /** Translate the word to a unique id. */ diff --git a/code/index/java/nu/marginalia/index/results/IndexMetadataService.java b/code/index/java/nu/marginalia/index/results/IndexMetadataService.java index 1932a5a4..977a87e7 100644 --- a/code/index/java/nu/marginalia/index/results/IndexMetadataService.java +++ b/code/index/java/nu/marginalia/index/results/IndexMetadataService.java @@ -4,7 +4,8 @@ import com.google.inject.Inject; import gnu.trove.map.hash.TObjectLongHashMap; import it.unimi.dsi.fastutil.longs.Long2ObjectArrayMap; import it.unimi.dsi.fastutil.longs.LongArrayList; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.model.SearchTermsUtil; import nu.marginalia.index.results.model.QuerySearchTerms; @@ -13,9 +14,6 @@ import nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds; import nu.marginalia.index.results.model.ids.CombinedDocIdList; import nu.marginalia.index.results.model.ids.TermIdList; -import java.util.ArrayList; -import java.util.List; - import static nu.marginalia.index.results.model.TermCoherenceGroupList.TermCoherenceGroup; import static nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds.DocumentsWithMetadata; @@ -42,43 +40,24 @@ public class IndexMetadataService { return new TermMetadataForCombinedDocumentIds(termdocToMeta); } - public QuerySearchTerms getSearchTerms(List searchTermVariants) { + public QuerySearchTerms getSearchTerms(CompiledQuery compiledQuery, SearchQuery searchQuery) { LongArrayList termIdsList = new LongArrayList(); TObjectLongHashMap termToId = new TObjectLongHashMap<>(10, 0.75f, -1); - for (var subquery : searchTermVariants) { - for (var term : subquery.searchTermsInclude) { - if (termToId.containsKey(term)) { - continue; - } - - long id = SearchTermsUtil.getWordId(term); - termIdsList.add(id); - termToId.put(term, id); - } + for (String word : compiledQuery) { + long id = SearchTermsUtil.getWordId(word); + termIdsList.add(id); + termToId.put(word, id); } return new QuerySearchTerms(termToId, new TermIdList(termIdsList), - getTermCoherences(searchTermVariants)); - } - - - private 
TermCoherenceGroupList getTermCoherences(List searchTermVariants) { - List coherences = new ArrayList<>(); - - for (var subquery : searchTermVariants) { - for (var coh : subquery.searchTermCoherences) { - coherences.add(new TermCoherenceGroup(coh)); - } - - // It's assumed each subquery has identical coherences - break; - } - - return new TermCoherenceGroupList(coherences); + new TermCoherenceGroupList( + searchQuery.searchTermCoherences.stream().map(TermCoherenceGroup::new).toList() + ) + ); } } diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java index 967a600f..3777cf4f 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java @@ -1,10 +1,13 @@ package nu.marginalia.index.results; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.SearchResultItem; import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.index.index.StatefulIndex; +import nu.marginalia.index.model.SearchParameters; import nu.marginalia.index.results.model.ids.CombinedDocIdList; import nu.marginalia.index.model.QueryParams; import nu.marginalia.index.results.model.QuerySearchTerms; @@ -23,7 +26,6 @@ import java.util.List; * reasons to cache this data, and performs the calculations */ public class IndexResultValuationContext { private final StatefulIndex statefulIndex; - private final List> searchTermVariants; private final QueryParams queryParams; private final TermMetadataForCombinedDocumentIds termMetadataForCombinedDocumentIds; @@ -31,23 +33,26 @@ public class IndexResultValuationContext { private final ResultRankingContext rankingContext; private final ResultValuator searchResultValuator; + private final CompiledQuery compiledQuery; + private final CompiledQueryLong compiledQueryIds; public IndexResultValuationContext(IndexMetadataService metadataService, ResultValuator searchResultValuator, CombinedDocIdList ids, StatefulIndex statefulIndex, ResultRankingContext rankingContext, - List subqueries, - QueryParams queryParams + SearchParameters params ) { this.statefulIndex = statefulIndex; this.rankingContext = rankingContext; this.searchResultValuator = searchResultValuator; - this.searchTermVariants = subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList(); - this.queryParams = queryParams; + this.queryParams = params.queryParams; + this.compiledQuery = params.compiledQuery; + this.compiledQueryIds = params.compiledQueryIds; + + this.searchTerms = metadataService.getSearchTerms(params.compiledQuery, params.query); - this.searchTerms = metadataService.getSearchTerms(subqueries); this.termMetadataForCombinedDocumentIds = metadataService.getTermMetadataForDocuments(ids, searchTerms.termIdsAll); } @@ -65,68 +70,39 @@ public class IndexResultValuationContext { long docMetadata = statefulIndex.getDocumentMetadata(docId); int htmlFeatures = statefulIndex.getHtmlFeatures(docId); - int maxFlagsCount = 0; - boolean anyAllSynthetic = false; - int maxPositionsSet = 0; + 
SearchResultItem searchResult = new SearchResultItem(docId); - SearchResultItem searchResult = new SearchResultItem(docId, - searchTermVariants.stream().mapToInt(List::size).sum()); + SearchResultKeywordScore[] scores = compiledQuery.indices().mapToObj(idx -> + new SearchResultKeywordScore( + compiledQuery.at(idx), + compiledQueryIds.at(idx), + termMetadataForCombinedDocumentIds.getTermMetadata( + compiledQueryIds.at(idx), combinedId + ), + docMetadata, + htmlFeatures) + ) + .toArray(SearchResultKeywordScore[]::new); - for (int querySetId = 0; - querySetId < searchTermVariants.size(); - querySetId++) - { - var termList = searchTermVariants.get(querySetId); + // DANGER: IndexResultValuatorService assumes that searchResult.keywordScores has this specific order, as it needs + // to be able to re-construct its own CompiledQuery for re-ranking the results. This is + // a very flimsy assumption. + searchResult.keywordScores.addAll(List.of(scores)); - SearchResultKeywordScore[] termScoresForSet = new SearchResultKeywordScore[termList.size()]; + CompiledQuery queryGraphScores = new CompiledQuery<>(compiledQuery.root, scores); - boolean synthetic = true; + boolean allSynthetic = !CompiledQueryAggregates.booleanAggregate(queryGraphScores, score -> !score.hasTermFlag(WordFlags.Synthetic)); + int flagsCount = CompiledQueryAggregates.intMaxMinAggregate(queryGraphScores, score -> Long.bitCount(score.encodedWordMetadata() & flagsFilterMask)); + int positionsCount = CompiledQueryAggregates.intMaxMinAggregate(queryGraphScores, SearchResultKeywordScore::positionCount); - for (int termIdx = 0; termIdx < termList.size(); termIdx++) { - String searchTerm = termList.get(termIdx); - - long termMetadata = termMetadataForCombinedDocumentIds.getTermMetadata( - searchTerms.getIdForTerm(searchTerm), - combinedId - ); - - var score = new SearchResultKeywordScore( - querySetId, - searchTerm, - termMetadata, - docMetadata, - htmlFeatures - ); - - synthetic &= WordFlags.Synthetic.isPresent(termMetadata); - - searchResult.keywordScores.add(score); - - termScoresForSet[termIdx] = score; - } - - if (!meetsQueryStrategyRequirements(termScoresForSet, queryParams.queryStrategy())) { - continue; - } - - int minFlagsCount = 8; - int minPositionsSet = 4; - - for (var termScore : termScoresForSet) { - final int flagCount = Long.bitCount(termScore.encodedWordMetadata() & flagsFilterMask); - minFlagsCount = Math.min(minFlagsCount, flagCount); - minPositionsSet = Math.min(minPositionsSet, termScore.positionCount()); - } - - maxFlagsCount = Math.max(maxFlagsCount, minFlagsCount); - maxPositionsSet = Math.max(maxPositionsSet, minPositionsSet); - anyAllSynthetic |= synthetic; + if (!meetsQueryStrategyRequirements(queryGraphScores, queryParams.queryStrategy())) { + return null; } - if (maxFlagsCount == 0 && !anyAllSynthetic && maxPositionsSet == 0) + if (flagsCount == 0 && !allSynthetic && positionsCount == 0) return null; - double score = searchResultValuator.calculateSearchResultValue(searchResult.keywordScores, + double score = searchResultValuator.calculateSearchResultValue(queryGraphScores, 5000, // use a dummy value here as it's not present in the index rankingContext); @@ -135,20 +111,17 @@ public class IndexResultValuationContext { return searchResult; } - private boolean meetsQueryStrategyRequirements(SearchResultKeywordScore[] termSet, QueryStrategy queryStrategy) { + private boolean meetsQueryStrategyRequirements(CompiledQuery queryGraphScores, + QueryStrategy queryStrategy) + { if (queryStrategy == QueryStrategy.AUTO || 
queryStrategy == QueryStrategy.SENTENCE || queryStrategy == QueryStrategy.TOPIC) { return true; } - for (var keyword : termSet) { - if (!meetsQueryStrategyRequirements(keyword, queryParams.queryStrategy())) { - return false; - } - } - - return true; + return CompiledQueryAggregates.booleanAggregate(queryGraphScores, + docs -> meetsQueryStrategyRequirements(docs, queryParams.queryStrategy())); } private boolean meetsQueryStrategyRequirements(SearchResultKeywordScore termScore, QueryStrategy queryStrategy) { diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java b/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java index 51e59c63..f1dabea4 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java @@ -4,10 +4,11 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import gnu.trove.list.TLongList; import gnu.trove.list.array.TLongArrayList; -import it.unimi.dsi.fastutil.longs.LongArrayList; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.SearchResultItem; +import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.model.SearchParameters; import nu.marginalia.index.results.model.ids.CombinedDocIdList; @@ -19,8 +20,6 @@ import org.slf4j.LoggerFactory; import java.sql.SQLException; import java.util.*; -import java.util.function.Consumer; -import java.util.stream.Collectors; @Singleton public class IndexResultValuatorService { @@ -44,8 +43,8 @@ public class IndexResultValuatorService { } public List rankResults(SearchParameters params, - ResultRankingContext rankingContext, - CombinedDocIdList resultIds) + ResultRankingContext rankingContext, + CombinedDocIdList resultIds) { final var evaluator = createValuationContext(params, rankingContext, resultIds); @@ -70,8 +69,7 @@ public class IndexResultValuatorService { resultIds, statefulIndex, rankingContext, - params.subqueries, - params.queryParams); + params); } @@ -96,12 +94,13 @@ public class IndexResultValuatorService { item.resultsFromDomain = domainCountFilter.getCount(item); } - return decorateAndRerank(resultsList, rankingContext); + return decorateAndRerank(resultsList, params.compiledQuery, rankingContext); } /** Decorate the result items with additional information from the link database * and calculate an updated ranking with the additional information */ public List decorateAndRerank(List rawResults, + CompiledQuery compiledQuery, ResultRankingContext rankingContext) throws SQLException { @@ -125,13 +124,22 @@ public class IndexResultValuatorService { continue; } - resultItems.add(createCombinedItem(result, docData, rankingContext)); + // Reconstruct the SearchResultKeywordScore-compiledquery for re-valuation + // + // CAVEAT: This hinges on a very fragile that IndexResultValuationContext puts them in the same + // order as the data for the CompiledQuery. 
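The CAVEAT above is worth making explicit: the rebuilt query is only valid if result.keywordScores is index-aligned with the original CompiledQuery. A defensive check along the following lines could enforce that; this is a sketch, not part of this change, and it only relies on indices(), at(), size() and the keyword field, all of which appear elsewhere in this series:

    // Sketch: verify that the scores line up index-for-index with the String
    // terms of the original compiled query before re-using its expression tree.
    SearchResultKeywordScore[] scores =
            result.keywordScores.toArray(SearchResultKeywordScore[]::new);

    assert scores.length == compiledQuery.size()
            : "keyword scores and compiled query data differ in length";

    for (int idx : compiledQuery.indices().toArray()) {
        assert scores[idx].keyword.equals(compiledQuery.at(idx))
                : "keyword scores are not in compiled-query order";
    }

    CompiledQuery<SearchResultKeywordScore> resultQuery =
            new CompiledQuery<>(compiledQuery.root, scores);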
+ CompiledQuery resultQuery = + new CompiledQuery<>(compiledQuery.root, result.keywordScores.toArray(SearchResultKeywordScore[]::new)); + + + resultItems.add(createCombinedItem(result, docData, resultQuery, rankingContext)); } return resultItems; } private DecoratedSearchResultItem createCombinedItem(SearchResultItem result, DocdbUrlDetail docData, + CompiledQuery resultQuery, ResultRankingContext rankingContext) { return new DecoratedSearchResultItem( result, @@ -144,7 +152,7 @@ public class IndexResultValuatorService { docData.pubYear(), docData.dataHash(), docData.wordsTotal(), - resultValuator.calculateSearchResultValue(result.keywordScores, docData.wordsTotal(), rankingContext) + resultValuator.calculateSearchResultValue(resultQuery, docData.wordsTotal(), rankingContext) ); } diff --git a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java index 6c67559d..05ff83d2 100644 --- a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java +++ b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java @@ -1,5 +1,6 @@ package nu.marginalia.ranking.results; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; @@ -33,14 +34,17 @@ public class ResultValuator { this.termCoherenceFactor = termCoherenceFactor; } - public double calculateSearchResultValue(List scores, + public double calculateSearchResultValue(CompiledQuery scores, int length, ResultRankingContext ctx) { - int sets = numberOfSets(scores); + if (scores.size() == 0) + return Double.MAX_VALUE; + if (length < 0) + length = 5000; - long documentMetadata = documentMetadata(scores); - int features = htmlFeatures(scores); + long documentMetadata = scores.at(0).encodedDocMetadata(); + int features = scores.at(0).htmlFeatures(); var rankingParams = ctx.params; int rank = DocumentMetadata.decodeRank(documentMetadata); @@ -75,32 +79,16 @@ public class ResultValuator { + temporalBias + flagsPenalty; - double bestTcf = 0; - double bestBM25F = 0; - double bestBM25P = 0; - double bestBM25PN = 0; - - for (int set = 0; set < sets; set++) { - ResultKeywordSet keywordSet = createKeywordSet(scores, set); - - if (keywordSet.isEmpty()) - continue; - - bestTcf = Math.max(bestTcf, rankingParams.tcfWeight * termCoherenceFactor.calculate(keywordSet)); - bestBM25P = Math.max(bestBM25P, rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, keywordSet, ctx)); - bestBM25F = Math.max(bestBM25F, rankingParams.bm25FullWeight * bm25Factor.calculateBm25(rankingParams.fullParams, keywordSet, length, ctx)); - if (keywordSet.hasNgram()) { - bestBM25PN = Math.max(bestBM25PN, rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, keywordSet, ctx)); - } - } - + double bestTcf = rankingParams.tcfWeight * termCoherenceFactor.calculate(scores); + double bestBM25F = rankingParams.bm25FullWeight * bm25Factor.calculateBm25(rankingParams.prioParams, scores, length, ctx); + double bestBM25P = rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, scores, ctx); double overallPartPositive = Math.max(0, overallPart); double overallPartNegative = -Math.min(0, overallPart); // Renormalize to 0...15, where 0 is the best possible score; // this is a historical 
artifact of the original ranking function - return normalize(1.5 * bestTcf + bestBM25F + bestBM25P + 0.25 * bestBM25PN + overallPartPositive, overallPartNegative); + return normalize(1.5 * bestTcf + bestBM25F + bestBM25P + overallPartPositive, overallPartNegative); } private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) { diff --git a/code/index/java/nu/marginalia/ranking/results/factors/Bm25Factor.java b/code/index/java/nu/marginalia/ranking/results/factors/Bm25Factor.java index 335b5fa8..bc13671e 100644 --- a/code/index/java/nu/marginalia/ranking/results/factors/Bm25Factor.java +++ b/code/index/java/nu/marginalia/ranking/results/factors/Bm25Factor.java @@ -1,10 +1,11 @@ package nu.marginalia.ranking.results.factors; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; import nu.marginalia.api.searchquery.model.results.Bm25Parameters; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.model.idx.WordFlags; -import nu.marginalia.ranking.results.ResultKeywordSet; public class Bm25Factor { private static final int AVG_LENGTH = 5000; @@ -13,43 +14,33 @@ public class Bm25Factor { * * @see Bm25Parameters */ - public double calculateBm25(Bm25Parameters bm25Parameters, ResultKeywordSet keywordSet, int length, ResultRankingContext ctx) { + public double calculateBm25(Bm25Parameters bm25Parameters, CompiledQuery scores, int length, ResultRankingContext ctx) { final int docCount = ctx.termFreqDocCount(); - if (length <= 0) - length = AVG_LENGTH; - - double sum = 0.; - - for (var keyword : keywordSet.keywords()) { + return CompiledQueryAggregates.doubleSumAggregate(scores, keyword -> { double count = keyword.positionCount(); int freq = ctx.frequency(keyword.keyword); - sum += invFreq(docCount, freq) * f(bm25Parameters.k(), bm25Parameters.b(), count, length); - } - - return sum; + return invFreq(docCount, freq) * f(bm25Parameters.k(), bm25Parameters.b(), count, length); + }); } /** Bm25 calculation, except instead of counting positions in the document, * the number of relevance signals for the term is counted instead. 
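The helpers invFreq() and f() used in calculateBm25() above are not shown in this hunk. For orientation, the textbook BM25 building blocks they presumably correspond to look roughly as follows; this is a sketch under that assumption, not the project's actual implementation, and only the AVG_LENGTH = 5000 constant is taken from the file itself:

    // Standard BM25 components (sketch); the real Bm25Factor helpers may differ
    // in smoothing details.
    static double invFreq(int docCount, int freq) {
        // inverse document frequency with +0.5 smoothing
        return Math.log(1.0 + (docCount - freq + 0.5) / (freq + 0.5));
    }

    static double f(double k, double b, double count, int length) {
        // term-frequency saturation with document-length normalization; with
        // b = 0, as used for priority terms, the length dependence drops out
        final int AVG_LENGTH = 5000;
        double lengthNormalization = 1.0 - b + b * ((double) length / AVG_LENGTH);
        return (count * (k + 1)) / (count + k * lengthNormalization);
    }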
*/ - public double calculateBm25Prio(Bm25Parameters bm25Parameters, ResultKeywordSet keywordSet, ResultRankingContext ctx) { + public double calculateBm25Prio(Bm25Parameters bm25Parameters, CompiledQuery scores, ResultRankingContext ctx) { final int docCount = ctx.termFreqDocCount(); - double sum = 0.; - - for (var keyword : keywordSet.keywords()) { + return CompiledQueryAggregates.doubleSumAggregate(scores, keyword -> { double count = evaluatePriorityScore(keyword); int freq = ctx.priorityFrequency(keyword.keyword); // note we override b to zero for priority terms as they are independent of document length - sum += invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0); - } + return invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0); + }); - return sum; } private static double evaluatePriorityScore(SearchResultKeywordScore keyword) { diff --git a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java b/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java index f956ce88..71159c58 100644 --- a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java +++ b/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java @@ -1,14 +1,16 @@ package nu.marginalia.ranking.results.factors; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; +import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.model.idx.WordMetadata; -import nu.marginalia.ranking.results.ResultKeywordSet; /** Rewards documents where terms appear frequently within the same sentences */ public class TermCoherenceFactor { - public double calculate(ResultKeywordSet keywordSet) { - long mask = combinedMask(keywordSet); + public double calculate(CompiledQuery scores) { + long mask = CompiledQueryAggregates.longBitmaskAggregate(scores, score -> score.positions() & WordMetadata.POSITIONS_MASK); return bitsSetFactor(mask); } @@ -19,14 +21,5 @@ public class TermCoherenceFactor { return Math.pow(bitsSetInMask/(float) WordMetadata.POSITIONS_COUNT, 0.25); } - long combinedMask(ResultKeywordSet keywordSet) { - long mask = WordMetadata.POSITIONS_MASK; - - for (var keyword : keywordSet.keywords()) { - mask &= keyword.positions(); - } - - return mask; - } } \ No newline at end of file diff --git a/code/index/query/java/nu/marginalia/index/query/IndexQueryBuilder.java b/code/index/query/java/nu/marginalia/index/query/IndexQueryBuilder.java index 68a88625..74ebdea1 100644 --- a/code/index/query/java/nu/marginalia/index/query/IndexQueryBuilder.java +++ b/code/index/query/java/nu/marginalia/index/query/IndexQueryBuilder.java @@ -2,6 +2,8 @@ package nu.marginalia.index.query; import nu.marginalia.index.query.filter.QueryFilterStepIf; +import java.util.List; + /** Builds a query. *

    * Note: The query builder may omit predicates that are deemed redundant. @@ -21,6 +23,7 @@ public interface IndexQueryBuilder { IndexQueryBuilder notFull(long termId); IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep); + IndexQueryBuilder addInclusionFilterAny(List filterStep); IndexQuery build(); } diff --git a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAllOf.java b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAllOf.java new file mode 100644 index 00000000..8c20fe98 --- /dev/null +++ b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAllOf.java @@ -0,0 +1,57 @@ +package nu.marginalia.index.query.filter; + +import nu.marginalia.array.buffer.LongQueryBuffer; + +import java.util.List; +import java.util.StringJoiner; + +public class QueryFilterAllOf implements QueryFilterStepIf { + private final List steps; + + public QueryFilterAllOf(List steps) { + this.steps = steps; + } + + public double cost() { + double prod = 1.; + + for (var step : steps) { + double cost = step.cost(); + if (cost > 1.0) { + prod *= Math.log(cost); + } + else { + prod += cost; + } + } + + return prod; + } + + @Override + public boolean test(long value) { + for (var step : steps) { + if (!step.test(value)) + return false; + } + return true; + } + + + public void apply(LongQueryBuffer buffer) { + if (steps.isEmpty()) + return; + + for (var step : steps) { + step.apply(buffer); + } + } + + public String describe() { + StringJoiner sj = new StringJoiner(",", "[All Of: ", "]"); + for (var step : steps) { + sj.add(step.describe()); + } + return sj.toString(); + } +} diff --git a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAnyOf.java b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAnyOf.java index c9ee2c6e..2d177645 100644 --- a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAnyOf.java +++ b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAnyOf.java @@ -2,7 +2,6 @@ package nu.marginalia.index.query.filter; import nu.marginalia.array.buffer.LongQueryBuffer; -import java.util.Arrays; import java.util.List; import java.util.StringJoiner; @@ -14,7 +13,7 @@ public class QueryFilterAnyOf implements QueryFilterStepIf { } public double cost() { - return steps.stream().mapToDouble(QueryFilterStepIf::cost).average().orElse(0.); + return steps.stream().mapToDouble(QueryFilterStepIf::cost).sum(); } @Override @@ -31,31 +30,23 @@ public class QueryFilterAnyOf implements QueryFilterStepIf { if (steps.isEmpty()) return; - int start; + int start = 0; int end = buffer.end; - steps.getFirst().apply(buffer); - - // The filter functions will partition the data in the buffer from 0 to END, - // and update END to the length of the retained items, keeping the retained - // items sorted but making no guarantees about the rejected half - // - // Therefore, we need to re-sort the rejected side, and to satisfy the - // constraint that the data is sorted up to END, finally sort it again. - // - // This sorting may seem like it's slower, but filter.apply(...) 
is - // typically much faster than iterating over filter.test(...); so this - // is more than made up for - - for (int fi = 1; fi < steps.size(); fi++) + for (var step : steps) { - start = buffer.end; - Arrays.sort(buffer.data, start, end); - buffer.startFilterForRange(start, end); - steps.get(fi).apply(buffer); + var slice = buffer.slice(start, end); + slice.data.quickSort(0, slice.size()); + + step.apply(slice); + start += slice.end; } - Arrays.sort(buffer.data, 0, buffer.end); + buffer.data.quickSort(0, start); + + // Special finalization + buffer.reset(); + buffer.end = start; } public String describe() { diff --git a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterLetThrough.java b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterLetThrough.java index ed02dd6d..77f503cf 100644 --- a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterLetThrough.java +++ b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterLetThrough.java @@ -16,7 +16,7 @@ public class QueryFilterLetThrough implements QueryFilterStepIf { } public double cost() { - return 0.; + return 1.; } public String describe() { diff --git a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterNoPass.java b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterNoPass.java index 1bcd04ae..502e7c4c 100644 --- a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterNoPass.java +++ b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterNoPass.java @@ -15,7 +15,7 @@ public class QueryFilterNoPass implements QueryFilterStepIf { } public double cost() { - return 0.; + return 1.; } public String describe() { diff --git a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterStepExcludeFromPredicate.java b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterStepExcludeFromPredicate.java index 92c8c972..0d715863 100644 --- a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterStepExcludeFromPredicate.java +++ b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterStepExcludeFromPredicate.java @@ -16,7 +16,7 @@ public class QueryFilterStepExcludeFromPredicate implements QueryFilterStepIf { @Override public double cost() { - return 0; + return 1; } @Override diff --git a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterStepFromPredicate.java b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterStepFromPredicate.java index 56f08b71..9cd51d7a 100644 --- a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterStepFromPredicate.java +++ b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterStepFromPredicate.java @@ -16,7 +16,7 @@ public class QueryFilterStepFromPredicate implements QueryFilterStepIf { @Override public double cost() { - return 0; + return 1; } @Override diff --git a/code/index/query/test/nu/marginalia/index/query/filter/QueryFilterStepIfTest.java b/code/index/query/test/nu/marginalia/index/query/filter/QueryFilterStepIfTest.java index a7450b11..b2ef1bdb 100644 --- a/code/index/query/test/nu/marginalia/index/query/filter/QueryFilterStepIfTest.java +++ b/code/index/query/test/nu/marginalia/index/query/filter/QueryFilterStepIfTest.java @@ -55,6 +55,32 @@ class QueryFilterStepIfTest { assertArrayEquals(new long[]{8, 10}, buffer.copyData()); } + @Test + public void testSuccessiveApplicationWithAllOf() { + var buffer = createBuffer(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); + var filter1 = new QueryFilterStepFromPredicate(value 
-> value % 2 == 0); + var filter2 = new QueryFilterStepExcludeFromPredicate(value -> value <= 6); + new QueryFilterAllOf(List.of(filter1, filter2)).apply(buffer); + assertArrayEquals(new long[]{8, 10}, buffer.copyData()); + } + @Test + public void testCombinedOrAnd() { + var buffer = createBuffer(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); + + var filter1 = new QueryFilterStepFromPredicate(value -> value % 2 == 0); + var filter2 = new QueryFilterStepFromPredicate(value -> value <= 5); + var filter1_2 = new QueryFilterAllOf(List.of(filter1, filter2)); + + var filter3 = new QueryFilterStepFromPredicate(value -> value % 2 == 1); + var filter4 = new QueryFilterStepFromPredicate(value -> value > 5); + var filter3_4 = new QueryFilterAllOf(List.of(filter3, filter4)); + + var filter12_34 = new QueryFilterAnyOf(List.of(filter1_2, filter3_4)); + + filter12_34.apply(buffer); + + assertArrayEquals(new long[]{2, 4, 7, 9}, buffer.copyData()); + } @Test public void testCombinedApplication() { var buffer = createBuffer(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); diff --git a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java index 634481f4..301b5e19 100644 --- a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java +++ b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java @@ -5,7 +5,7 @@ import com.google.inject.Inject; import lombok.SneakyThrows; import nu.marginalia.IndexLocations; import nu.marginalia.api.searchquery.model.query.SearchSpecification; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.process.control.FakeProcessHeartbeat; @@ -123,9 +123,10 @@ public class IndexQueryServiceIntegrationSmokeTest { .rankingParams(ResultRankingParameters.sensibleDefaults()) .domains(new ArrayList<>()) .searchSetIdentifier("NONE") - .subqueries(List.of(new SearchSubquery( + .query(new SearchQuery( + "2 3 5", List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(), - Collections.emptyList()))).build()); + Collections.emptyList())).build()); int[] idxes = new int[] { 30, 510, 90, 150, 210, 270, 330, 390, 450 }; long[] ids = IntStream.of(idxes).mapToLong(this::fullId).toArray(); @@ -166,9 +167,13 @@ public class IndexQueryServiceIntegrationSmokeTest { .rankingParams(ResultRankingParameters.sensibleDefaults()) .queryStrategy(QueryStrategy.SENTENCE) .domains(List.of(2)) - .subqueries(List.of(new SearchSubquery( - List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(), - Collections.emptyList()))).build()); + .query(new SearchQuery( + "2 3 5", + List.of("3", "5", "2"), + List.of("4"), + Collections.emptyList(), + Collections.emptyList(), + Collections.emptyList())).build()); int[] idxes = new int[] { 210, 270 }; long[] ids = IntStream.of(idxes).mapToLong(id -> UrlIdCodec.encodeId(id/100, id)).toArray(); long[] actual = rsp.results.stream().mapToLong(i -> i.rawIndexResult.getDocumentId()).toArray(); @@ -202,9 +207,8 @@ public class IndexQueryServiceIntegrationSmokeTest { .queryStrategy(QueryStrategy.SENTENCE) .searchSetIdentifier("NONE") .rankingParams(ResultRankingParameters.sensibleDefaults()) - .subqueries(List.of(new SearchSubquery( - List.of("4"), Collections.emptyList(), Collections.emptyList(), 
Collections.emptyList(), - Collections.emptyList())) + .query( + new SearchQuery("4", List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), Collections.emptyList()) ).build()); diff --git a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java index 6def5bbc..e29f8751 100644 --- a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java +++ b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java @@ -4,7 +4,7 @@ import com.google.inject.Guice; import com.google.inject.Inject; import nu.marginalia.IndexLocations; import nu.marginalia.api.searchquery.model.query.SearchSpecification; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.storage.FileStorageService; @@ -35,6 +35,7 @@ import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.index.domainrankings.DomainRankings; import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.service.server.Initialization; +import org.apache.logging.log4j.util.Strings; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -108,7 +109,7 @@ public class IndexQueryServiceIntegrationTest { w("world", WordFlags.Title) ).load(); - var query = basicQuery(builder -> builder.subqueries(justInclude("hello", "world"))); + var query = basicQuery(builder -> builder.query(justInclude("hello", "world"))); executeSearch(query) .expectDocumentsInOrder(d(1,1)); @@ -127,57 +128,51 @@ public class IndexQueryServiceIntegrationTest { ).load(); var queryMissingExclude = basicQuery(builder -> - builder.subqueries(includeAndExclude("hello", "missing"))); + builder.query(includeAndExclude("hello", "missing"))); executeSearch(queryMissingExclude) .expectDocumentsInOrder(d(1,1)); var queryMissingInclude = basicQuery(builder -> - builder.subqueries(justInclude("missing"))); + builder.query(justInclude("missing"))); executeSearch(queryMissingInclude) .expectCount(0); var queryMissingPriority = basicQuery(builder -> - builder.subqueries( - List.of( - new SearchSubquery( - List.of("hello"), - List.of(), - List.of(), - List.of("missing"), - List.of() - ) - ))); + builder.query(new SearchQuery( + "hello", + List.of("hello"), + List.of(), + List.of(), + List.of("missing"), + List.of()) + )); executeSearch(queryMissingPriority) .expectCount(1); var queryMissingAdvice = basicQuery(builder -> - builder.subqueries( - List.of( - new SearchSubquery( - List.of("hello"), - List.of(), - List.of("missing"), - List.of(), - List.of() - ) + builder.query( + new SearchQuery("hello", + List.of("hello"), + List.of(), + List.of("missing"), + List.of(), + List.of() ))); executeSearch(queryMissingAdvice) .expectCount(0); var queryMissingCoherence = basicQuery(builder -> - builder.subqueries( - List.of( - new SearchSubquery( - List.of("hello"), - List.of(), - List.of(), - List.of(), - List.of(List.of("missing", "hello")) - ) + builder.query( + new SearchQuery("hello", + List.of("hello"), + List.of(), + List.of(), + List.of(), + List.of(List.of("missing", "hello")) ))); executeSearch(queryMissingCoherence) @@ -202,7 +197,7 @@ public class IndexQueryServiceIntegrationTest { ).load(); - var query = basicQuery(builder -> 
builder.subqueries(justInclude("hello", "world"))); + var query = basicQuery(builder -> builder.query(justInclude("hello", "world"))); executeSearch(query) .expectDocumentsInOrder(d(1,1)); @@ -234,15 +229,15 @@ public class IndexQueryServiceIntegrationTest { var beforeY2K = basicQuery(builder -> - builder.subqueries(justInclude("hello", "world")) + builder.query(justInclude("hello", "world")) .year(SpecificationLimit.lessThan(2000)) ); var atY2K = basicQuery(builder -> - builder.subqueries(justInclude("hello", "world")) + builder.query(justInclude("hello", "world")) .year(SpecificationLimit.equals(2000)) ); var afterY2K = basicQuery(builder -> - builder.subqueries(justInclude("hello", "world")) + builder.query(justInclude("hello", "world")) .year(SpecificationLimit.greaterThan(2000)) ); @@ -296,11 +291,11 @@ public class IndexQueryServiceIntegrationTest { var domain1 = basicQuery(builder -> - builder.subqueries(justInclude("hello", "world")) + builder.query(justInclude("hello", "world")) .domains(List.of(1)) ); var domain2 = basicQuery(builder -> - builder.subqueries(justInclude("hello", "world")) + builder.query(justInclude("hello", "world")) .domains(List.of(2)) ); @@ -334,7 +329,7 @@ public class IndexQueryServiceIntegrationTest { ).load(); var query = basicQuery(builder -> - builder.subqueries(includeAndExclude("hello", "my_darling")) + builder.query(includeAndExclude("hello", "my_darling")) ); executeSearch(query) @@ -403,7 +398,7 @@ public class IndexQueryServiceIntegrationTest { .load(); var rsp = queryService.justQuery( - basicQuery(builder -> builder.subqueries( + basicQuery(builder -> builder.query( // note coherence requriement includeAndCohere("hello", "world") ))); @@ -424,50 +419,53 @@ public class IndexQueryServiceIntegrationTest { .rank(SpecificationLimit.none()) .rankingParams(ResultRankingParameters.sensibleDefaults()) .domains(new ArrayList<>()) - .searchSetIdentifier("NONE") - .subqueries(List.of()); + .searchSetIdentifier("NONE"); return mutator.apply(builder).build(); } - List justInclude(String... includes) { - return List.of(new SearchSubquery( + SearchQuery justInclude(String... includes) { + return new SearchQuery( + Strings.join(List.of(includes), ' '), List.of(includes), List.of(), List.of(), List.of(), List.of() - )); + ); } - List includeAndExclude(List includes, List excludes) { - return List.of(new SearchSubquery( + SearchQuery includeAndExclude(List includes, List excludes) { + return new SearchQuery( + Strings.join(List.of(includes), ' '), includes, excludes, List.of(), List.of(), List.of() - )); + ); } - List includeAndExclude(String include, String exclude) { - return List.of(new SearchSubquery( + SearchQuery includeAndExclude(String include, String exclude) { + return new SearchQuery( + include, List.of(include), List.of(exclude), List.of(), List.of(), List.of() - )); + ); } - List includeAndCohere(String... includes) { - return List.of(new SearchSubquery( + SearchQuery includeAndCohere(String... 
includes) { + return new SearchQuery( + Strings.join(List.of(includes), ' '), List.of(includes), List.of(), List.of(), List.of(), List.of(List.of(includes)) - )); + ); } private MockDataDocument d(int domainId, int ordinal) { return new MockDataDocument(domainId, ordinal); diff --git a/code/index/test/nu/marginalia/index/index/QueryBranchWalkerTest.java b/code/index/test/nu/marginalia/index/index/QueryBranchWalkerTest.java new file mode 100644 index 00000000..8d2f45c8 --- /dev/null +++ b/code/index/test/nu/marginalia/index/index/QueryBranchWalkerTest.java @@ -0,0 +1,59 @@ +package nu.marginalia.index.index; + +import it.unimi.dsi.fastutil.longs.LongArraySet; +import it.unimi.dsi.fastutil.longs.LongSet; +import org.junit.jupiter.api.Test; + +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.*; + +class QueryBranchWalkerTest { + @Test + public void testNoOverlap() { + var paths = QueryBranchWalker.create( + new long[] { 1, 2 }, + List.of(set(1), set(2)) + ); + assertEquals(2, paths.size()); + assertEquals(Set.of(1L, 2L), paths.stream().map(path -> path.termId).collect(Collectors.toSet())); + } + + @Test + public void testCond() { + var paths = QueryBranchWalker.create( + new long[] { 1, 2, 3, 4 }, + List.of(set(1,2,3), set(1,4,3)) + ); + assertEquals(1, paths.size()); + assertEquals(Set.of(1L), paths.stream().map(path -> path.termId).collect(Collectors.toSet())); + System.out.println(Arrays.toString(paths.getFirst().priorityOrder)); + assertArrayEquals(new long[] { 2, 3, 4 }, paths.getFirst().priorityOrder); + + var next = paths.getFirst().next(); + assertEquals(2, next.size()); + assertEquals(Set.of(2L, 3L), next.stream().map(path -> path.termId).collect(Collectors.toSet())); + Map byId = next.stream().collect(Collectors.toMap(w -> w.termId, w->w)); + assertArrayEquals(new long[] { 3L }, byId.get(2L).priorityOrder ); + assertArrayEquals(new long[] { 4L }, byId.get(3L).priorityOrder ); + } + + @Test + public void testNoOverlapFirst() { + var paths = QueryBranchWalker.create( + new long[] { 1, 2, 3 }, + List.of(set(1, 2), set(1, 3)) + ); + assertEquals(1, paths.size()); + assertArrayEquals(new long[] { 2, 3 }, paths.getFirst().priorityOrder); + assertEquals(Set.of(1L), paths.stream().map(path -> path.termId).collect(Collectors.toSet())); + } + + LongSet set(long... 
args) { + return new LongArraySet(args); + } +} \ No newline at end of file diff --git a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java index 4f5a12cd..948c5857 100644 --- a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java +++ b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java @@ -2,9 +2,10 @@ package nu.marginalia.index.results; import nu.marginalia.api.searchquery.model.results.SearchResultItem; import nu.marginalia.model.id.UrlIdCodec; -import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Test; +import java.util.List; + import static org.junit.jupiter.api.Assertions.*; class IndexResultDomainDeduplicatorTest { @@ -24,7 +25,7 @@ class IndexResultDomainDeduplicatorTest { } SearchResultItem forId(int domain, int ordinal) { - return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 4); + return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), List.of(), 4, Double.NaN); } } \ No newline at end of file diff --git a/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java b/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java index 8f8f7eaa..243ae90d 100644 --- a/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java +++ b/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java @@ -1,5 +1,6 @@ package nu.marginalia.ranking.results; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; @@ -35,21 +36,21 @@ class ResultValuatorTest { ); } - List titleOnlyLowCountSet = List.of( - new SearchResultKeywordScore(0, "bob", + CompiledQuery titleOnlyLowCountSet = CompiledQuery.just( + new SearchResultKeywordScore("bob", 1, wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title)), docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)), 0) ); - List highCountNoTitleSet = List.of( - new SearchResultKeywordScore(0, "bob", + CompiledQuery highCountNoTitleSet = CompiledQuery.just( + new SearchResultKeywordScore("bob", 1, wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh)), docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)), 0) ); - List highCountSubjectSet = List.of( - new SearchResultKeywordScore(0, "bob", + CompiledQuery highCountSubjectSet = CompiledQuery.just( + new SearchResultKeywordScore("bob", 1, wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects)), docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)), 0) @@ -75,7 +76,10 @@ class ResultValuatorTest { System.out.println(highCountSubject); } - private long docMetadata(int topology, int year, int quality, EnumSet flags) { + private long docMetadata(int topology, + int year, + int quality, + EnumSet flags) { return new DocumentMetadata(topology, PubDate.toYearByte(year), quality, flags).encode(); } diff --git a/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java b/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java index a5bca54e..028896d9 100644 --- a/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java +++ 
b/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java @@ -1,9 +1,10 @@ package nu.marginalia.ranking.results.factors; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.bbpc.BrailleBlockPunchCards; import nu.marginalia.model.idx.WordMetadata; -import nu.marginalia.ranking.results.ResultKeywordSet; import org.junit.jupiter.api.Test; import java.util.ArrayList; @@ -20,7 +21,7 @@ class TermCoherenceFactorTest { WordMetadata.POSITIONS_MASK, WordMetadata.POSITIONS_MASK ); - long mask = termCoherenceFactor.combinedMask(allPositionsSet); + long mask = CompiledQueryAggregates.longBitmaskAggregate(allPositionsSet, score -> score.positions() & WordMetadata.POSITIONS_MASK); assertEquals(1.0, termCoherenceFactor.bitsSetFactor(mask), 0.01); @@ -33,7 +34,7 @@ class TermCoherenceFactorTest { 0, 0 ); - long mask = termCoherenceFactor.combinedMask(allPositionsSet); + long mask = CompiledQueryAggregates.longBitmaskAggregate(allPositionsSet, score -> score.positions() & WordMetadata.POSITIONS_MASK); assertEquals(0, termCoherenceFactor.bitsSetFactor(mask), 0.01); @@ -46,7 +47,7 @@ class TermCoherenceFactorTest { List.of(0, 1, 2, 3), List.of(0, 1, 2, 3) ); - long mask = termCoherenceFactor.combinedMask(positions); + long mask = CompiledQueryAggregates.longBitmaskAggregate(positions, score -> score.positions() & WordMetadata.POSITIONS_MASK); printMask(mask); } @@ -57,7 +58,7 @@ class TermCoherenceFactorTest { List.of(55, 54, 53, 52), List.of(55, 54, 53, 52) ); - long mask = termCoherenceFactor.combinedMask(positions); + long mask = CompiledQueryAggregates.longBitmaskAggregate(positions, score -> score.positions() & WordMetadata.POSITIONS_MASK); printMask(mask); } @@ -72,7 +73,7 @@ class TermCoherenceFactorTest { System.out.println(BrailleBlockPunchCards.printBits(mask, 48)); } - ResultKeywordSet createSet(List... maskPositions) { + CompiledQuery createSet(List... maskPositions) { long[] positions = new long[maskPositions.length]; for (int i = 0; i < maskPositions.length; i++) { @@ -84,14 +85,14 @@ class TermCoherenceFactorTest { return createSet(positions); } - ResultKeywordSet createSet(long... positionMasks) { + CompiledQuery createSet(long... 
positionMasks) { List keywords = new ArrayList<>(); for (int i = 0; i < positionMasks.length; i++) { - keywords.add(new SearchResultKeywordScore(0, "", + keywords.add(new SearchResultKeywordScore("", 0, new WordMetadata(positionMasks[i], (byte) 0).encode(), 0, 0)); } - return new ResultKeywordSet(keywords); + return CompiledQuery.just(keywords.toArray(SearchResultKeywordScore[]::new)); } } \ No newline at end of file diff --git a/code/libraries/array/java/nu/marginalia/array/algo/LongArrayBase.java b/code/libraries/array/java/nu/marginalia/array/algo/LongArrayBase.java index 39d9bff7..ab7f18bd 100644 --- a/code/libraries/array/java/nu/marginalia/array/algo/LongArrayBase.java +++ b/code/libraries/array/java/nu/marginalia/array/algo/LongArrayBase.java @@ -1,5 +1,7 @@ package nu.marginalia.array.algo; +import nu.marginalia.array.LongArray; + import java.io.IOException; import java.nio.LongBuffer; import java.nio.channels.FileChannel; @@ -61,6 +63,12 @@ public interface LongArrayBase extends BulkTransferArray { } } + default void get(long start, long end, LongArray buffer, int bufferStart) { + for (int i = 0; i < (end-start); i++) { + buffer.set(i + bufferStart, get(start + i)); + } + } + default void get(long start, LongBuffer buffer) { get(start, start + buffer.remaining(), buffer, buffer.position()); } diff --git a/code/libraries/array/java/nu/marginalia/array/buffer/LongQueryBuffer.java b/code/libraries/array/java/nu/marginalia/array/buffer/LongQueryBuffer.java index 390325ee..d5b44389 100644 --- a/code/libraries/array/java/nu/marginalia/array/buffer/LongQueryBuffer.java +++ b/code/libraries/array/java/nu/marginalia/array/buffer/LongQueryBuffer.java @@ -1,5 +1,8 @@ package nu.marginalia.array.buffer; +import nu.marginalia.array.LongArray; +import nu.marginalia.array.LongArrayFactory; + import java.util.Arrays; /** A buffer for long values that can be used to filter and manipulate the data. @@ -17,7 +20,7 @@ import java.util.Arrays; public class LongQueryBuffer { /** Direct access to the data in the buffer, * guaranteed to be populated until `end` */ - public final long[] data; + public final LongArray data; /** Number of items in the data buffer */ public int end; @@ -25,18 +28,27 @@ public class LongQueryBuffer { private int read = 0; private int write = 0; + private LongQueryBuffer(LongArray array, int size) { + this.data = array; + this.end = size; + } + public LongQueryBuffer(int size) { - this.data = new long[size]; + this.data = LongArrayFactory.onHeapConfined(size); this.end = size; } public LongQueryBuffer(long[] data, int size) { - this.data = data; + this.data = LongArrayFactory.onHeapConfined(size); + this.data.set(0, data); + this.end = size; } public long[] copyData() { - return Arrays.copyOf(data, end); + long[] copy = new long[end]; + data.forEach(0, end, (pos, val) -> copy[(int)pos]=val ); + return copy; } public boolean isEmpty() { @@ -48,7 +60,7 @@ public class LongQueryBuffer { } public void reset() { - end = data.length; + end = (int) data.size(); read = 0; write = 0; } @@ -59,12 +71,16 @@ public class LongQueryBuffer { write = 0; } + public LongQueryBuffer slice(int start, int end) { + return new LongQueryBuffer(data.range(start, end), end - start); + } + /* == Filtering methods == */ /** Returns the current value at the read pointer. */ public long currentValue() { - return data[read]; + return data.get(read); } /** Advances the read pointer and returns true if there are more values to read. 
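The slice() method added here is what the rewritten QueryFilterAnyOf.apply() earlier in this patch operates on: each branch of the disjunction only sees the values that previous branches rejected, and the retained region is re-sorted once at the end. A small usage sketch in the style of QueryFilterStepIfTest (illustrative, not part of the change):

    // OR of "even" and "<= 5" over the values 1..10
    var buffer = new LongQueryBuffer(new long[] { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }, 10);

    var even  = new QueryFilterStepFromPredicate(value -> value % 2 == 0);
    var small = new QueryFilterStepFromPredicate(value -> value <= 5);

    new QueryFilterAnyOf(List.of(even, small)).apply(buffer);

    // The retained region is sorted before the buffer is finalized:
    // buffer.copyData() == { 1, 2, 3, 4, 5, 6, 8, 10 }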
*/ @@ -79,9 +95,9 @@ public class LongQueryBuffer { */ public boolean retainAndAdvance() { if (read != write) { - long tmp = data[write]; - data[write] = data[read]; - data[read] = tmp; + long tmp = data.get(write); + data.set(write, data.get(read)); + data.set(read, tmp); } write++; @@ -117,9 +133,10 @@ public class LongQueryBuffer { write = 0; } - public void startFilterForRange(int pos, int end) { - read = write = pos; - this.end = end; + public void finalizeFiltering(int pos) { + end = write; + read = pos; + write = pos; } /** Retain only unique values in the buffer, and update the end pointer to the new length. @@ -153,7 +170,7 @@ public class LongQueryBuffer { "read = " + read + ",write = " + write + ",end = " + end + - ",data = [" + Arrays.toString(Arrays.copyOf(data, end)) + "]]"; + ",data = [" + Arrays.toString(copyData()) + "]]"; } diff --git a/code/libraries/array/test/nu/marginalia/array/algo/LongArraySearchTest.java b/code/libraries/array/test/nu/marginalia/array/algo/LongArraySearchTest.java index a515917b..fa50045e 100644 --- a/code/libraries/array/test/nu/marginalia/array/algo/LongArraySearchTest.java +++ b/code/libraries/array/test/nu/marginalia/array/algo/LongArraySearchTest.java @@ -143,7 +143,7 @@ class LongArraySearchTest { assertEquals(43, buffer.size()); for (int i = 0; i < 43; i++) { - assertEquals(buffer.data[i], i*3); + assertEquals(buffer.data.get(i), i*3); } } @@ -160,7 +160,7 @@ class LongArraySearchTest { int j = 0; for (int i = 0; i < 43; i++) { if (++j % 3 == 0) j++; - assertEquals(buffer.data[i], j); + assertEquals(buffer.data.get(i), j); } } } \ No newline at end of file diff --git a/code/libraries/btree/java/nu/marginalia/btree/BTreeReader.java b/code/libraries/btree/java/nu/marginalia/btree/BTreeReader.java index 048e0301..bc40bb43 100644 --- a/code/libraries/btree/java/nu/marginalia/btree/BTreeReader.java +++ b/code/libraries/btree/java/nu/marginalia/btree/BTreeReader.java @@ -109,8 +109,8 @@ public class BTreeReader { return ip.findData(key); } - public void readData(long[] buf, int n, long pos) { - data.get(pos, pos + n, buf); + public void readData(LongArray buf, int n, long pos) { + data.get(pos, pos + n, buf, 0); } /** Used for querying interlaced data in the btree. 
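Taken together, the buffer's filtering protocol is conceptually unchanged by the move from long[] to LongArray: a filter step walks the values, compacts the ones it keeps to the front, then finalizes. A rough sketch of that loop follows; rejectAndAdvance() is assumed as the counterpart of retainAndAdvance() since its declaration falls outside the hunks shown, and both are assumed to return whether values remain to read:

    // Sketch of a predicate step driving a LongQueryBuffer. Retained values are
    // swapped down to [0, write); finalizeFiltering(0) then makes that range the
    // buffer's new contents (end = write, read = write = 0 per the code above).
    void applyPredicate(LongQueryBuffer buffer, java.util.function.LongPredicate predicate) {
        boolean hasMore = !buffer.isEmpty();

        while (hasMore) {
            if (predicate.test(buffer.currentValue()))
                hasMore = buffer.retainAndAdvance();
            else
                hasMore = buffer.rejectAndAdvance(); // assumed counterpart, not shown here
        }

        buffer.finalizeFiltering(0);
    }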
diff --git a/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithIndexTest.java b/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithIndexTest.java index 8b65753d..be24de10 100644 --- a/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithIndexTest.java +++ b/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithIndexTest.java @@ -32,7 +32,8 @@ public class BTreeReaderRejectRetainWithIndexTest { @Test public void testRetain() { LongQueryBuffer odds = new LongQueryBuffer(50); - Arrays.setAll(odds.data, i -> 2L*i + 1); + for (int i = 0; i < 50; i++) + odds.data.set(i, 2L*i + 1); BTreeReader reader = new BTreeReader(array, ctx, 0); reader.retainEntries(odds); @@ -46,7 +47,8 @@ public class BTreeReaderRejectRetainWithIndexTest { @Test public void testReject() { LongQueryBuffer odds = new LongQueryBuffer(50); - Arrays.setAll(odds.data, i -> 2L*i + 1); + for (int i = 0; i < 50; i++) + odds.data.set(i, 2L*i + 1); BTreeReader reader = new BTreeReader(array, ctx, 0); reader.rejectEntries(odds); diff --git a/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithoutIndexTest.java b/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithoutIndexTest.java index e5d4dc79..fc3b71df 100644 --- a/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithoutIndexTest.java +++ b/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithoutIndexTest.java @@ -32,7 +32,8 @@ public class BTreeReaderRejectRetainWithoutIndexTest { @Test public void testRetain() { LongQueryBuffer odds = new LongQueryBuffer(50); - Arrays.setAll(odds.data, i -> 2L*i + 1); + for (int i = 0; i < 50; i++) + odds.data.set(i, 2L*i + 1); BTreeReader reader = new BTreeReader(array, ctx, 0); reader.retainEntries(odds); @@ -46,7 +47,9 @@ public class BTreeReaderRejectRetainWithoutIndexTest { @Test public void testReject() { LongQueryBuffer odds = new LongQueryBuffer(50); - Arrays.setAll(odds.data, i -> 2L*i + 1); + for (int i = 0; i < 50; i++) + odds.data.set(i, 2L*i + 1); + BTreeReader reader = new BTreeReader(array, ctx, 0); reader.rejectEntries(odds); diff --git a/code/services-application/search-service/java/nu/marginalia/search/SearchQueryParamFactory.java b/code/services-application/search-service/java/nu/marginalia/search/SearchQueryParamFactory.java index 15c8567e..cc28b209 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/SearchQueryParamFactory.java +++ b/code/services-application/search-service/java/nu/marginalia/search/SearchQueryParamFactory.java @@ -1,7 +1,7 @@ package nu.marginalia.search; import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.QueryStrategy; @@ -14,7 +14,7 @@ import java.util.List; public class SearchQueryParamFactory { public QueryParams forRegularSearch(SearchParameters userParams) { - SearchSubquery prototype = new SearchSubquery(); + SearchQuery prototype = new SearchQuery(); var profile = userParams.profile(); profile.addTacitTerms(prototype); diff --git a/code/services-application/search-service/java/nu/marginalia/search/command/SearchAdtechParameter.java 
b/code/services-application/search-service/java/nu/marginalia/search/command/SearchAdtechParameter.java index 9e8383f3..ce3bf099 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/command/SearchAdtechParameter.java +++ b/code/services-application/search-service/java/nu/marginalia/search/command/SearchAdtechParameter.java @@ -1,6 +1,6 @@ package nu.marginalia.search.command; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import javax.annotation.Nullable; import java.util.Arrays; @@ -23,7 +23,7 @@ public enum SearchAdtechParameter { return DEFAULT; } - public void addTacitTerms(SearchSubquery subquery) { + public void addTacitTerms(SearchQuery subquery) { subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms)); } } diff --git a/code/services-application/search-service/java/nu/marginalia/search/command/SearchJsParameter.java b/code/services-application/search-service/java/nu/marginalia/search/command/SearchJsParameter.java index 6c8634ac..8cf6aada 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/command/SearchJsParameter.java +++ b/code/services-application/search-service/java/nu/marginalia/search/command/SearchJsParameter.java @@ -1,6 +1,6 @@ package nu.marginalia.search.command; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import javax.annotation.Nullable; import java.util.Arrays; @@ -25,7 +25,7 @@ public enum SearchJsParameter { return DEFAULT; } - public void addTacitTerms(SearchSubquery subquery) { + public void addTacitTerms(SearchQuery subquery) { subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms)); } } diff --git a/code/services-application/search-service/java/nu/marginalia/search/model/SearchProfile.java b/code/services-application/search-service/java/nu/marginalia/search/model/SearchProfile.java index 27d9f4aa..955c3fcb 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/model/SearchProfile.java +++ b/code/services-application/search-service/java/nu/marginalia/search/model/SearchProfile.java @@ -2,7 +2,7 @@ package nu.marginalia.search.model; import nu.marginalia.index.query.limit.SpecificationLimit; import nu.marginalia.model.crawl.HtmlFeature; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier; import java.util.Objects; @@ -47,7 +47,7 @@ public enum SearchProfile { return NO_FILTER; } - public void addTacitTerms(SearchSubquery subquery) { + public void addTacitTerms(SearchQuery subquery) { if (this == ACADEMIA) { subquery.searchTermsAdvice.add("special:academia"); } From ae7c760772ab256ae1e13d30f626e9f954d4df0e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 5 Apr 2024 13:30:49 +0200 Subject: [PATCH 21/47] (index) Clean up new index query code --- .../model/compiled/CompiledQueryLong.java | 8 + .../model/compiled/CqDataLong.java | 4 + .../aggregate/CompiledQueryAggregates.java | 1 + .../index/index/QueryBranchWalker.java | 74 ++++++--- .../marginalia/index/index/StatefulIndex.java | 153 +++++++++--------- .../marginalia/index/model/SearchTerms.java | 81 ++-------- .../index/query/filter/QueryFilterAllOf.java | 18 ++- .../index/query/filter/QueryFilterAnyOf.java | 41 ++++- .../array/buffer/LongQueryBuffer.java | 6 - 9 files changed, 
208 insertions(+), 178 deletions(-) diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java index 639778dc..94fa0e8b 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java @@ -39,4 +39,12 @@ public class CompiledQueryLong implements Iterable { public Iterator iterator() { return stream().iterator(); } + + public long[] copyData() { + return data.copyData(); + } + + public boolean isEmpty() { + return data.size() == 0; + } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataLong.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataLong.java index 8049631e..24f76b13 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataLong.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataLong.java @@ -24,4 +24,8 @@ public class CqDataLong { public int size() { return data.length; } + + public long[] copyData() { + return Arrays.copyOf(data, data.length); + } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java index 209acbee..9c4abe72 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java @@ -8,6 +8,7 @@ import java.util.ArrayList; import java.util.List; import java.util.function.*; +/** Contains methods for aggregating across a CompiledQuery or CompiledQueryLong */ public class CompiledQueryAggregates { /** Compiled query aggregate that for a single boolean that treats or-branches as logical OR, * and and-branches as logical AND operations. Will return true if there exists a path through diff --git a/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java b/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java index a465bd86..34b04f0a 100644 --- a/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java +++ b/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java @@ -1,13 +1,18 @@ package nu.marginalia.index.index; import it.unimi.dsi.fastutil.longs.LongArrayList; +import it.unimi.dsi.fastutil.longs.LongArraySet; import it.unimi.dsi.fastutil.longs.LongSet; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.LinkedList; import java.util.List; -class QueryBranchWalker { +/** Helper class for index query construction */ +public class QueryBranchWalker { + private static final Logger logger = LoggerFactory.getLogger(QueryBranchWalker.class); public final long[] priorityOrder; public final List paths; public final long termId; @@ -22,56 +27,81 @@ class QueryBranchWalker { return priorityOrder.length == 0; } + /** Group the provided paths by the lowest termId they contain per the provided priorityOrder, + * into a list of QueryBranchWalkers. 
This can be performed iteratively on the resultant QBWs + * to traverse the tree via the next() method. + *

    + * The paths can be extracted through the {@link nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates CompiledQueryAggregates} + * queriesAggregate method. + */ public static List create(long[] priorityOrder, List paths) { + if (paths.isEmpty()) + return List.of(); List ret = new ArrayList<>(); List remainingPaths = new LinkedList<>(paths); - remainingPaths.removeIf(LongSet::isEmpty); + List pathsForPrio = new ArrayList<>(); + for (int i = 0; i < priorityOrder.length; i++) { - long prio = priorityOrder[i]; + long termId = priorityOrder[i]; var it = remainingPaths.iterator(); - List pathsForPrio = new ArrayList<>(); while (it.hasNext()) { var path = it.next(); - if (path.contains(prio)) { - path.remove(prio); + if (path.contains(termId)) { + // Remove the current termId from the path + path.remove(termId); + + // Add it to the set of paths associated with the termId pathsForPrio.add(path); + + // Remove it from consideration it.remove(); } } if (!pathsForPrio.isEmpty()) { - LongArrayList remainingPrios = new LongArrayList(pathsForPrio.size()); - - for (var p : priorityOrder) { - for (var path : pathsForPrio) { - if (path.contains(p)) { - remainingPrios.add(p); - break; - } - } - } - - ret.add(new QueryBranchWalker(remainingPrios.elements(), pathsForPrio, prio)); + long[] newPrios = keepRelevantPriorities(priorityOrder, pathsForPrio); + ret.add(new QueryBranchWalker(newPrios, new ArrayList<>(pathsForPrio), termId)); + pathsForPrio.clear(); } } + // This happens if the priorityOrder array doesn't contain all items in the paths, + // in practice only when an index doesn't contain all the search terms, so we can just + // skip those paths if (!remainingPaths.isEmpty()) { - System.out.println("Dropping: " + remainingPaths); + logger.info("Dropping: {}", remainingPaths); } return ret; } - public List next() { - if (atEnd()) - return List.of(); + /** From the provided priorityOrder array, keep the elements that are present in any set in paths */ + private static long[] keepRelevantPriorities(long[] priorityOrder, List paths) { + LongArrayList remainingPrios = new LongArrayList(paths.size()); + // these sets are typically very small so array set is a good choice + LongSet allElements = new LongArraySet(priorityOrder.length); + for (var path : paths) { + allElements.addAll(path); + } + + for (var p : priorityOrder) { + if (allElements.contains(p)) + remainingPrios.add(p); + } + + return remainingPrios.elements(); + } + + /** Convenience method that applies the create() method + * to the priority order and paths associated with this instance */ + public List next() { return create(priorityOrder, paths); } diff --git a/code/index/java/nu/marginalia/index/index/StatefulIndex.java b/code/index/java/nu/marginalia/index/index/StatefulIndex.java index 0f55c0c8..273da2d0 100644 --- a/code/index/java/nu/marginalia/index/index/StatefulIndex.java +++ b/code/index/java/nu/marginalia/index/index/StatefulIndex.java @@ -4,7 +4,6 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import it.unimi.dsi.fastutil.longs.LongSet; -import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; import nu.marginalia.index.query.filter.QueryFilterAllOf; import nu.marginalia.index.query.filter.QueryFilterAnyOf; @@ -25,9 +24,7 @@ import java.util.*; import java.util.concurrent.locks.Lock; import 
java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; -import java.util.function.LongFunction; import java.util.function.Predicate; -import java.util.stream.Collectors; /** This class delegates SearchIndexReader and deals with the stateful nature of the index, * i.e. it may be possible to reconstruct the index and load a new set of data. @@ -95,7 +92,6 @@ public class StatefulIndex { logger.error("Uncaught exception", ex); } finally { - lock.unlock(); } @@ -113,62 +109,6 @@ public class StatefulIndex { return combinedIndexReader != null && combinedIndexReader.isLoaded(); } - private Predicate containsOnly(long[] permitted) { - LongSet permittedTerms = new LongOpenHashSet(permitted); - return permittedTerms::containsAll; - } - - private List createBuilders(CompiledQueryLong query, - LongFunction builderFactory, - long[] termPriority) { - List paths = CompiledQueryAggregates.queriesAggregate(query); - - // Remove any paths that do not contain all prioritized terms, as this means - // the term is missing from the index and can never be found - paths.removeIf(containsOnly(termPriority).negate()); - - List helpers = QueryBranchWalker.create(termPriority, paths); - List builders = new ArrayList<>(); - - for (var helper : helpers) { - var builder = builderFactory.apply(helper.termId); - - builders.add(builder); - - if (helper.atEnd()) - continue; - - var filters = helper.next().stream() - .map(this::createFilter) - .toList(); - - builder.addInclusionFilterAny(filters); - } - - return builders; - } - - private QueryFilterStepIf createFilter(QueryBranchWalker helper) { - var selfCondition = combinedIndexReader.hasWordFull(helper.termId); - if (helper.atEnd()) - return selfCondition; - - var nextSteps = helper.next(); - var nextFilters = nextSteps.stream() - .map(this::createFilter) - .map(filter -> new QueryFilterAllOf(List.of(selfCondition, filter))) - .collect(Collectors.toList()); - - if (nextFilters.isEmpty()) - return selfCondition; - - if (nextFilters.size() == 1) - return nextFilters.getFirst(); - - - return new QueryFilterAnyOf(nextFilters); - } - public List createQueries(SearchTerms terms, QueryParams params) { if (!isLoaded()) { @@ -176,29 +116,99 @@ public class StatefulIndex { return Collections.emptyList(); } - final long[] orderedIncludes = terms.sortedDistinctIncludes(this::compareKeywords); - final long[] orderedIncludesPrio = terms.sortedDistinctIncludes(this::compareKeywordsPrio); - List queryHeads = new ArrayList<>(10); - queryHeads.addAll(createBuilders(terms.compiledQuery(), combinedIndexReader::findFullWord, orderedIncludes)); - queryHeads.addAll(createBuilders(terms.compiledQuery(), combinedIndexReader::findPriorityWord, orderedIncludesPrio)); + final long[] termPriority = terms.sortedDistinctIncludes(this::compareKeywords); + List paths = CompiledQueryAggregates.queriesAggregate(terms.compiledQuery()); - List queries = new ArrayList<>(10); + // Remove any paths that do not contain all prioritized terms, as this means + // the term is missing from the index and can never be found + paths.removeIf(containsAll(termPriority).negate()); + List helpers = QueryBranchWalker.create(termPriority, paths); + + for (var helper : helpers) { + for (var builder : List.of( + combinedIndexReader.findPriorityWord(helper.termId), + combinedIndexReader.findFullWord(helper.termId) + )) + { + queryHeads.add(builder); + + if (helper.atEnd()) + continue; + + List filterSteps = new ArrayList<>(); + for (var step : helper.next()) { + 
filterSteps.add(createFilter(step, 0)); + } + builder.addInclusionFilterAny(filterSteps); + } + } + + List ret = new ArrayList<>(10); + + // Add additional conditions to the query heads for (var query : queryHeads) { + // Advice terms are a special case, mandatory but not ranked, and exempt from re-writing + for (long term : terms.advice()) { + query = query.alsoFull(term); + } + for (long term : terms.excludes()) { query = query.notFull(term); } // Run these filter steps last, as they'll worst-case cause as many page faults as there are // items in the buffer - queries.add(query.addInclusionFilter(combinedIndexReader.filterForParams(params)).build()); + ret.add(query.addInclusionFilter(combinedIndexReader.filterForParams(params)).build()); } - return queries; + return ret; + } + + /** Recursively create a filter step based on the QBW and its children */ + private QueryFilterStepIf createFilter(QueryBranchWalker walker, int depth) { + final QueryFilterStepIf ownFilterCondition = ownFilterCondition(walker, depth); + + var childSteps = walker.next(); + + if (childSteps.isEmpty()) + return ownFilterCondition; + + List combinedFilters = new ArrayList<>(); + + for (var step : childSteps) { + // Recursion will be limited to a fairly shallow stack depth due to how the queries are constructed. + var childFilter = createFilter(step, depth+1); + combinedFilters.add(new QueryFilterAllOf(ownFilterCondition, childFilter)); + } + + if (combinedFilters.size() == 1) + return combinedFilters.getFirst(); + else + return new QueryFilterAnyOf(combinedFilters); + } + + /** Create a filter condition based on the termId associated with the QBW */ + private QueryFilterStepIf ownFilterCondition(QueryBranchWalker walker, int depth) { + if (depth < 2) { + // At shallow depths we prioritize terms that appear in the priority index, + // to increase the odds we find "good" results before the sand runs out + return new QueryFilterAnyOf( + combinedIndexReader.hasWordPrio(walker.termId), + combinedIndexReader.hasWordFull(walker.termId) + ); + } else { + return combinedIndexReader.hasWordFull(walker.termId); + } + } + + private Predicate containsAll(long[] permitted) { + LongSet permittedTerms = new LongOpenHashSet(permitted); + return permittedTerms::containsAll; } private int compareKeywords(long a, long b) { @@ -208,13 +218,6 @@ public class StatefulIndex { ); } - private int compareKeywordsPrio(long a, long b) { - return Long.compare( - combinedIndexReader.numHitsPrio(a), - combinedIndexReader.numHitsPrio(b) - ); - } - /** Return an array of encoded document metadata longs corresponding to the * document identifiers provided; with metadata for termId. The input array * docs[] *must* be sorted. 
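To make the QueryBranchWalker partitioning scheme described above easier to follow, here is a minimal, self-contained sketch of the same idea using plain java.util collections rather than fastutil; the class and member names are illustrative stand-ins, not the actual API. Each term in priority order claims every remaining path that contains it, the term is stripped from those paths, and the claimed paths become the children that next() descends into.

import java.util.*;

/** Illustrative sketch of the QueryBranchWalker partitioning step. */
class BranchWalkerSketch {
    final long termId;                  // the term this branch claims
    final List<Set<Long>> childPaths;   // remaining terms of the paths that contained it

    BranchWalkerSketch(long termId, List<Set<Long>> childPaths) {
        this.termId = termId;
        this.childPaths = childPaths;
    }

    /** Partition the paths by the first term (in priority order) they contain. */
    static List<BranchWalkerSketch> create(long[] priorityOrder, List<Set<Long>> paths) {
        List<BranchWalkerSketch> ret = new ArrayList<>();
        List<Set<Long>> remaining = new ArrayList<>(paths);
        remaining.removeIf(Set::isEmpty);

        for (long termId : priorityOrder) {
            List<Set<Long>> claimed = new ArrayList<>();

            var it = remaining.iterator();
            while (it.hasNext()) {
                Set<Long> path = it.next();
                if (path.remove(termId)) { // the path contained this term
                    claimed.add(path);     // it becomes a child of this branch
                    it.remove();           // and is no longer up for grabs
                }
            }

            if (!claimed.isEmpty())
                ret.add(new BranchWalkerSketch(termId, claimed));
        }

        // any paths left over reference terms absent from priorityOrder and are dropped
        return ret;
    }

    /** Traversal simply re-applies create() to the child paths. */
    List<BranchWalkerSketch> next(long[] priorityOrder) {
        return create(priorityOrder, childPaths);
    }
}

In the real implementation each child walker also keeps a filtered copy of the priority order (keepRelevantPriorities), which this sketch omits for brevity.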
diff --git a/code/index/java/nu/marginalia/index/model/SearchTerms.java b/code/index/java/nu/marginalia/index/model/SearchTerms.java index 307e4179..8115c109 100644 --- a/code/index/java/nu/marginalia/index/model/SearchTerms.java +++ b/code/index/java/nu/marginalia/index/model/SearchTerms.java @@ -3,54 +3,35 @@ package nu.marginalia.index.model; import it.unimi.dsi.fastutil.longs.LongArrayList; import it.unimi.dsi.fastutil.longs.LongComparator; import it.unimi.dsi.fastutil.longs.LongList; -import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.query.SearchQuery; import java.util.ArrayList; import java.util.List; -import java.util.Objects; import static nu.marginalia.index.model.SearchTermsUtil.getWordId; public final class SearchTerms { - private final LongList includes; + private final LongList advice; private final LongList excludes; private final LongList priority; private final List coherences; private final CompiledQueryLong compiledQueryIds; - public SearchTerms( - LongList includes, - LongList excludes, - LongList priority, - List coherences, - CompiledQueryLong compiledQueryIds - ) { - this.includes = includes; - this.excludes = excludes; - this.priority = priority; - this.coherences = coherences; + public SearchTerms(SearchQuery query, + CompiledQueryLong compiledQueryIds) + { + this.excludes = new LongArrayList(); + this.priority = new LongArrayList(); + this.coherences = new ArrayList<>(); + this.advice = new LongArrayList(); this.compiledQueryIds = compiledQueryIds; - } - public SearchTerms(SearchQuery query, CompiledQueryLong compiledQueryIds) { - this(new LongArrayList(), - new LongArrayList(), - new LongArrayList(), - new ArrayList<>(), - compiledQueryIds); - - for (var word : query.searchTermsInclude) { - includes.add(getWordId(word)); - } for (var word : query.searchTermsAdvice) { - // This looks like a bug, but it's not - includes.add(getWordId(word)); + advice.add(getWordId(word)); } - for (var coherence : query.searchTermCoherences) { LongList parts = new LongArrayList(coherence.size()); @@ -64,36 +45,29 @@ public final class SearchTerms { for (var word : query.searchTermsExclude) { excludes.add(getWordId(word)); } + for (var word : query.searchTermsPriority) { priority.add(getWordId(word)); } } public boolean isEmpty() { - return includes.isEmpty(); + return compiledQueryIds.isEmpty(); } public long[] sortedDistinctIncludes(LongComparator comparator) { - if (includes.isEmpty()) - return includes.toLongArray(); - - LongList list = new LongArrayList(new LongOpenHashSet(includes)); + LongList list = new LongArrayList(compiledQueryIds.copyData()); list.sort(comparator); return list.toLongArray(); } - public int size() { - return includes.size() + excludes.size() + priority.size(); - } - - public LongList includes() { - return includes; - } public LongList excludes() { return excludes; } - + public LongList advice() { + return advice; + } public LongList priority() { return priority; } @@ -104,29 +78,4 @@ public final class SearchTerms { public CompiledQueryLong compiledQuery() { return compiledQueryIds; } - @Override - public boolean equals(Object obj) { - if (obj == this) return true; - if (obj == null || obj.getClass() != this.getClass()) return false; - var that = (SearchTerms) obj; - return Objects.equals(this.includes, that.includes) && - Objects.equals(this.excludes, that.excludes) && - Objects.equals(this.priority, that.priority) && - 
Objects.equals(this.coherences, that.coherences); - } - - @Override - public int hashCode() { - return Objects.hash(includes, excludes, priority, coherences); - } - - @Override - public String toString() { - return "SearchTerms[" + - "includes=" + includes + ", " + - "excludes=" + excludes + ", " + - "priority=" + priority + ", " + - "coherences=" + coherences + ']'; - } - } diff --git a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAllOf.java b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAllOf.java index 8c20fe98..e9725179 100644 --- a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAllOf.java +++ b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAllOf.java @@ -2,14 +2,28 @@ package nu.marginalia.index.query.filter; import nu.marginalia.array.buffer.LongQueryBuffer; +import java.util.ArrayList; import java.util.List; import java.util.StringJoiner; public class QueryFilterAllOf implements QueryFilterStepIf { - private final List steps; + private final List steps; public QueryFilterAllOf(List steps) { - this.steps = steps; + this.steps = new ArrayList<>(steps.size()); + + for (var step : steps) { + if (step instanceof QueryFilterAllOf allOf) { + this.steps.addAll(allOf.steps); + } + else { + this.steps.add(step); + } + } + } + + public QueryFilterAllOf(QueryFilterStepIf... steps) { + this(List.of(steps)); } public double cost() { diff --git a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAnyOf.java b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAnyOf.java index 2d177645..bea62194 100644 --- a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAnyOf.java +++ b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAnyOf.java @@ -2,14 +2,27 @@ package nu.marginalia.index.query.filter; import nu.marginalia.array.buffer.LongQueryBuffer; +import java.util.ArrayList; import java.util.List; import java.util.StringJoiner; public class QueryFilterAnyOf implements QueryFilterStepIf { - private final List steps; + private final List steps; public QueryFilterAnyOf(List steps) { - this.steps = steps; + this.steps = new ArrayList<>(steps.size()); + + for (var step : steps) { + if (step instanceof QueryFilterAnyOf anyOf) { + this.steps.addAll(anyOf.steps); + } else { + this.steps.add(step); + } + } + } + + public QueryFilterAnyOf(QueryFilterStepIf... steps) { + this(List.of(steps)); } public double cost() { @@ -30,23 +43,37 @@ public class QueryFilterAnyOf implements QueryFilterStepIf { if (steps.isEmpty()) return; + if (steps.size() == 1) { + steps.getFirst().apply(buffer); + return; + } + int start = 0; - int end = buffer.end; + final int endOfValidData = buffer.end; // End of valid data range + + // The filters act as a partitioning function, where anything before buffer.end + // is "in", and is guaranteed to be sorted; and anything after buffer.end is "out" + // but no sorting guaranteed is provided. 
+ + // To provide a conditional filter, we re-sort the "out" range, slice it and apply filtering to the slice for (var step : steps) { - var slice = buffer.slice(start, end); + var slice = buffer.slice(start, endOfValidData); slice.data.quickSort(0, slice.size()); step.apply(slice); start += slice.end; } - buffer.data.quickSort(0, start); - - // Special finalization + // After we're done, read and write pointers should be 0 and "end" should be the length of valid data, + // normally done through buffer.finalizeFiltering(); but that won't work here buffer.reset(); buffer.end = start; + + // After all filters have been applied, we must re-sort all the retained data + // to uphold the sortedness contract + buffer.data.quickSort(0, buffer.end); } public String describe() { diff --git a/code/libraries/array/java/nu/marginalia/array/buffer/LongQueryBuffer.java b/code/libraries/array/java/nu/marginalia/array/buffer/LongQueryBuffer.java index d5b44389..a0312d36 100644 --- a/code/libraries/array/java/nu/marginalia/array/buffer/LongQueryBuffer.java +++ b/code/libraries/array/java/nu/marginalia/array/buffer/LongQueryBuffer.java @@ -133,12 +133,6 @@ public class LongQueryBuffer { write = 0; } - public void finalizeFiltering(int pos) { - end = write; - read = pos; - write = pos; - } - /** Retain only unique values in the buffer, and update the end pointer to the new length. *

    * The buffer is assumed to be sorted up until the end pointer. From b770a1143f1948404068638a352e27075fa9e1b5 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 28 Mar 2024 13:03:04 +0100 Subject: [PATCH 22/47] (run) Fix traefik middleware configuration --- run/install/docker-compose-barebones-1.yml.template | 6 ++---- run/install/docker-compose-barebones-2.yml.template | 6 ++---- run/install/docker-compose-marginalia.yml.template | 12 ++++-------- 3 files changed, 8 insertions(+), 16 deletions(-) diff --git a/run/install/docker-compose-barebones-1.yml.template b/run/install/docker-compose-barebones-1.yml.template index 27a13a0a..e4c71a6b 100644 --- a/run/install/docker-compose-barebones-1.yml.template +++ b/run/install/docker-compose-barebones-1.yml.template @@ -50,8 +50,7 @@ services: - "traefik.enable=true" - "traefik.http.routers.search-service.rule=PathPrefix(`/`)" - "traefik.http.routers.search-service.entrypoints=search" - - "traefik.http.routers.search-service.middlewares=add-xpublic" - - "traefik.http.routers.search-service.middlewares=add-public" + - "traefik.http.routers.search-service.middlewares=add-xpublic,add-public" - "traefik.http.middlewares.add-xpublic.headers.customrequestheaders.X-Public=1" - "traefik.http.middlewares.add-public.addprefix.prefix=/public" control-service: @@ -67,8 +66,7 @@ services: - "traefik.enable=true" - "traefik.http.routers.control-service.rule=PathPrefix(`/`)" - "traefik.http.routers.control-service.entrypoints=control" - - "traefik.http.routers.control-service.middlewares=add-xpublic" - - "traefik.http.routers.control-service.middlewares=add-public" + - "traefik.http.routers.control-service.middlewares=add-xpublic,add-public" - "traefik.http.middlewares.add-xpublic.headers.customrequestheaders.X-Public=1" - "traefik.http.middlewares.add-public.addprefix.prefix=/public" mariadb: diff --git a/run/install/docker-compose-barebones-2.yml.template b/run/install/docker-compose-barebones-2.yml.template index b47d5b00..ab9d0a08 100644 --- a/run/install/docker-compose-barebones-2.yml.template +++ b/run/install/docker-compose-barebones-2.yml.template @@ -78,8 +78,7 @@ services: - "traefik.enable=true" - "traefik.http.routers.search-service.rule=PathPrefix(`/`)" - "traefik.http.routers.search-service.entrypoints=search" - - "traefik.http.routers.search-service.middlewares=add-xpublic" - - "traefik.http.routers.search-service.middlewares=add-public" + - "traefik.http.routers.search-service.middlewares=add-xpublic,add-public" - "traefik.http.middlewares.add-xpublic.headers.customrequestheaders.X-Public=1" - "traefik.http.middlewares.add-public.addprefix.prefix=/public" control-service: @@ -95,8 +94,7 @@ services: - "traefik.enable=true" - "traefik.http.routers.control-service.rule=PathPrefix(`/`)" - "traefik.http.routers.control-service.entrypoints=control" - - "traefik.http.routers.control-service.middlewares=add-xpublic" - - "traefik.http.routers.control-service.middlewares=add-public" + - "traefik.http.routers.control-service.middlewares=add-xpublic,add-public" - "traefik.http.middlewares.add-xpublic.headers.customrequestheaders.X-Public=1" - "traefik.http.middlewares.add-public.addprefix.prefix=/public" mariadb: diff --git a/run/install/docker-compose-marginalia.yml.template b/run/install/docker-compose-marginalia.yml.template index b92543e7..b8eb89c0 100644 --- a/run/install/docker-compose-marginalia.yml.template +++ b/run/install/docker-compose-marginalia.yml.template @@ -82,8 +82,7 @@ services: - "traefik.enable=true" - 
"traefik.http.routers.search-service.rule=PathPrefix(`/`)" - "traefik.http.routers.search-service.entrypoints=search" - - "traefik.http.routers.search-service.middlewares=add-xpublic" - - "traefik.http.routers.search-service.middlewares=add-public" + - "traefik.http.routers.search-service.middlewares=add-xpublic,add-public" - "traefik.http.middlewares.add-xpublic.headers.customrequestheaders.X-Public=1" - "traefik.http.middlewares.add-public.addprefix.prefix=/public" assistant-service: @@ -100,8 +99,7 @@ services: - "traefik.http.routers.assistant-service-screenshot.middlewares=add-public" - "traefik.http.routers.assistant-service-suggest.rule=PathPrefix(`/suggest`)" - "traefik.http.routers.assistant-service-suggest.entrypoints=search" - - "traefik.http.routers.assistant-service-suggest.middlewares=add-xpublic" - - "traefik.http.routers.assistant-service-suggest.middlewares=add-public" + - "traefik.http.routers.assistant-service-suggest.middlewares=add-xpublic,add-public" - "traefik.http.middlewares.add-xpublic.headers.customrequestheaders.X-Public=1" - "traefik.http.middlewares.add-public.addprefix.prefix=/public" api-service: @@ -114,8 +112,7 @@ services: - "traefik.enable=true" - "traefik.http.routers.api-service.rule=PathPrefix(`/`)" - "traefik.http.routers.api-service.entrypoints=api" - - "traefik.http.routers.api-service.middlewares=add-xpublic" - - "traefik.http.routers.api-service.middlewares=add-public" + - "traefik.http.routers.api-service.middlewares=add-xpublic,add-public" - "traefik.http.middlewares.add-xpublic.headers.customrequestheaders.X-Public=1" - "traefik.http.middlewares.add-public.addprefix.prefix=/public" control-service: @@ -131,8 +128,7 @@ services: - "traefik.enable=true" - "traefik.http.routers.control-service.rule=PathPrefix(`/`)" - "traefik.http.routers.control-service.entrypoints=control" - - "traefik.http.routers.control-service.middlewares=add-xpublic" - - "traefik.http.routers.control-service.middlewares=add-public" + - "traefik.http.routers.control-service.middlewares=add-xpublic,add-public" - "traefik.http.middlewares.add-xpublic.headers.customrequestheaders.X-Public=1" - "traefik.http.middlewares.add-public.addprefix.prefix=/public" mariadb: From 617e633d7a3509e12c2e7702dc1903873454f1bc Mon Sep 17 00:00:00 2001 From: Joshua Holland Date: Mon, 1 Apr 2024 00:04:12 -0500 Subject: [PATCH 23/47] Update keywords docs use of explore to browse I can't tell when this happened, but the proper keyword now seems to be browse and not explore. 
--- .../resources/templates/search/parts/search-footer.hdb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/services-application/search-service/resources/templates/search/parts/search-footer.hdb b/code/services-application/search-service/resources/templates/search/parts/search-footer.hdb index 771ebd91..747e7dd0 100644 --- a/code/services-application/search-service/resources/templates/search/parts/search-footer.hdb +++ b/code/services-application/search-service/resources/templates/search/parts/search-footer.hdb @@ -41,7 +41,7 @@ site:example.comDisplay site information about example.com site:example.com keywordSearch example.com for keyword - explore:example.comShow similar websites to example.com + browse:example.comShow similar websites to example.com ip:127.0.0.1Search documents hosted at 127.0.0.1 links:example.comSearch documents linking to example.com From 5766da69ec30adf4bf193b862341d64d899c1844 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 5 Apr 2024 15:12:27 +0200 Subject: [PATCH 24/47] (gradle) Upgrade to Gradle 8.7 This will reduce the hassle of juggling JDK versions for JDK 22, which was not supported by Gradle 8.5. --- gradle/wrapper/gradle-wrapper.properties | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index a5952066..48c0a02c 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,5 +1,5 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-8.5-bin.zip +distributionUrl=https\://services.gradle.org/distributions/gradle-8.7-bin.zip zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists From 7f7021ce64264d4a20ae2dbb744c711ab9f94b1d Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 5 Apr 2024 18:52:58 +0200 Subject: [PATCH 25/47] (sentence-extractor) Fix resource leak in sentence extractor The code would always re-initialize the static ngramLexicon and rdrposTagger fields with new instances even if they were already instantiated, leading to a ton of unnecessary RAM allocation. The modified behavior checks for nullity before creating a new instance. 
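The fix amounts to ordinary guarded lazy initialization: only construct the shared models when the corresponding field is still null. A minimal sketch of the pattern, with illustrative names rather than the actual SentenceExtractor members:

class LazyModelsSketch {
    // shared across extractor instances, so it should only ever be built once
    private static Object ngramModel;

    void ensureModelsLoaded() {
        synchronized (LazyModelsSketch.class) {
            if (ngramModel == null) {              // without this check, every call
                ngramModel = loadExpensiveModel(); // re-allocated the model
            }
        }
    }

    private static Object loadExpensiveModel() {
        return new Object(); // stand-in for NgramLexicon / RDRPOSTagger construction
    }
}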
--- .../language/sentence/SentenceExtractor.java | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java index fd15660f..bb1e3771 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java @@ -60,13 +60,16 @@ public class SentenceExtractor { } synchronized (this) { - ngramLexicon = new NgramLexicon(models); - - try { - rdrposTagger = new RDRPOSTagger(models.posDict, models.posRules); + if (ngramLexicon == null) { + ngramLexicon = new NgramLexicon(models); } - catch (Exception ex) { - throw new IllegalStateException(ex); + + if (rdrposTagger == null) { + try { + rdrposTagger = new RDRPOSTagger(models.posDict, models.posRules); + } catch (Exception ex) { + throw new IllegalStateException(ex); + } } } From ef25d6066606fd11fe6ae266c75440fce9578d19 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 6 Apr 2024 13:28:14 +0200 Subject: [PATCH 26/47] (index) Add origin trace information for index readers This used to be supported by the system but got lost in refactoring at some point. --- .../nu/marginalia/index/ReverseIndexEntrySource.java | 7 +++++-- .../java/nu/marginalia/index/ReverseIndexReader.java | 9 ++++++--- .../test/nu/marginalia/index/ReverseIndexReaderTest.java | 2 +- code/index/java/nu/marginalia/index/IndexFactory.java | 4 ++-- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexEntrySource.java b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexEntrySource.java index 7c12563b..851bf9ab 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexEntrySource.java +++ b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexEntrySource.java @@ -7,6 +7,7 @@ import nu.marginalia.index.query.EntrySource; import static java.lang.Math.min; public class ReverseIndexEntrySource implements EntrySource { + private final String name; private final BTreeReader reader; int pos; @@ -15,9 +16,11 @@ public class ReverseIndexEntrySource implements EntrySource { final int entrySize; private final long wordId; - public ReverseIndexEntrySource(BTreeReader reader, + public ReverseIndexEntrySource(String name, + BTreeReader reader, int entrySize, long wordId) { + this.name = name; this.reader = reader; this.entrySize = entrySize; this.wordId = wordId; @@ -60,6 +63,6 @@ public class ReverseIndexEntrySource implements EntrySource { @Override public String indexName() { - return "Full:" + Long.toHexString(wordId); + return name + ":" + Long.toHexString(wordId); } } diff --git a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java index f37420dd..e37de80d 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java +++ b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java @@ -25,8 +25,11 @@ public class ReverseIndexReader { private final long wordsDataOffset; private final Logger logger = LoggerFactory.getLogger(getClass()); private final BTreeReader wordsBTreeReader; + private final String name; + + public ReverseIndexReader(String name, Path words, Path documents) throws IOException { + 
this.name = name; - public ReverseIndexReader(Path words, Path documents) throws IOException { if (!Files.exists(words) || !Files.exists(documents)) { this.words = null; this.documents = null; @@ -84,7 +87,7 @@ public class ReverseIndexReader { if (offset < 0) return new EmptyEntrySource(); - return new ReverseIndexEntrySource(createReaderNew(offset), 2, wordId); + return new ReverseIndexEntrySource(name, createReaderNew(offset), 2, wordId); } public QueryFilterStepIf also(long wordId) { @@ -92,7 +95,7 @@ public class ReverseIndexReader { if (offset < 0) return new QueryFilterNoPass(); - return new ReverseIndexRetainFilter(createReaderNew(offset), "full", wordId); + return new ReverseIndexRetainFilter(createReaderNew(offset), name, wordId); } public QueryFilterStepIf not(long wordId) { diff --git a/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java b/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java index e6b76249..ed8b4193 100644 --- a/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java +++ b/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java @@ -102,7 +102,7 @@ class ReverseIndexReaderTest { preindex.finalizeIndex(docsFile, wordsFile); preindex.delete(); - return new ReverseIndexReader(wordsFile, docsFile); + return new ReverseIndexReader("test", wordsFile, docsFile); } } \ No newline at end of file diff --git a/code/index/java/nu/marginalia/index/IndexFactory.java b/code/index/java/nu/marginalia/index/IndexFactory.java index 48911546..a1d2f5a5 100644 --- a/code/index/java/nu/marginalia/index/IndexFactory.java +++ b/code/index/java/nu/marginalia/index/IndexFactory.java @@ -41,14 +41,14 @@ public class IndexFactory { public ReverseIndexReader getReverseIndexReader() throws IOException { - return new ReverseIndexReader( + return new ReverseIndexReader("full", ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.CURRENT), ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.CURRENT) ); } public ReverseIndexReader getReverseIndexPrioReader() throws IOException { - return new ReverseIndexReader( + return new ReverseIndexReader("prio", ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.CURRENT), ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.CURRENT) ); From dbdcf459a7ae693baa849b4eaad594640e90c4f2 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 6 Apr 2024 14:34:15 +0200 Subject: [PATCH 27/47] (minor) Remove dead code --- .../ranking/results/ResultKeywordSet.java | 26 ----------- .../ranking/results/ResultValuator.java | 45 ------------------- .../search/model/ClusteredUrlDetails.java | 2 +- 3 files changed, 1 insertion(+), 72 deletions(-) delete mode 100644 code/index/java/nu/marginalia/ranking/results/ResultKeywordSet.java diff --git a/code/index/java/nu/marginalia/ranking/results/ResultKeywordSet.java b/code/index/java/nu/marginalia/ranking/results/ResultKeywordSet.java deleted file mode 100644 index 19405dcb..00000000 --- a/code/index/java/nu/marginalia/ranking/results/ResultKeywordSet.java +++ /dev/null @@ -1,26 +0,0 @@ -package nu.marginalia.ranking.results; - - -import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; - -import 
java.util.List; - -public record ResultKeywordSet(List keywords) { - - public int length() { - return keywords.size(); - } - public boolean isEmpty() { return length() == 0; } - public boolean hasNgram() { - for (var word : keywords) { - if (word.keyword.contains("_")) { - return true; - } - } - return false; - } - @Override - public String toString() { - return "%s[%s]".formatted(getClass().getSimpleName(), keywords); - } -} diff --git a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java index 05ff83d2..862978c9 100644 --- a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java +++ b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java @@ -147,51 +147,6 @@ public class ResultValuator { return (int) -penalty; } - private long documentMetadata(List rawScores) { - for (var score : rawScores) { - return score.encodedDocMetadata(); - } - return 0; - } - - private int htmlFeatures(List rawScores) { - for (var score : rawScores) { - return score.htmlFeatures(); - } - return 0; - } - - private ResultKeywordSet createKeywordSet(List rawScores, - int thisSet) - { - List scoresList = new ArrayList<>(); - - for (var score : rawScores) { - if (score.subquery != thisSet) - continue; - - // Don't consider synthetic keywords for ranking, these are keywords that don't - // have counts. E.g. "tld:edu" - if (score.isKeywordSpecial()) - continue; - - scoresList.add(score); - } - - return new ResultKeywordSet(scoresList); - - } - - private int numberOfSets(List scores) { - int maxSet = 0; - - for (var score : scores) { - maxSet = Math.max(maxSet, score.subquery); - } - - return 1 + maxSet; - } - public static double normalize(double value, double penalty) { if (value < 0) value = 0; diff --git a/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java b/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java index 6dd7390d..6abe7cd1 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java +++ b/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java @@ -46,7 +46,7 @@ public class ClusteredUrlDetails implements Comparable { return urlDetails.resultItem.keywordScores.stream() .filter(score -> !score.keyword.contains(":")) .collect(Collectors.toMap( - score -> score.subquery, + score -> -1, // FIXME score -> score.hasTermFlag(WordFlags.Title) | score.hasTermFlag(WordFlags.ExternalLink) | score.hasTermFlag(WordFlags.UrlDomain) From fcdc843c15cfe3ef212408f8dfcd6b89a57090d1 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 7 Apr 2024 11:24:30 +0200 Subject: [PATCH 28/47] (search) Fix outdated assumptions about the results We no longer break the query into "sets" of search terms and need to adapt the code to not use this assumption. For the API service, we'll simulate the old behavior to keep the API stable. For the search service, we'll introduce a new way of calculating positions through tree aggregation. 
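To make the idea of tree aggregation concrete, here is a toy-sized sketch of what the new CqPositionsOperator (added below) computes: each leaf term contributes a position bitmask, AND-branches combine candidate masks pairwise with bitwise AND, and OR-branches contribute the union of their children's candidates. The node types here are illustrative stand-ins rather than the actual CqExpression API.

import java.util.*;

/** Toy illustration of aggregating position bitmasks over a query tree. */
class PositionsAggregationSketch {
    sealed interface Node permits Leaf, And, Or {}
    record Leaf(long positions) implements Node {}
    record And(List<Node> parts) implements Node {}
    record Or(List<Node> parts) implements Node {}

    /** AND nodes intersect every pair of candidate masks with '&';
     *  OR nodes take the union of their children's candidates. */
    static Set<Long> aggregate(Node node) {
        return switch (node) {
            case Leaf leaf -> Set.of(leaf.positions());
            case Or or -> {
                Set<Long> ret = new HashSet<>();
                for (Node part : or.parts())
                    ret.addAll(aggregate(part));
                yield ret;
            }
            case And and -> {
                Set<Long> acc = Set.of();
                for (Node part : and.parts()) {
                    Set<Long> next = aggregate(part);
                    if (acc.isEmpty()) {
                        acc = next;
                        continue;
                    }
                    Set<Long> combined = new HashSet<>();
                    for (long a : acc)
                        for (long b : next)
                            combined.add(a & b);
                    acc = combined;
                }
                yield acc;
            }
        };
    }
}

For example, an AND over two terms with masks 0b0110 and 0b0011 yields the single candidate 0b0010, while an OR alternative would instead add its own masks to the candidate set.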
--- .../api/searchquery/QueryProtobufCodec.java | 2 + .../aggregate/CompiledQueryAggregates.java | 5 ++ .../aggregate/CqPositionsOperator.java | 79 +++++++++++++++++++ .../results/DecoratedSearchResultItem.java | 3 + .../results/SearchResultKeywordScore.java | 12 +-- .../api/src/main/protobuf/query-api.proto | 1 + .../nu/marginalia/index/IndexGrpcService.java | 1 + .../results/IndexResultValuatorService.java | 21 +++++ .../nu/marginalia/api/ApiSearchOperator.java | 26 +++--- .../search/model/ClusteredUrlDetails.java | 63 +++++++-------- .../search/svc/SearchQueryIndexService.java | 25 +----- 11 files changed, 165 insertions(+), 73 deletions(-) create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java index f0113870..b705917e 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java @@ -121,6 +121,7 @@ public class QueryProtobufCodec { results.getPubYear(), // ??, results.getDataHash(), results.getWordsTotal(), + results.getBestPositions(), results.getRankingScore() ); } @@ -202,6 +203,7 @@ public class QueryProtobufCodec { rpcDecoratedResultItem.getPubYear(), rpcDecoratedResultItem.getDataHash(), rpcDecoratedResultItem.getWordsTotal(), + rpcDecoratedResultItem.getBestPositions(), rpcDecoratedResultItem.getRankingScore() ); } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java index 9c4abe72..0ab0647d 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java @@ -44,4 +44,9 @@ public class CompiledQueryAggregates { public static List queriesAggregate(CompiledQueryLong query) { return new ArrayList<>(query.root().visit(new CqQueryPathsOperator(query))); } + + /** Using the bitwise AND operator, aggregate all possible combined values of the long generated by the provided operator */ + public static LongSet positionsAggregate(CompiledQuery query, ToLongFunction operator) { + return query.root().visit(new CqPositionsOperator(query, operator)); + } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java new file mode 100644 index 00000000..19db2d4b --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java @@ -0,0 +1,79 @@ +package nu.marginalia.api.searchquery.model.compiled.aggregate; + +import it.unimi.dsi.fastutil.longs.LongArraySet; +import it.unimi.dsi.fastutil.longs.LongOpenHashSet; +import it.unimi.dsi.fastutil.longs.LongSet; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CqExpression; + +import java.util.List; +import 
java.util.function.IntToLongFunction; +import java.util.function.ToLongFunction; + +public class CqPositionsOperator implements CqExpression.ObjectVisitor { + private final IntToLongFunction operator; + + public CqPositionsOperator(CompiledQuery query, ToLongFunction operator) { + this.operator = idx -> operator.applyAsLong(query.at(idx)); + } + + @Override + public LongSet onAnd(List parts) { + LongSet ret = new LongArraySet(); + + for (var part : parts) { + ret = comineSets(ret, part.visit(this)); + } + + return ret; + } + + private LongSet comineSets(LongSet a, LongSet b) { + if (a.isEmpty()) + return b; + if (b.isEmpty()) + return a; + + LongSet ret = newSet(a.size() * b.size()); + + var ai = a.longIterator(); + + while (ai.hasNext()) { + long aval = ai.nextLong(); + + var bi = b.longIterator(); + while (bi.hasNext()) { + ret.add(aval & bi.nextLong()); + } + } + + return ret; + } + + @Override + public LongSet onOr(List parts) { + LongSet ret = newSet(parts.size()); + + for (var part : parts) { + ret.addAll(part.visit(this)); + } + + return ret; + } + + @Override + public LongSet onLeaf(int idx) { + var set = newSet(1); + set.add(operator.applyAsLong(idx)); + return set; + } + + /** Allocate a new set suitable for a collection with the provided cardinality */ + private LongSet newSet(int cardinality) { + if (cardinality < 8) + return new LongArraySet(cardinality); + else + return new LongOpenHashSet(cardinality); + } + +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java index b099dc01..df48ea64 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java @@ -30,6 +30,7 @@ public class DecoratedSearchResultItem implements Comparable next.termScore <= scoreLimit) - .toList(); - } + this.rest.removeIf(urlDetail -> { + if (urlDetail.termScore > scoreLimit) + return false; + + for (var keywordScore : urlDetail.resultItem.keywordScores) { + if (keywordScore.isKeywordSpecial()) + continue; + if (keywordScore.positionCount() == 0) + continue; + + if (keywordScore.hasTermFlag(WordFlags.Title)) + return false; + if (keywordScore.hasTermFlag(WordFlags.ExternalLink)) + return false; + if (keywordScore.hasTermFlag(WordFlags.UrlDomain)) + return false; + if (keywordScore.hasTermFlag(WordFlags.UrlPath)) + return false; + if (keywordScore.hasTermFlag(WordFlags.Subjects)) + return false; + } + + return true; + }); } - private boolean isEligbleForInclusion(UrlDetails urlDetails) { - return urlDetails.resultItem.keywordScores.stream() - .filter(score -> !score.keyword.contains(":")) - .collect(Collectors.toMap( - score -> -1, // FIXME - score -> score.hasTermFlag(WordFlags.Title) - | score.hasTermFlag(WordFlags.ExternalLink) - | score.hasTermFlag(WordFlags.UrlDomain) - | score.hasTermFlag(WordFlags.UrlPath) - | score.hasTermFlag(WordFlags.Subjects) - , - (a, b) -> a && b - )) - .containsValue(Boolean.TRUE); - } public ClusteredUrlDetails(@NotNull UrlDetails onlyFirst) { this.first = onlyFirst; diff --git a/code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java b/code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java index 785c8952..6dc7b83b 100644 --- 
a/code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java +++ b/code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java @@ -88,7 +88,7 @@ public class SearchQueryIndexService { DomainIndexingState.ACTIVE, detail.rankingScore, // termScore detail.resultsFromDomain(), - getPositionsString(detail.rawIndexResult), + getPositionsString(detail), detail.rawIndexResult, detail.rawIndexResult.keywordScores )); @@ -97,27 +97,8 @@ public class SearchQueryIndexService { return ret; } - private String getPositionsString(SearchResultItem resultItem) { - Int2LongArrayMap positionsPerSet = new Int2LongArrayMap(8); - - for (var score : resultItem.keywordScores) { - if (!score.isKeywordRegular()) { - continue; - } - positionsPerSet.merge(score.subquery(), score.positions(), this::and); - } - - long bits = positionsPerSet.values().longStream().reduce(this::or).orElse(0); - - return BrailleBlockPunchCards.printBits(bits, 56); + private String getPositionsString(DecoratedSearchResultItem resultItem) { + return BrailleBlockPunchCards.printBits(resultItem.bestPositions, 56); } - - private long and(long a, long b) { - return a & b; - } - private long or(long a, long b) { - return a | b; - } - } From 4b47fadbab24ffcd4a2b765eb32d1a5c8be1bbc2 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 10 Apr 2024 16:58:05 +0200 Subject: [PATCH 29/47] (term-freq-exporter) Extract ngrams in term-frequency-exporter --- .../java/nu/marginalia/extractor/TermFrequencyExporter.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java b/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java index df1e56a9..bdb7362a 100644 --- a/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java +++ b/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java @@ -127,6 +127,10 @@ public class TermFrequencyExporter implements ExporterIf { for (var word : sent) { words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8))); } + + for (var ngram : sent.ngramStemmed) { + words.add(longHash(ngram.getBytes())); + } } synchronized (counts) { From c538c25008b66cb2a69baf677227244be70d7e42 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 10 Apr 2024 17:11:23 +0200 Subject: [PATCH 30/47] (term-freq-exporter) Reduce thread count and memory usage --- .../data-extractors/build.gradle | 1 + .../extractor/TermFrequencyExporter.java | 27 +++++++++---------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/code/features-convert/data-extractors/build.gradle b/code/features-convert/data-extractors/build.gradle index 73aebd49..69ae1388 100644 --- a/code/features-convert/data-extractors/build.gradle +++ b/code/features-convert/data-extractors/build.gradle @@ -21,6 +21,7 @@ dependencies { implementation project(':code:common:model') implementation project(':code:libraries:language-processing') implementation project(':code:libraries:term-frequency-dict') + implementation project(':code:libraries:blocking-thread-pool') implementation project(':code:features-crawl:link-parser') implementation project(':code:features-convert:anchor-keywords') implementation project(':code:process-models:crawling-model') diff --git a/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java 
b/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java index bdb7362a..1e1a2cd5 100644 --- a/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java +++ b/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java @@ -14,6 +14,7 @@ import nu.marginalia.process.log.WorkLog; import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.model.FileStorage; import nu.marginalia.storage.model.FileStorageId; +import nu.marginalia.util.SimpleBlockingThreadPool; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.slf4j.Logger; @@ -53,27 +54,23 @@ public class TermFrequencyExporter implements ExporterIf { TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1); AtomicInteger docCount = new AtomicInteger(); - try (ForkJoinPool fjp = new ForkJoinPool(Math.max(2, Runtime.getRuntime().availableProcessors() / 2))) { + SimpleBlockingThreadPool sjp = new SimpleBlockingThreadPool("exporter", Math.clamp(2, 16, Runtime.getRuntime().availableProcessors() / 2), 4); + Path crawlerLogFile = inputDir.resolve("crawler.log"); - Path crawlerLogFile = inputDir.resolve("crawler.log"); + for (var item : WorkLog.iterable(crawlerLogFile)) { + if (Thread.interrupted()) { + sjp.shutDownNow(); - for (var item : WorkLog.iterable(crawlerLogFile)) { - if (Thread.interrupted()) { - fjp.shutdownNow(); - - throw new InterruptedException(); - } - - Path crawlDataPath = inputDir.resolve(item.relPath()); - fjp.execute(() -> processFile(crawlDataPath, counts, docCount, se.get())); + throw new InterruptedException(); } - while (!fjp.isQuiescent()) { - if (fjp.awaitQuiescence(10, TimeUnit.SECONDS)) - break; - } + Path crawlDataPath = inputDir.resolve(item.relPath()); + sjp.submitQuietly(() -> processFile(crawlDataPath, counts, docCount, se.get())); } + sjp.shutDown(); + sjp.awaitTermination(10, TimeUnit.DAYS); + var tmpFile = Files.createTempFile(destStorage.asPath(), "freqs", ".dat.tmp", PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); From ed73d79ec1563be88fe8b035baadcfa743dc647e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 11 Apr 2024 17:20:13 +0200 Subject: [PATCH 31/47] (qs) Clean up parsing code using new record matching --- .../searchquery/model/query/SearchQuery.java | 2 +- .../query_parser/ExpansionStrategy.java | 7 - .../query_parser/QueryExpansion.java | 10 + .../searchquery/query_parser/QueryParser.java | 158 +++++++++------ .../query_parser/QueryTokenizer.java | 31 +-- .../query_parser/token/QueryToken.java | 86 ++++++++ .../searchquery/query_parser/token/Token.java | 49 ----- .../query_parser/token/TokenType.java | 34 ---- .../query_parser/token/TokenVisitor.java | 14 -- .../searchquery/svc/QueryFactory.java | 183 +++++++++++++----- .../svc/QueryLimitsAccumulator.java | 93 --------- .../svc/QuerySearchTermsAccumulator.java | 105 ---------- .../util/transform_list/TransformList.java | 9 + 13 files changed, 349 insertions(+), 432 deletions(-) delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/ExpansionStrategy.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/QueryToken.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/Token.java delete mode 100644 
code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenType.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenVisitor.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryLimitsAccumulator.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java index 9dd10396..ffe02868 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java @@ -72,7 +72,7 @@ public class SearchQuery { @Override public String toString() { StringBuilder sb = new StringBuilder(); - if (!compiledQuery.isEmpty()) sb.append("compiledQuery=").append(compiledQuery); + if (!compiledQuery.isEmpty()) sb.append("compiledQuery=").append(compiledQuery).append(", "); if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] "))); if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] "))); if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] "))); diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/ExpansionStrategy.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/ExpansionStrategy.java deleted file mode 100644 index 20ebffd1..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/ExpansionStrategy.java +++ /dev/null @@ -1,7 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser; - -import nu.marginalia.functions.searchquery.query_parser.model.QWordGraph; - -public interface ExpansionStrategy { - void expand(QWordGraph graph); -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java index 6415751b..052516d8 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java @@ -15,6 +15,9 @@ import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.IntStream; +/** Responsible for expanding a query, that is creating alternative branches of query execution + * to increase the number of results + */ public class QueryExpansion { private static final PorterStemmer ps = new PorterStemmer(); private final TermFrequencyDict dict; @@ -94,6 +97,10 @@ public class QueryExpansion { } } + /** Create an alternative interpretation of the query that replaces a sequence of words + * with a word n-gram. This makes it so that when possible, the order of words in the document + * matches the order of the words in the query. 
+ */ public void createSegments(QWordGraph graph) { List nodes = new ArrayList<>(); @@ -115,4 +122,7 @@ public class QueryExpansion { } } + public interface ExpansionStrategy { + void expand(QWordGraph graph); + } } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java index bbaf5c87..3f92a594 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java @@ -1,8 +1,7 @@ package nu.marginalia.functions.searchquery.query_parser; +import nu.marginalia.functions.searchquery.query_parser.token.QueryToken; import nu.marginalia.language.WordPatterns; -import nu.marginalia.functions.searchquery.query_parser.token.Token; -import nu.marginalia.functions.searchquery.query_parser.token.TokenType; import nu.marginalia.util.transform_list.TransformList; import java.util.List; @@ -11,95 +10,126 @@ public class QueryParser { private final QueryTokenizer tokenizer = new QueryTokenizer(); - public List parse(String query) { - List basicTokens = tokenizer.tokenizeQuery(query); + public List parse(String query) { + List basicTokens = tokenizer.tokenizeQuery(query); - TransformList list = new TransformList<>(basicTokens); + TransformList list = new TransformList<>(basicTokens); list.transformEach(QueryParser::handleQuoteTokens); list.transformEach(QueryParser::trimLiterals); list.transformEachPair(QueryParser::createNegatedTerms); list.transformEachPair(QueryParser::createPriorityTerms); list.transformEach(QueryParser::handleSpecialOperations); - list.scanAndTransform(TokenType.LPAREN, TokenType.RPAREN, QueryParser::handleAdvisoryTerms); + list.scanAndTransform(QueryToken.LParen.class::isInstance, QueryToken.RParen.class::isInstance, QueryParser::handleAdvisoryTerms); + list.transformEach(QueryParser::normalizeDomainName); return list.getBackingList(); } - private static void handleQuoteTokens(TransformList.Entity entity) { - var t = entity.value(); - if (t.type == TokenType.QUOT) { - entity.replace(new Token(TokenType.QUOT_TERM, - t.str.replaceAll("\\s+", WordPatterns.WORD_TOKEN_JOINER), - t.displayStr)); - } - } - - private static void trimLiterals(TransformList.Entity entity) { + private static void normalizeDomainName(TransformList.Entity entity) { var t = entity.value(); - if (t.type == TokenType.LITERAL_TERM - && (t.str.endsWith(":") || t.str.endsWith(".")) - && t.str.length() > 1) { - entity.replace(new Token(TokenType.LITERAL_TERM, t.str.substring(0, t.str.length() - 1), t.displayStr)); + if (!(t instanceof QueryToken.LiteralTerm)) + return; + + if (t.str().startsWith("site:")) { + entity.replace(new QueryToken.LiteralTerm(t.str().toLowerCase(), t.displayStr())); } } - private static void createNegatedTerms(TransformList.Entity first, TransformList.Entity second) { - var t = first.value(); - var tn = second.value(); - - if (t.type == TokenType.MINUS && tn.type == TokenType.LITERAL_TERM) { - first.remove(); - second.replace(new Token(TokenType.EXCLUDE_TERM, tn.str, "-" + tn.str)); - } - } - - private static void createPriorityTerms(TransformList.Entity first, TransformList.Entity second) { - var t = first.value(); - var tn = second.value(); - - if (t.type == TokenType.QMARK && tn.type == TokenType.LITERAL_TERM) { - first.remove(); - second.replace(new Token(TokenType.PRIORTY_TERM, tn.str, "?" 
+ tn.str)); - } - } - - private static void handleSpecialOperations(TransformList.Entity entity) { + private static void handleQuoteTokens(TransformList.Entity entity) { var t = entity.value(); - if (t.type != TokenType.LITERAL_TERM) { + + if (!(t instanceof QueryToken.Quot)) { return; } - if (t.str.startsWith("q") && t.str.matches("q[=><]\\d+")) { - entity.replace(new Token(TokenType.QUALITY_TERM, t.str.substring(1), t.displayStr)); - } else if (t.str.startsWith("near:")) { - entity.replace(new Token(TokenType.NEAR_TERM, t.str.substring(5), t.displayStr)); - } else if (t.str.startsWith("year") && t.str.matches("year[=><]\\d{4}")) { - entity.replace(new Token(TokenType.YEAR_TERM, t.str.substring(4), t.displayStr)); - } else if (t.str.startsWith("size") && t.str.matches("size[=><]\\d+")) { - entity.replace(new Token(TokenType.SIZE_TERM, t.str.substring(4), t.displayStr)); - } else if (t.str.startsWith("rank") && t.str.matches("rank[=><]\\d+")) { - entity.replace(new Token(TokenType.RANK_TERM, t.str.substring(4), t.displayStr)); - } else if (t.str.startsWith("qs=")) { - entity.replace(new Token(TokenType.QS_TERM, t.str.substring(3), t.displayStr)); - } else if (t.str.contains(":")) { - entity.replace(new Token(TokenType.ADVICE_TERM, t.str, t.displayStr)); - } + entity.replace(new QueryToken.QuotTerm( + t.str().replaceAll("\\s+", WordPatterns.WORD_TOKEN_JOINER), + t.displayStr())); } - private static void handleAdvisoryTerms(TransformList.Entity entity) { + private static void trimLiterals(TransformList.Entity entity) { var t = entity.value(); - if (t.type == TokenType.LPAREN) { - entity.remove(); - } else if (t.type == TokenType.RPAREN) { - entity.remove(); - } else if (t.type == TokenType.LITERAL_TERM) { - entity.replace(new Token(TokenType.ADVICE_TERM, t.str, "(" + t.str + ")")); + + if (!(t instanceof QueryToken.LiteralTerm lt)) + return; + + String str = lt.str(); + if (str.isBlank()) + return; + + if (str.endsWith(":") || str.endsWith(".")) { + entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 1), lt.displayStr())); + } + + } + + private static void createNegatedTerms(TransformList.Entity first, TransformList.Entity second) { + var t = first.value(); + var tn = second.value(); + + if (!(t instanceof QueryToken.Minus)) + return; + if (!(tn instanceof QueryToken.LiteralTerm) && !(tn instanceof QueryToken.AdviceTerm)) + return; + + first.remove(); + + second.replace(new QueryToken.ExcludeTerm(tn.str(), "-" + tn.displayStr())); + } + + private static void createPriorityTerms(TransformList.Entity first, TransformList.Entity second) { + var t = first.value(); + var tn = second.value(); + + if (!(t instanceof QueryToken.QMark)) + return; + if (!(tn instanceof QueryToken.LiteralTerm) && !(tn instanceof QueryToken.AdviceTerm)) + return; + + var replacement = new QueryToken.PriorityTerm(tn.str(), "?" 
+ tn.displayStr()); + + first.remove(); + second.replace(replacement); + } + + private static void handleSpecialOperations(TransformList.Entity entity) { + var t = entity.value(); + if (!(t instanceof QueryToken.LiteralTerm)) { + return; + } + + String str = t.str(); + + if (str.startsWith("q") && str.matches("q[=><]\\d+")) { + entity.replace(new QueryToken.QualityTerm(str.substring(1))); + } else if (str.startsWith("near:")) { + entity.replace(new QueryToken.NearTerm(str.substring(5))); + } else if (str.startsWith("year") && str.matches("year[=><]\\d{4}")) { + entity.replace(new QueryToken.YearTerm(str.substring(4))); + } else if (str.startsWith("size") && str.matches("size[=><]\\d+")) { + entity.replace(new QueryToken.SizeTerm(str.substring(4))); + } else if (str.startsWith("rank") && str.matches("rank[=><]\\d+")) { + entity.replace(new QueryToken.RankTerm(str.substring(4))); + } else if (str.startsWith("qs=")) { + entity.replace(new QueryToken.QsTerm(str.substring(3))); + } else if (str.contains(":")) { + entity.replace(new QueryToken.AdviceTerm(str, t.displayStr())); } } + private static void handleAdvisoryTerms(TransformList.Entity entity) { + var t = entity.value(); + if (t instanceof QueryToken.LParen) { + entity.remove(); + } else if (t instanceof QueryToken.RParen) { + entity.remove(); + } else if (t instanceof QueryToken.LiteralTerm) { + entity.replace(new QueryToken.AdviceTerm(t.str(), "(" + t.displayStr() + ")")); + } + } } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java index b7b0a2b7..b12d68a9 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java @@ -1,7 +1,6 @@ package nu.marginalia.functions.searchquery.query_parser; -import nu.marginalia.functions.searchquery.query_parser.token.Token; -import nu.marginalia.functions.searchquery.query_parser.token.TokenType; +import nu.marginalia.functions.searchquery.query_parser.token.QueryToken; import nu.marginalia.language.encoding.AsciiFlattener; import java.util.ArrayList; @@ -11,8 +10,8 @@ import java.util.regex.Pattern; public class QueryTokenizer { private static final Pattern noisePattern = Pattern.compile("[,\\s]"); - public List tokenizeQuery(String rawQuery) { - List tokens = new ArrayList<>(); + public List tokenizeQuery(String rawQuery) { + List tokens = new ArrayList<>(); String query = AsciiFlattener.flattenUnicode(rawQuery); query = noisePattern.matcher(query).replaceAll(" "); @@ -21,26 +20,27 @@ public class QueryTokenizer { int chr = query.charAt(i); if ('(' == chr) { - tokens.add(new Token(TokenType.LPAREN, "(", "(")); + tokens.add(new QueryToken.LParen()); } else if (')' == chr) { - tokens.add(new Token(TokenType.RPAREN, ")", ")")); + tokens.add(new QueryToken.RParen()); } else if ('"' == chr) { int end = query.indexOf('"', i+1); + if (end == -1) { end = query.length(); } - tokens.add(new Token(TokenType.QUOT, - query.substring(i+1, end).toLowerCase(), - query.substring(i, Math.min(query.length(), end+1)))); + + tokens.add(new QueryToken.Quot(query.substring(i + 1, end).toLowerCase())); + i = end; } else if ('-' == chr) { - tokens.add(new Token(TokenType.MINUS, "-")); + tokens.add(new QueryToken.Minus()); } else if ('?' 
== chr) { - tokens.add(new Token(TokenType.QMARK, "?")); + tokens.add(new QueryToken.QMark()); } else if (Character.isSpaceChar(chr)) { // @@ -52,9 +52,12 @@ public class QueryTokenizer { if (query.charAt(end) == ' ' || query.charAt(end) == ')') break; } - tokens.add(new Token(TokenType.LITERAL_TERM, - query.substring(i, end).toLowerCase(), - query.substring(i, end))); + + String displayStr = query.substring(i, end); + String str = displayStr.toLowerCase(); + + tokens.add(new QueryToken.LiteralTerm(str, displayStr)); + i = end-1; } } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/QueryToken.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/QueryToken.java new file mode 100644 index 00000000..b11fe370 --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/QueryToken.java @@ -0,0 +1,86 @@ +package nu.marginalia.functions.searchquery.query_parser.token; + + +public sealed interface QueryToken { + String str(); + String displayStr(); + + record LiteralTerm(String str, String displayStr) implements QueryToken {} + record QuotTerm(String str, String displayStr) implements QueryToken {} + record ExcludeTerm(String str, String displayStr) implements QueryToken {} + record AdviceTerm(String str, String displayStr) implements QueryToken {} + record PriorityTerm(String str, String displayStr) implements QueryToken {} + + record QualityTerm(String str) implements QueryToken { + public String displayStr() { + return "q" + str; + } + } + record YearTerm(String str) implements QueryToken { + public String displayStr() { + return "year" + str; + } + } + record SizeTerm(String str) implements QueryToken { + public String displayStr() { + return "size" + str; + } + } + record RankTerm(String str) implements QueryToken { + public String displayStr() { + return "rank" + str; + } + } + record NearTerm(String str) implements QueryToken { + public String displayStr() { + return "near:" + str; + } + } + + record QsTerm(String str) implements QueryToken { + public String displayStr() { + return "qs" + str; + } + } + + record Quot(String str) implements QueryToken { + public String displayStr() { + return "\"" + str + "\""; + } + } + record Minus() implements QueryToken { + public String str() { + return "-"; + } + public String displayStr() { + return "-"; + } + } + record QMark() implements QueryToken { + public String str() { + return "?"; + } + public String displayStr() { + return "?"; + } + } + record LParen() implements QueryToken { + public String str() { + return "("; + } + public String displayStr() { + return "("; + } + } + record RParen() implements QueryToken { + public String str() { + return ")"; + } + public String displayStr() { + return ")"; + } + } + + record Ignore(String str, String displayStr) implements QueryToken {} + +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/Token.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/Token.java deleted file mode 100644 index 06c28972..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/Token.java +++ /dev/null @@ -1,49 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.token; - -import lombok.EqualsAndHashCode; -import lombok.ToString; -import lombok.With; - -@ToString -@EqualsAndHashCode -@With -public class Token { - public TokenType type; 
- public String str; - public final String displayStr; - - public Token(TokenType type, String str, String displayStr) { - this.type = type; - this.str = str; - this.displayStr = safeString(displayStr); - } - - - public Token(TokenType type, String str) { - this.type = type; - this.str = str; - this.displayStr = safeString(str); - } - - private static String safeString(String s) { - return s.replaceAll("<", "<") - .replaceAll(">", ">"); - } - - public void visit(TokenVisitor visitor) { - switch (type) { - case QUOT_TERM: visitor.onQuotTerm(this); break; - case EXCLUDE_TERM: visitor.onExcludeTerm(this); break; - case PRIORTY_TERM: visitor.onPriorityTerm(this); break; - case ADVICE_TERM: visitor.onAdviceTerm(this); break; - case LITERAL_TERM: visitor.onLiteralTerm(this); break; - - case YEAR_TERM: visitor.onYearTerm(this); break; - case RANK_TERM: visitor.onRankTerm(this); break; - case SIZE_TERM: visitor.onSizeTerm(this); break; - case QS_TERM: visitor.onQsTerm(this); break; - - case QUALITY_TERM: visitor.onQualityTerm(this); break; - } - } -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenType.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenType.java deleted file mode 100644 index 85d55c35..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenType.java +++ /dev/null @@ -1,34 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.token; - -import java.util.function.Predicate; - -public enum TokenType implements Predicate { - TERM, - - - LITERAL_TERM, - QUOT_TERM, - EXCLUDE_TERM, - ADVICE_TERM, - PRIORTY_TERM, - - QUALITY_TERM, - YEAR_TERM, - SIZE_TERM, - RANK_TERM, - NEAR_TERM, - - QS_TERM, - - QUOT, - MINUS, - QMARK, - LPAREN, - RPAREN, - - IGNORE; - - public boolean test(Token t) { - return t.type == this; - } -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenVisitor.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenVisitor.java deleted file mode 100644 index 2e14f837..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenVisitor.java +++ /dev/null @@ -1,14 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.token; - -public interface TokenVisitor { - void onLiteralTerm(Token token); - void onQuotTerm(Token token); - void onExcludeTerm(Token token); - void onPriorityTerm(Token token); - void onAdviceTerm(Token token); - void onYearTerm(Token token); - void onSizeTerm(Token token); - void onRankTerm(Token token); - void onQualityTerm(Token token); - void onQsTerm(Token token); -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java index 55467b4f..26af1bf4 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java @@ -6,18 +6,19 @@ import nu.marginalia.api.searchquery.model.query.SearchSpecification; import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.functions.searchquery.query_parser.QueryExpansion; +import 
nu.marginalia.functions.searchquery.query_parser.token.QueryToken; +import nu.marginalia.index.query.limit.QueryStrategy; +import nu.marginalia.index.query.limit.SpecificationLimit; import nu.marginalia.language.WordPatterns; import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.query.ProcessedQuery; import nu.marginalia.functions.searchquery.query_parser.QueryParser; -import nu.marginalia.functions.searchquery.query_parser.token.Token; -import nu.marginalia.functions.searchquery.query_parser.token.TokenType; +import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; import java.util.List; @Singleton @@ -46,31 +47,89 @@ public class QueryFactory { List searchTermsHuman = new ArrayList<>(); List problems = new ArrayList<>(); - List basicQuery = queryParser.parse(query); + List basicQuery = queryParser.parse(query); if (basicQuery.size() >= 12) { problems.add("Your search query is too long"); basicQuery.clear(); } + List searchTermsExclude = new ArrayList<>(); + List searchTermsInclude = new ArrayList<>(); + List searchTermsAdvice = new ArrayList<>(); + List searchTermsPriority = new ArrayList<>(); + List> searchTermCoherences = new ArrayList<>(); - QueryLimitsAccumulator qualityLimits = new QueryLimitsAccumulator(params); + SpecificationLimit qualityLimit = SpecificationLimit.none(); + SpecificationLimit year = SpecificationLimit.none(); + SpecificationLimit size = SpecificationLimit.none(); + SpecificationLimit rank = SpecificationLimit.none(); + QueryStrategy queryStrategy = QueryStrategy.AUTO; - for (Token t : basicQuery) { - if (t.type == TokenType.QUOT_TERM || t.type == TokenType.LITERAL_TERM) { - if (t.str.startsWith("site:")) { - t.str = normalizeDomainName(t.str); + String domain = null; + + System.out.println(basicQuery); + + for (QueryToken t : basicQuery) { + switch (t) { + case QueryToken.QuotTerm(String str, String displayStr) -> { + analyzeSearchTerm(problems, str, displayStr); + searchTermsHuman.addAll(Arrays.asList(displayStr.replace("\"", "").split("\\s+"))); + + String[] parts = StringUtils.split(str, '_'); + + // Checking for stop words here is a bit of a stop-gap to fix the issue of stop words being + // required in the query (which is a problem because they are not indexed). How to do this + // in a clean way is a bit of an open problem that may not get resolved until query-parsing is + // improved. 
+ + if (parts.length > 1 && !anyPartIsStopWord(parts)) { + // Prefer that the actual n-gram is present + searchTermsAdvice.add(str); + + // Require that the terms appear in the same sentence + searchTermCoherences.add(Arrays.asList(parts)); + + // Require that each term exists in the document + // (needed for ranking) + searchTermsInclude.addAll(Arrays.asList(parts)); + } + else { + searchTermsInclude.add(str); + } + } + case QueryToken.LiteralTerm(String str, String displayStr) -> { + analyzeSearchTerm(problems, str, displayStr); + searchTermsHuman.addAll(Arrays.asList(displayStr.split("\\s+"))); + + searchTermsInclude.add(str); } - searchTermsHuman.addAll(toHumanSearchTerms(t)); - analyzeSearchTerm(problems, t); - } - t.visit(qualityLimits); + case QueryToken.ExcludeTerm(String str, String displayStr) -> searchTermsExclude.add(str); + case QueryToken.PriorityTerm(String str, String displayStr) -> searchTermsPriority.add(str); + case QueryToken.AdviceTerm(String str, String displayStr) -> { + searchTermsAdvice.add(str); + + if (str.toLowerCase().startsWith("site:")) { + domain = str.substring("site:".length()); + } + } + + case QueryToken.YearTerm(String str) -> year = parseSpecificationLimit(str); + case QueryToken.SizeTerm(String str) -> size = parseSpecificationLimit(str); + case QueryToken.RankTerm(String str) -> rank = parseSpecificationLimit(str); + case QueryToken.QualityTerm(String str) -> qualityLimit = parseSpecificationLimit(str); + case QueryToken.QsTerm(String str) -> queryStrategy = parseQueryStrategy(str); + + default -> {} + } } - QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(basicQuery); - String domain = termsAccumulator.domain; + if (searchTermsInclude.isEmpty() && !searchTermsAdvice.isEmpty()) { + searchTermsInclude.addAll(searchTermsAdvice); + searchTermsAdvice.clear(); + } List domainIds = params.domainIds(); @@ -80,29 +139,29 @@ public class QueryFactory { limits = limits.forSingleDomain(); } + var searchQuery = new SearchQuery( + queryExpansion.expandQuery( + searchTermsInclude + ), + searchTermsInclude, + searchTermsExclude, + searchTermsAdvice, + searchTermsPriority, + searchTermCoherences + ); + var specsBuilder = SearchSpecification.builder() - .query( - new SearchQuery( - queryExpansion.expandQuery( - termsAccumulator.searchTermsInclude - ), - termsAccumulator.searchTermsInclude, - termsAccumulator.searchTermsExclude, - termsAccumulator.searchTermsAdvice, - termsAccumulator.searchTermsPriority, - termsAccumulator.searchTermCoherences - ) - ) + .query(searchQuery) .humanQuery(query) - .quality(qualityLimits.qualityLimit) - .year(qualityLimits.year) - .size(qualityLimits.size) - .rank(qualityLimits.rank) + .quality(qualityLimit) + .year(year) + .size(size) + .rank(rank) .domains(domainIds) .queryLimits(limits) .searchSetIdentifier(params.identifier()) .rankingParams(ResultRankingParameters.sensibleDefaults()) - .queryStrategy(qualityLimits.queryStrategy); + .queryStrategy(queryStrategy); SearchSpecification specs = specsBuilder.build(); @@ -113,30 +172,52 @@ public class QueryFactory { return new ProcessedQuery(specs, searchTermsHuman, domain); } - private String normalizeDomainName(String str) { - return str.toLowerCase(); - } - - private List toHumanSearchTerms(Token t) { - if (t.type == TokenType.LITERAL_TERM) { - return Arrays.asList(t.displayStr.split("\\s+")); - } - else if (t.type == TokenType.QUOT_TERM) { - return Arrays.asList(t.displayStr.replace("\"", "").split("\\s+")); - - } - return Collections.emptyList(); - } - - 
private void analyzeSearchTerm(List problems, Token term) { - final String word = term.str; + private void analyzeSearchTerm(List problems, String str, String displayStr) { + final String word = str; if (word.length() < WordPatterns.MIN_WORD_LENGTH) { - problems.add("Search term \"" + term.displayStr + "\" too short"); + problems.add("Search term \"" + displayStr + "\" too short"); } if (!word.contains("_") && word.length() >= WordPatterns.MAX_WORD_LENGTH) { - problems.add("Search term \"" + term.displayStr + "\" too long"); + problems.add("Search term \"" + displayStr + "\" too long"); + } + } + private SpecificationLimit parseSpecificationLimit(String str) { + int startChar = str.charAt(0); + + int val = Integer.parseInt(str.substring(1)); + if (startChar == '=') { + return SpecificationLimit.equals(val); + } else if (startChar == '<') { + return SpecificationLimit.lessThan(val); + } else if (startChar == '>') { + return SpecificationLimit.greaterThan(val); + } else { + return SpecificationLimit.none(); } } + private QueryStrategy parseQueryStrategy(String str) { + return switch (str.toUpperCase()) { + case "RF_TITLE" -> QueryStrategy.REQUIRE_FIELD_TITLE; + case "RF_SUBJECT" -> QueryStrategy.REQUIRE_FIELD_SUBJECT; + case "RF_SITE" -> QueryStrategy.REQUIRE_FIELD_SITE; + case "RF_URL" -> QueryStrategy.REQUIRE_FIELD_URL; + case "RF_DOMAIN" -> QueryStrategy.REQUIRE_FIELD_DOMAIN; + case "RF_LINK" -> QueryStrategy.REQUIRE_FIELD_LINK; + case "SENTENCE" -> QueryStrategy.SENTENCE; + case "TOPIC" -> QueryStrategy.TOPIC; + default -> QueryStrategy.AUTO; + }; + } + + + private boolean anyPartIsStopWord(String[] parts) { + for (String part : parts) { + if (WordPatterns.isStopWord(part)) { + return true; + } + } + return false; + } } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryLimitsAccumulator.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryLimitsAccumulator.java deleted file mode 100644 index 1b49bab3..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryLimitsAccumulator.java +++ /dev/null @@ -1,93 +0,0 @@ -package nu.marginalia.functions.searchquery.svc; - -import nu.marginalia.api.searchquery.model.query.QueryParams; -import nu.marginalia.index.query.limit.QueryStrategy; -import nu.marginalia.index.query.limit.SpecificationLimit; -import nu.marginalia.functions.searchquery.query_parser.token.Token; -import nu.marginalia.functions.searchquery.query_parser.token.TokenVisitor; - -public class QueryLimitsAccumulator implements TokenVisitor { - public SpecificationLimit qualityLimit; - public SpecificationLimit year; - public SpecificationLimit size; - public SpecificationLimit rank; - - public QueryStrategy queryStrategy = QueryStrategy.AUTO; - - public QueryLimitsAccumulator(QueryParams params) { - qualityLimit = params.quality(); - year = params.year(); - size = params.size(); - rank = params.rank(); - } - - private SpecificationLimit parseSpecificationLimit(String str) { - int startChar = str.charAt(0); - - int val = Integer.parseInt(str.substring(1)); - if (startChar == '=') { - return SpecificationLimit.equals(val); - } else if (startChar == '<') { - return SpecificationLimit.lessThan(val); - } else if (startChar == '>') { - return SpecificationLimit.greaterThan(val); - } else { - return SpecificationLimit.none(); - } - } - - private QueryStrategy parseQueryStrategy(String str) { - return switch (str.toUpperCase()) { - case "RF_TITLE" -> QueryStrategy.REQUIRE_FIELD_TITLE; - 
case "RF_SUBJECT" -> QueryStrategy.REQUIRE_FIELD_SUBJECT; - case "RF_SITE" -> QueryStrategy.REQUIRE_FIELD_SITE; - case "RF_URL" -> QueryStrategy.REQUIRE_FIELD_URL; - case "RF_DOMAIN" -> QueryStrategy.REQUIRE_FIELD_DOMAIN; - case "RF_LINK" -> QueryStrategy.REQUIRE_FIELD_LINK; - case "SENTENCE" -> QueryStrategy.SENTENCE; - case "TOPIC" -> QueryStrategy.TOPIC; - default -> QueryStrategy.AUTO; - }; - } - - @Override - public void onYearTerm(Token token) { - year = parseSpecificationLimit(token.str); - } - - @Override - public void onSizeTerm(Token token) { - size = parseSpecificationLimit(token.str); - } - - @Override - public void onRankTerm(Token token) { - rank = parseSpecificationLimit(token.str); - } - - @Override - public void onQualityTerm(Token token) { - qualityLimit = parseSpecificationLimit(token.str); - } - - @Override - public void onQsTerm(Token token) { - queryStrategy = parseQueryStrategy(token.str); - } - - - @Override - public void onLiteralTerm(Token token) {} - - @Override - public void onQuotTerm(Token token) {} - - @Override - public void onExcludeTerm(Token token) {} - - @Override - public void onPriorityTerm(Token token) {} - - @Override - public void onAdviceTerm(Token token) {} -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java deleted file mode 100644 index cc3a7e56..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java +++ /dev/null @@ -1,105 +0,0 @@ -package nu.marginalia.functions.searchquery.svc; - -import nu.marginalia.api.searchquery.model.query.SearchQuery; -import nu.marginalia.language.WordPatterns; -import nu.marginalia.functions.searchquery.query_parser.token.Token; -import nu.marginalia.functions.searchquery.query_parser.token.TokenVisitor; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -/** @see SearchQuery */ -public class QuerySearchTermsAccumulator implements TokenVisitor { - public List searchTermsExclude = new ArrayList<>(); - public List searchTermsInclude = new ArrayList<>(); - public List searchTermsAdvice = new ArrayList<>(); - public List searchTermsPriority = new ArrayList<>(); - public List> searchTermCoherences = new ArrayList<>(); - - public String domain; - - public QuerySearchTermsAccumulator(List parts) { - for (Token t : parts) { - t.visit(this); - } - - if (searchTermsInclude.isEmpty() && !searchTermsAdvice.isEmpty()) { - searchTermsInclude.addAll(searchTermsAdvice); - searchTermsAdvice.clear(); - } - - } - - @Override - public void onLiteralTerm(Token token) { - searchTermsInclude.add(token.str); - } - - @Override - public void onQuotTerm(Token token) { - String[] parts = token.str.split("_"); - - // HACK (2023-05-02 vlofgren) - // - // Checking for stop words here is a bit of a stop-gap to fix the issue of stop words being - // required in the query (which is a problem because they are not indexed). How to do this - // in a clean way is a bit of an open problem that may not get resolved until query-parsing is - // improved. 
- - if (parts.length > 1 && !anyPartIsStopWord(parts)) { - // Prefer that the actual n-gram is present - searchTermsAdvice.add(token.str); - - // Require that the terms appear in the same sentence - searchTermCoherences.add(Arrays.asList(parts)); - - // Require that each term exists in the document - // (needed for ranking) - searchTermsInclude.addAll(Arrays.asList(parts)); - } - else { - searchTermsInclude.add(token.str); - - } - } - - private boolean anyPartIsStopWord(String[] parts) { - for (String part : parts) { - if (WordPatterns.isStopWord(part)) { - return true; - } - } - return false; - } - - @Override - public void onExcludeTerm(Token token) { - searchTermsExclude.add(token.str); - } - - @Override - public void onPriorityTerm(Token token) { - searchTermsPriority.add(token.str); - } - - @Override - public void onAdviceTerm(Token token) { - searchTermsAdvice.add(token.str); - - if (token.str.toLowerCase().startsWith("site:")) { - domain = token.str.substring("site:".length()); - } - } - - @Override - public void onYearTerm(Token token) {} - @Override - public void onSizeTerm(Token token) {} - @Override - public void onRankTerm(Token token) {} - @Override - public void onQualityTerm(Token token) {} - @Override - public void onQsTerm(Token token) {} -} diff --git a/code/functions/search-query/java/nu/marginalia/util/transform_list/TransformList.java b/code/functions/search-query/java/nu/marginalia/util/transform_list/TransformList.java index 08bc428e..62dd2e0a 100644 --- a/code/functions/search-query/java/nu/marginalia/util/transform_list/TransformList.java +++ b/code/functions/search-query/java/nu/marginalia/util/transform_list/TransformList.java @@ -80,6 +80,15 @@ public class TransformList { iter.remove(); } } + else if (firstEntity.action == Action.NO_OP) { + if (secondEntry.action == Action.REPLACE) { + backingList.set(iter.nextIndex(), secondEntry.value); + } + else if (secondEntry.action == Action.REMOVE) { + iter.next(); + iter.remove(); + } + } } } From b7d9a7ae89d7d606bc219f429af06fd12cb7f5f9 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 11 Apr 2024 18:12:01 +0200 Subject: [PATCH 32/47] (ngrams) Remove the vestigial logic for capturing permutations of n-grams The change also reduces the object churn in NGramLexicon, as this is a very hot method in the converter. 
--- .../task/ExportSegmentationModelActor.java | 4 +- .../query_parser/QueryExpansion.java | 9 +- .../segmentation/NgramExporterMain.java | 46 --------- .../segmentation/NgramExtractorMain.java | 7 +- .../marginalia/segmentation/NgramLexicon.java | 96 +++++++++---------- .../segmentation/NgramLexiconTest.java | 18 +--- 6 files changed, 60 insertions(+), 120 deletions(-) delete mode 100644 code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExporterMain.java diff --git a/code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java b/code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java index 90baf009..98cf114e 100644 --- a/code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java +++ b/code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java @@ -21,6 +21,7 @@ public class ExportSegmentationModelActor extends RecordActorPrototype { private final Logger logger = LoggerFactory.getLogger(getClass()); public record Export(String zimFile) implements ActorStep {} + @Override public ActorStep transition(ActorStep self) throws Exception { return switch(self) { @@ -29,9 +30,8 @@ public class ExportSegmentationModelActor extends RecordActorPrototype { var storage = storageService.allocateStorage(FileStorageType.EXPORT, "segmentation-model", "Segmentation Model Export " + LocalDateTime.now()); Path countsFile = storage.asPath().resolve("ngram-counts.bin"); - Path permutationsFile = storage.asPath().resolve("ngram-permutations.bin"); - NgramExtractorMain.dumpCounts(Path.of(zimFile), countsFile, permutationsFile); + NgramExtractorMain.dumpCounts(Path.of(zimFile), countsFile); yield new End(); } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java index 052516d8..9c9d81fa 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java @@ -112,10 +112,15 @@ public class QueryExpansion { // Look for known segments within the query for (int length = 2; length < Math.min(10, words.length); length++) { - for (var segment : lexicon.findSegments(length, words)) { + for (var segment : lexicon.findSegmentOffsets(length, words)) { + int start = segment.start(); int end = segment.start() + segment.length(); - var word = IntStream.range(start, end).mapToObj(nodes::get).map(QWord::word).collect(Collectors.joining("_")); + + var word = IntStream.range(start, end) + .mapToObj(nodes::get) + .map(QWord::word) + .collect(Collectors.joining("_")); graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word); } diff --git a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExporterMain.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExporterMain.java deleted file mode 100644 index ee6d2cd5..00000000 --- a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExporterMain.java +++ /dev/null @@ -1,46 +0,0 @@ -package nu.marginalia.segmentation; - -import nu.marginalia.LanguageModels; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.Arrays; -import java.util.Scanner; - -public class NgramExporterMain { - - public static void main(String... 
args) throws IOException { - trial(); - } - - static void trial() throws IOException { - NgramLexicon lexicon = new NgramLexicon( - LanguageModels.builder() - .segments(Path.of("/home/vlofgren/ngram-counts.bin")) - .build() - ); - - System.out.println("Loaded!"); - - var scanner = new Scanner(System.in); - for (;;) { - System.out.println("Enter a sentence: "); - String line = scanner.nextLine(); - System.out.println("."); - if (line == null) - break; - - String[] terms = BasicSentenceExtractor.getStemmedParts(line); - System.out.println("."); - - for (int i = 2; i< 8; i++) { - lexicon.findSegments(i, terms).forEach(p -> { - System.out.println(STR."\{Arrays.toString(p.project(terms))}: \{p.count()}"); - }); - } - - } - } - - -} diff --git a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java index 577aee6e..3f29c74c 100644 --- a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java +++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java @@ -115,8 +115,7 @@ public class NgramExtractorMain { } public static void dumpCounts(Path zimInputFile, - Path countsOutputFile, - Path permutationsOutputFile + Path countsOutputFile ) throws IOException, InterruptedException { ZIMReader reader = new ZIMReader(new ZIMFile(zimInputFile.toString())); @@ -143,9 +142,6 @@ public class NgramExtractorMain { for (var hash : orderedHashes) { lexicon.incOrdered(hash); } - for (var hash : unorderedHashes) { - lexicon.addUnordered(hash); - } } }); @@ -153,7 +149,6 @@ public class NgramExtractorMain { } lexicon.saveCounts(countsOutputFile); - lexicon.savePermutations(permutationsOutputFile); } } diff --git a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java index 91cee314..e7dc1017 100644 --- a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java +++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java @@ -21,10 +21,8 @@ import java.util.List; @Singleton public class NgramLexicon { private final Long2IntOpenCustomHashMap counts; - private final LongOpenHashSet permutations = new LongOpenHashSet(); private static final HasherGroup orderedHasher = HasherGroup.ordered(); - private static final HasherGroup unorderedHasher = HasherGroup.unordered(); @Inject public NgramLexicon(LanguageModels models) { @@ -48,16 +46,57 @@ public class NgramLexicon { } public List findSegmentsStrings(int minLength, int maxLength, String... parts) { - List segments = new ArrayList<>(); + List segments = new ArrayList<>(); for (int i = minLength; i <= maxLength; i++) { segments.addAll(findSegments(i, parts)); } - return segments.stream().map(seg -> seg.project(parts)).toList(); + return segments; } - public List findSegments(int length, String... parts) { + public List findSegments(int length, String... 
parts) { + // Don't look for ngrams longer than the sentence + if (parts.length < length) return List.of(); + + List positions = new ArrayList<>(); + + // Hash the parts + long[] hashes = new long[parts.length]; + for (int i = 0; i < hashes.length; i++) { + hashes[i] = HasherGroup.hash(parts[i]); + } + + long ordered = 0; + int i = 0; + + // Prepare by combining up to length hashes + for (; i < length; i++) { + ordered = orderedHasher.apply(ordered, hashes[i]); + } + + // Slide the window and look for matches + for (;; i++) { + int ct = counts.get(ordered); + + if (ct > 0) { + positions.add(Arrays.copyOfRange(parts, i - length, length)); + } + + if (i >= hashes.length) + break; + + // Remove the oldest hash and add the new one + ordered = orderedHasher.replace(ordered, + hashes[i], + hashes[i - length], + length); + } + + return positions; + } + + public List findSegmentOffsets(int length, String... parts) { // Don't look for ngrams longer than the sentence if (parts.length < length) return List.of(); @@ -70,13 +109,11 @@ public class NgramLexicon { } long ordered = 0; - long unordered = 0; int i = 0; // Prepare by combining up to length hashes for (; i < length; i++) { ordered = orderedHasher.apply(ordered, hashes[i]); - unordered = unorderedHasher.apply(unordered, hashes[i]); } // Slide the window and look for matches @@ -84,10 +121,7 @@ public class NgramLexicon { int ct = counts.get(ordered); if (ct > 0) { - positions.add(new SentenceSegment(i - length, length, ct, PositionType.NGRAM)); - } - else if (permutations.contains(unordered)) { - positions.add(new SentenceSegment(i - length, length, 0, PositionType.PERMUTATION)); + positions.add(new SentenceSegment(i - length, length, ct)); } if (i >= hashes.length) @@ -98,10 +132,6 @@ public class NgramLexicon { hashes[i], hashes[i - length], length); - unordered = unorderedHasher.replace(unordered, - hashes[i], - hashes[i - length], - length); } return positions; @@ -110,20 +140,6 @@ public class NgramLexicon { public void incOrdered(long hashOrdered) { counts.addTo(hashOrdered, 1); } - public void addUnordered(long hashUnordered) { - permutations.add(hashUnordered); - } - - - public void loadPermutations(Path path) throws IOException { - try (var dis = new DataInputStream(Files.newInputStream(path))) { - long size = dis.readInt(); - - for (int i = 0; i < size; i++) { - permutations.add(dis.readLong()); - } - } - } public void saveCounts(Path file) throws IOException { try (var dos = new DataOutputStream(Files.newOutputStream(file, @@ -142,37 +158,17 @@ public class NgramLexicon { }); } } - public void savePermutations(Path file) throws IOException { - try (var dos = new DataOutputStream(Files.newOutputStream(file, - StandardOpenOption.CREATE, - StandardOpenOption.TRUNCATE_EXISTING, - StandardOpenOption.WRITE))) { - dos.writeInt(counts.size()); - permutations.forEach(v -> { - try { - dos.writeLong(v); - } catch (IOException e) { - throw new RuntimeException(e); - } - }); - } - } public void clear() { - permutations.clear(); counts.clear(); } - public record SentenceSegment(int start, int length, int count, PositionType type) { + public record SentenceSegment(int start, int length, int count) { public String[] project(String... 
parts) { return Arrays.copyOfRange(parts, start, start + length); } } - enum PositionType { - NGRAM, PERMUTATION - } - private static class KeyIsAlreadyHashStrategy implements LongHash.Strategy { @Override public int hashCode(long l) { diff --git a/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java index d5065959..351ce869 100644 --- a/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java +++ b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java @@ -14,7 +14,6 @@ class NgramLexiconTest { void addNgram(String... ngram) { lexicon.incOrdered(HasherGroup.ordered().rollingHash(ngram)); - lexicon.addUnordered(HasherGroup.unordered().rollingHash(ngram)); } @Test @@ -26,25 +25,16 @@ class NgramLexiconTest { String[] sent = { "hello", "world", "rye", "bread" }; var segments = lexicon.findSegments(2, "hello", "world", "rye", "bread"); - assertEquals(3, segments.size()); + assertEquals(2, segments.size()); - for (int i = 0; i < 3; i++) { + for (int i = 0; i < 2; i++) { var segment = segments.get(i); switch (i) { case 0 -> { - assertArrayEquals(new String[]{"hello", "world"}, segment.project(sent)); - assertEquals(1, segment.count()); - assertEquals(NgramLexicon.PositionType.NGRAM, segment.type()); + assertArrayEquals(new String[]{"hello", "world"}, segment); } case 1 -> { - assertArrayEquals(new String[]{"world", "rye"}, segment.project(sent)); - assertEquals(0, segment.count()); - assertEquals(NgramLexicon.PositionType.PERMUTATION, segment.type()); - } - case 2 -> { - assertArrayEquals(new String[]{"rye", "bread"}, segment.project(sent)); - assertEquals(1, segment.count()); - assertEquals(NgramLexicon.PositionType.NGRAM, segment.type()); + assertArrayEquals(new String[]{"rye", "bread"}, segment); } } } From 65e3caf4028757c2869a97cd45fd59cee94ecb59 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 11 Apr 2024 18:50:21 +0200 Subject: [PATCH 33/47] (index) Clean up the code --- .../marginalia/index/ReverseIndexReader.java | 56 ++++++++++++------- .../index/index/CombinedIndexReader.java | 1 + .../index/index/IndexQueryBuilderImpl.java | 13 +---- .../index/index/QueryBranchWalker.java | 2 +- .../marginalia/index/index/StatefulIndex.java | 39 +++++++------ .../index/query/IndexQueryBuilder.java | 9 +-- 6 files changed, 64 insertions(+), 56 deletions(-) diff --git a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java index e37de80d..72feb7fd 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java +++ b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java @@ -68,8 +68,12 @@ public class ReverseIndexReader { } - long wordOffset(long wordId) { - long idx = wordsBTreeReader.findEntry(wordId); + /** Calculate the offset of the word in the documents. + * If the return-value is negative, the term does not exist + * in the index. 
+ */ + long wordOffset(long termId) { + long idx = wordsBTreeReader.findEntry(termId); if (idx < 0) return -1L; @@ -77,37 +81,43 @@ public class ReverseIndexReader { return words.get(wordsDataOffset + idx + 1); } - public EntrySource documents(long wordId) { + public EntrySource documents(long termId) { if (null == words) { logger.warn("Reverse index is not ready, dropping query"); return new EmptyEntrySource(); } - long offset = wordOffset(wordId); + long offset = wordOffset(termId); - if (offset < 0) return new EmptyEntrySource(); + if (offset < 0) // No documents + return new EmptyEntrySource(); - return new ReverseIndexEntrySource(name, createReaderNew(offset), 2, wordId); + return new ReverseIndexEntrySource(name, createReaderNew(offset), 2, termId); } - public QueryFilterStepIf also(long wordId) { - long offset = wordOffset(wordId); + /** Create a filter step requiring the specified termId to exist in the documents */ + public QueryFilterStepIf also(long termId) { + long offset = wordOffset(termId); - if (offset < 0) return new QueryFilterNoPass(); + if (offset < 0) // No documents + return new QueryFilterNoPass(); - return new ReverseIndexRetainFilter(createReaderNew(offset), name, wordId); + return new ReverseIndexRetainFilter(createReaderNew(offset), name, termId); } - public QueryFilterStepIf not(long wordId) { - long offset = wordOffset(wordId); + /** Create a filter step requiring the specified termId to be absent from the documents */ + public QueryFilterStepIf not(long termId) { + long offset = wordOffset(termId); - if (offset < 0) return new QueryFilterLetThrough(); + if (offset < 0) // No documents + return new QueryFilterLetThrough(); return new ReverseIndexRejectFilter(createReaderNew(offset)); } - public int numDocuments(long wordId) { - long offset = wordOffset(wordId); + /** Return the number of documents with the termId in the index */ + public int numDocuments(long termId) { + long offset = wordOffset(termId); if (offset < 0) return 0; @@ -115,15 +125,20 @@ public class ReverseIndexReader { return createReaderNew(offset).numEntries(); } + /** Create a BTreeReader for the document offset associated with a termId */ private BTreeReader createReaderNew(long offset) { - return new BTreeReader(documents, ReverseIndexParameters.docsBTreeContext, offset); + return new BTreeReader( + documents, + ReverseIndexParameters.docsBTreeContext, + offset); } - public long[] getTermMeta(long wordId, long[] docIds) { - long offset = wordOffset(wordId); + public long[] getTermMeta(long termId, long[] docIds) { + long offset = wordOffset(termId); if (offset < 0) { - logger.debug("Missing offset for word {}", wordId); + // This is likely a bug in the code, but we can't throw an exception here + logger.debug("Missing offset for word {}", termId); return new long[docIds.length]; } @@ -136,10 +151,9 @@ public class ReverseIndexReader { private boolean isUniqueAndSorted(long[] ids) { if (ids.length == 0) return true; - long prev = ids[0]; for (int i = 1; i < ids.length; i++) { - if(ids[i] <= prev) + if(ids[i] <= ids[i-1]) return false; } diff --git a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java index 3846bad8..27a631f5 100644 --- a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java +++ b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java @@ -41,6 +41,7 @@ public class CombinedIndexReader { public QueryFilterStepIf hasWordFull(long termId) { return 
reverseIndexFullReader.also(termId); } + public QueryFilterStepIf hasWordPrio(long termId) { return reverseIndexPriorityReader.also(termId); } diff --git a/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java b/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java index 33ca033e..0f63fdbc 100644 --- a/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java +++ b/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java @@ -36,7 +36,7 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder { return this; } - public IndexQueryBuilder alsoFull(long termId) { + public IndexQueryBuilder also(long termId) { if (alreadyConsideredTerms.add(termId)) { query.addInclusionFilter(reverseIndexFullReader.also(termId)); @@ -45,16 +45,7 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder { return this; } - public IndexQueryBuilder alsoPrio(long termId) { - - if (alreadyConsideredTerms.add(termId)) { - query.addInclusionFilter(reverseIndexPrioReader.also(termId)); - } - - return this; - } - - public IndexQueryBuilder notFull(long termId) { + public IndexQueryBuilder not(long termId) { query.addInclusionFilter(reverseIndexFullReader.not(termId)); diff --git a/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java b/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java index 34b04f0a..ffaa5176 100644 --- a/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java +++ b/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java @@ -75,7 +75,7 @@ public class QueryBranchWalker { // in practice only when an index doesn't contain all the search terms, so we can just // skip those paths if (!remainingPaths.isEmpty()) { - logger.info("Dropping: {}", remainingPaths); + logger.debug("Dropping: {}", remainingPaths); } return ret; diff --git a/code/index/java/nu/marginalia/index/index/StatefulIndex.java b/code/index/java/nu/marginalia/index/index/StatefulIndex.java index 273da2d0..ae7b1353 100644 --- a/code/index/java/nu/marginalia/index/index/StatefulIndex.java +++ b/code/index/java/nu/marginalia/index/index/StatefulIndex.java @@ -125,59 +125,65 @@ public class StatefulIndex { // the term is missing from the index and can never be found paths.removeIf(containsAll(termPriority).negate()); - List helpers = QueryBranchWalker.create(termPriority, paths); + List walkers = QueryBranchWalker.create(termPriority, paths); - for (var helper : helpers) { + for (var walker : walkers) { for (var builder : List.of( - combinedIndexReader.findPriorityWord(helper.termId), - combinedIndexReader.findFullWord(helper.termId) + combinedIndexReader.findPriorityWord(walker.termId), + combinedIndexReader.findFullWord(walker.termId) )) { queryHeads.add(builder); - if (helper.atEnd()) - continue; + if (walker.atEnd()) + continue; // Single term search query + // Add filter steps for the remaining combinations of terms List filterSteps = new ArrayList<>(); - for (var step : helper.next()) { + for (var step : walker.next()) { filterSteps.add(createFilter(step, 0)); } builder.addInclusionFilterAny(filterSteps); } } - List ret = new ArrayList<>(10); // Add additional conditions to the query heads for (var query : queryHeads) { // Advice terms are a special case, mandatory but not ranked, and exempt from re-writing for (long term : terms.advice()) { - query = query.alsoFull(term); + query = query.also(term); } for (long term : terms.excludes()) { - query = query.notFull(term); + query = query.not(term); } // Run these filter steps last, 
as they'll worst-case cause as many page faults as there are // items in the buffer - ret.add(query.addInclusionFilter(combinedIndexReader.filterForParams(params)).build()); + query.addInclusionFilter(combinedIndexReader.filterForParams(params)); } - - return ret; + return queryHeads + .stream() + .map(IndexQueryBuilder::build) + .toList(); } /** Recursively create a filter step based on the QBW and its children */ private QueryFilterStepIf createFilter(QueryBranchWalker walker, int depth) { + + // Create a filter for the current termId final QueryFilterStepIf ownFilterCondition = ownFilterCondition(walker, depth); var childSteps = walker.next(); - - if (childSteps.isEmpty()) + if (childSteps.isEmpty()) // no children, and so we're satisfied with just a single filter condition return ownFilterCondition; + // If there are children, we append the filter conditions for each child as an anyOf condition + // to the current filter condition + List combinedFilters = new ArrayList<>(); for (var step : childSteps) { @@ -186,6 +192,7 @@ public class StatefulIndex { combinedFilters.add(new QueryFilterAllOf(ownFilterCondition, childFilter)); } + // Flatten the filter conditions if there's only one branch if (combinedFilters.size() == 1) return combinedFilters.getFirst(); else @@ -196,7 +203,7 @@ public class StatefulIndex { private QueryFilterStepIf ownFilterCondition(QueryBranchWalker walker, int depth) { if (depth < 2) { // At shallow depths we prioritize terms that appear in the priority index, - // to increase the odds we find "good" results before the sand runs out + // to increase the odds we find "good" results before the execution timer runs out return new QueryFilterAnyOf( combinedIndexReader.hasWordPrio(walker.termId), combinedIndexReader.hasWordFull(walker.termId) diff --git a/code/index/query/java/nu/marginalia/index/query/IndexQueryBuilder.java b/code/index/query/java/nu/marginalia/index/query/IndexQueryBuilder.java index 74ebdea1..855309fa 100644 --- a/code/index/query/java/nu/marginalia/index/query/IndexQueryBuilder.java +++ b/code/index/query/java/nu/marginalia/index/query/IndexQueryBuilder.java @@ -11,16 +11,11 @@ import java.util.List; public interface IndexQueryBuilder { /** Filters documents that also contain termId, within the full index. */ - IndexQueryBuilder alsoFull(long termId); - - /** - * Filters documents that also contain the termId, within the priority index. 
- */ - IndexQueryBuilder alsoPrio(long termIds); + IndexQueryBuilder also(long termId); /** Excludes documents that contain termId, within the full index */ - IndexQueryBuilder notFull(long termId); + IndexQueryBuilder not(long termId); IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep); IndexQueryBuilder addInclusionFilterAny(List filterStep); From bb6b51ad91be7bc8164280469cef775f8eba557a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 12 Apr 2024 10:13:25 +0200 Subject: [PATCH 34/47] (ngram) Fix index range in NgramLexicon to an avoid exception --- .../java/nu/marginalia/segmentation/NgramLexicon.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java index e7dc1017..7a6beeb8 100644 --- a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java +++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java @@ -80,7 +80,7 @@ public class NgramLexicon { int ct = counts.get(ordered); if (ct > 0) { - positions.add(Arrays.copyOfRange(parts, i - length, length)); + positions.add(Arrays.copyOfRange(parts, i - length, i)); } if (i >= hashes.length) From 864d6c28e743bb9884e6215c615cb2983cf22d0a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 12 Apr 2024 17:44:14 +0200 Subject: [PATCH 35/47] (segmentation) Pick best segmentation using |s|^|s|-style normalization This is better than doing all segmentations possible at the same time. --- .../query_parser/QueryExpansion.java | 72 ++++++++++++++++--- .../query/svc/QueryFactoryTest.java | 17 +++++ 2 files changed, 78 insertions(+), 11 deletions(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java index 9c9d81fa..80d8c8f3 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java @@ -9,8 +9,7 @@ import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.apache.commons.lang3.StringUtils; -import java.util.ArrayList; -import java.util.List; +import java.util.*; import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -110,21 +109,72 @@ public class QueryExpansion { String[] words = nodes.stream().map(QWord::stemmed).toArray(String[]::new); - // Look for known segments within the query + // Grab all segments + + List allSegments = new ArrayList<>(); for (int length = 2; length < Math.min(10, words.length); length++) { - for (var segment : lexicon.findSegmentOffsets(length, words)) { + allSegments.addAll(lexicon.findSegmentOffsets(length, words)); + } + allSegments.sort(Comparator.comparing(NgramLexicon.SentenceSegment::start)); - int start = segment.start(); - int end = segment.start() + segment.length(); + if (allSegments.isEmpty()) { + return; + } - var word = IntStream.range(start, end) - .mapToObj(nodes::get) - .map(QWord::word) - .collect(Collectors.joining("_")); + Set bestSegmentation = + findBestSegmentation(allSegments); - graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word); + for (var segment : bestSegmentation) { + + int 
start = segment.start(); + int end = segment.start() + segment.length(); + + var word = IntStream.range(start, end) + .mapToObj(nodes::get) + .map(QWord::word) + .collect(Collectors.joining("_")); + + System.out.println(word); + + graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word); + } + + } + + private Set findBestSegmentation(List allSegments) { + Set bestSet = Set.of(); + double bestScore = Double.MIN_VALUE; + + for (int i = 0; i < allSegments.size(); i++) { + Set parts = new HashSet<>(); + parts.add(allSegments.get(i)); + + outer: + for (int j = i+1; j < allSegments.size(); j++) { + var candidate = allSegments.get(j); + for (var part : parts) { + if (part.overlaps(candidate)) { + continue outer; + } + } + parts.add(candidate); + } + + double score = 0.; + for (var part : parts) { + // |s|^|s|-normalization per M Hagen et al + double normFactor = Math.pow(part.count(), part.count()); + + score += normFactor * part.count(); + } + + if (bestScore < score) { + bestScore = score; + bestSet = parts; } } + + return bestSet; } public interface ExpansionStrategy { diff --git a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java index 132944c4..622130b7 100644 --- a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java +++ b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java @@ -16,6 +16,8 @@ import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.List; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -52,6 +54,21 @@ public class QueryFactoryTest { ResultRankingParameters.TemporalBias.NONE)).specs; } + + @Test + void qsec10() { + try (var lines = Files.lines(Path.of("/home/vlofgren/Exports/qsec10/webis-qsec-10-training-set/webis-qsec-10-training-set-queries.txt"))) { + lines.limit(1000).forEach(line -> { + String[] parts = line.split("\t"); + if (parts.length == 2) { + System.out.println(parseAndGetSpecs(parts[1]).getQuery().compiledQuery); + } + }); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + @Test public void testParseNoSpecials() { var year = parseAndGetSpecs("in the year 2000").year; From 6a670435370c66db6f649e7ff118023093d33943 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 12 Apr 2024 17:45:06 +0200 Subject: [PATCH 36/47] (ngram) Clean up ngram lexicon code This is both an optimization that removes some GC churn, as well as a clean-up of the code that removes references to outdated concepts. 
--- .../marginalia/segmentation/NgramLexicon.java | 82 +++++++++---------- .../segmentation/NgramLexiconTest.java | 5 +- 2 files changed, 44 insertions(+), 43 deletions(-) diff --git a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java index 7a6beeb8..5a82ab3e 100644 --- a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java +++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java @@ -4,7 +4,6 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import it.unimi.dsi.fastutil.longs.Long2IntOpenCustomHashMap; import it.unimi.dsi.fastutil.longs.LongHash; -import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import nu.marginalia.LanguageModels; import java.io.BufferedInputStream; @@ -45,55 +44,54 @@ public class NgramLexicon { counts = new Long2IntOpenCustomHashMap(100_000_000, new KeyIsAlreadyHashStrategy()); } - public List findSegmentsStrings(int minLength, int maxLength, String... parts) { + public List findSegmentsStrings(int minLength, + int maxLength, + String... parts) + { List segments = new ArrayList<>(); - for (int i = minLength; i <= maxLength; i++) { - segments.addAll(findSegments(i, parts)); - } - - return segments; - } - - public List findSegments(int length, String... parts) { - // Don't look for ngrams longer than the sentence - if (parts.length < length) return List.of(); - - List positions = new ArrayList<>(); - // Hash the parts long[] hashes = new long[parts.length]; for (int i = 0; i < hashes.length; i++) { hashes[i] = HasherGroup.hash(parts[i]); } - long ordered = 0; + for (int i = minLength; i <= maxLength; i++) { + findSegments(segments, i, parts, hashes); + } + + return segments; + } + + public void findSegments(List positions, + int length, + String[] parts, + long[] hashes) + { + // Don't look for ngrams longer than the sentence + if (parts.length < length) return; + + long hash = 0; int i = 0; // Prepare by combining up to length hashes for (; i < length; i++) { - ordered = orderedHasher.apply(ordered, hashes[i]); + hash = orderedHasher.apply(hash, hashes[i]); } // Slide the window and look for matches - for (;; i++) { - int ct = counts.get(ordered); - - if (ct > 0) { + for (;;) { + if (counts.get(hash) > 0) { positions.add(Arrays.copyOfRange(parts, i - length, i)); } - if (i >= hashes.length) + if (i < hashes.length) { + hash = orderedHasher.replace(hash, hashes[i], hashes[i - length], length); + i++; + } else { break; - - // Remove the oldest hash and add the new one - ordered = orderedHasher.replace(ordered, - hashes[i], - hashes[i - length], - length); + } } - - return positions; } public List findSegmentOffsets(int length, String... 
parts) { @@ -108,30 +106,28 @@ public class NgramLexicon { hashes[i] = HasherGroup.hash(parts[i]); } - long ordered = 0; + long hash = 0; int i = 0; // Prepare by combining up to length hashes for (; i < length; i++) { - ordered = orderedHasher.apply(ordered, hashes[i]); + hash = orderedHasher.apply(hash, hashes[i]); } // Slide the window and look for matches - for (;; i++) { - int ct = counts.get(ordered); + for (;;) { + int ct = counts.get(hash); if (ct > 0) { positions.add(new SentenceSegment(i - length, length, ct)); } - if (i >= hashes.length) + if (i < hashes.length) { + hash = orderedHasher.replace(hash, hashes[i], hashes[i - length], length); + i++; + } else { break; - - // Remove the oldest hash and add the new one - ordered = orderedHasher.replace(ordered, - hashes[i], - hashes[i - length], - length); + } } return positions; @@ -167,6 +163,10 @@ public class NgramLexicon { public String[] project(String... parts) { return Arrays.copyOfRange(parts, start, start + length); } + + public boolean overlaps(SentenceSegment other) { + return start < other.start + other.length && start + length > other.start; + } } private static class KeyIsAlreadyHashStrategy implements LongHash.Strategy { diff --git a/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java index 351ce869..f5068d07 100644 --- a/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java +++ b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java @@ -3,6 +3,8 @@ package nu.marginalia.segmentation; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import java.util.List; + import static org.junit.jupiter.api.Assertions.*; class NgramLexiconTest { @@ -22,8 +24,7 @@ class NgramLexiconTest { addNgram("rye", "bread"); addNgram("rye", "world"); - String[] sent = { "hello", "world", "rye", "bread" }; - var segments = lexicon.findSegments(2, "hello", "world", "rye", "bread"); + List segments = lexicon.findSegmentsStrings(2, 2, "hello", "world", "rye", "bread"); assertEquals(2, segments.size()); From ad4810d9918458673ed25ba57bb29893ed1f2939 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 12 Apr 2024 17:45:26 +0200 Subject: [PATCH 37/47] (query, minor) Remove debug statement --- .../nu/marginalia/functions/searchquery/svc/QueryFactory.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java index 26af1bf4..15596d5c 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java @@ -68,8 +68,6 @@ public class QueryFactory { String domain = null; - System.out.println(basicQuery); - for (QueryToken t : basicQuery) { switch (t) { case QueryToken.QuotTerm(String str, String displayStr) -> { From d729c400e5c79913e5a7e716e3235815813a065e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 12 Apr 2024 17:52:55 +0200 Subject: [PATCH 38/47] (query, minor) Remove debug statement --- .../functions/searchquery/query_parser/QueryExpansion.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java 
b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java index 80d8c8f3..efdaf328 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java @@ -134,8 +134,6 @@ public class QueryExpansion { .map(QWord::word) .collect(Collectors.joining("_")); - System.out.println(word); - graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word); } From 8a81a480a19cbdec6150e53b9a90e147fa0a3ae9 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 12 Apr 2024 18:08:31 +0200 Subject: [PATCH 39/47] (ngram) Only extract frequencies of title words, but use the body to increment the counters... The sign of the counter is used to indicate whether a term has appeared as title. Until it's seen in the title, it's provisionally saved as a negative count. --- .../segmentation/NgramExtractorMain.java | 69 ++++++++----------- .../marginalia/segmentation/NgramLexicon.java | 25 +++++-- 2 files changed, 48 insertions(+), 46 deletions(-) diff --git a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java index 3f29c74c..02e2a881 100644 --- a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java +++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java @@ -8,10 +8,7 @@ import org.openzim.ZIMTypes.ZIMFile; import org.openzim.ZIMTypes.ZIMReader; import java.io.IOException; -import java.io.PrintWriter; -import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.StandardOpenOption; import java.util.ArrayList; import java.util.List; import java.util.concurrent.Executors; @@ -22,14 +19,20 @@ public class NgramExtractorMain { public static void main(String... 
args) { } - private static List getNgramTerms(String title, Document document) { + private static List getNgramTitleTerms(String title) { List terms = new ArrayList<>(); // Add the title - if (title.contains(" ")) { + if (title.contains(" ")) { // Only add multi-word titles since we're chasing ngrams terms.add(title.toLowerCase()); } + return cleanTerms(terms); + } + + private static List getNgramBodyTerms(Document document) { + List terms = new ArrayList<>(); + // Grab all internal links document.select("a[href]").forEach(e -> { var href = e.attr("href"); @@ -54,6 +57,10 @@ public class NgramExtractorMain { terms.add(text); }); + return cleanTerms(terms); + } + + private static List cleanTerms(List terms) { // Trim the discovered terms terms.replaceAll(s -> { @@ -85,35 +92,6 @@ public class NgramExtractorMain { return terms; } - public static void dumpNgramsList( - Path zimFile, - Path ngramFile - ) throws IOException, InterruptedException { - ZIMReader reader = new ZIMReader(new ZIMFile(zimFile.toString())); - - PrintWriter printWriter = new PrintWriter(Files.newOutputStream(ngramFile, - StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE)); - - LongOpenHashSet known = new LongOpenHashSet(); - - try (var executor = Executors.newWorkStealingPool()) { - reader.forEachArticles((title, body) -> { - executor.submit(() -> { - var terms = getNgramTerms(title, Jsoup.parse(body)); - synchronized (known) { - for (String term : terms) { - if (known.add(hash.hashNearlyASCII(term))) { - printWriter.println(term); - } - } - } - }); - - }, p -> true); - } - printWriter.close(); - } - public static void dumpCounts(Path zimInputFile, Path countsOutputFile ) throws IOException, InterruptedException @@ -123,24 +101,31 @@ public class NgramExtractorMain { NgramLexicon lexicon = new NgramLexicon(); var orderedHasher = HasherGroup.ordered(); - var unorderedHasher = HasherGroup.unordered(); try (var executor = Executors.newWorkStealingPool()) { reader.forEachArticles((title, body) -> { executor.submit(() -> { - LongArrayList orderedHashes = new LongArrayList(); - LongArrayList unorderedHashes = new LongArrayList(); + LongArrayList orderedHashesTitle = new LongArrayList(); + LongArrayList orderedHashesBody = new LongArrayList(); - for (var sent : getNgramTerms(title, Jsoup.parse(body))) { + for (var sent : getNgramTitleTerms(title)) { String[] terms = BasicSentenceExtractor.getStemmedParts(sent); - orderedHashes.add(orderedHasher.rollingHash(terms)); - unorderedHashes.add(unorderedHasher.rollingHash(terms)); + orderedHashesTitle.add(orderedHasher.rollingHash(terms)); + } + + for (var sent : getNgramBodyTerms(Jsoup.parse(body))) { + String[] terms = BasicSentenceExtractor.getStemmedParts(sent); + + orderedHashesBody.add(orderedHasher.rollingHash(terms)); } synchronized (lexicon) { - for (var hash : orderedHashes) { - lexicon.incOrdered(hash); + for (var hash : orderedHashesTitle) { + lexicon.incOrderedTitle(hash); + } + for (var hash : orderedHashesBody) { + lexicon.incOrderedBody(hash); } } }); diff --git a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java index 5a82ab3e..e831e25b 100644 --- a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java +++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java @@ -42,6 +42,7 @@ public class NgramLexicon { public NgramLexicon() { counts = new 
Long2IntOpenCustomHashMap(100_000_000, new KeyIsAlreadyHashStrategy()); + counts.defaultReturnValue(0); } public List findSegmentsStrings(int minLength, @@ -133,8 +134,22 @@ public class NgramLexicon { return positions; } - public void incOrdered(long hashOrdered) { - counts.addTo(hashOrdered, 1); + public void incOrderedTitle(long hashOrdered) { + int value = counts.get(hashOrdered); + + if (value < 0) value = -value + 1; + else value ++; + + counts.put(hashOrdered, value); + } + + public void incOrderedBody(long hashOrdered) { + int value = counts.get(hashOrdered); + + if (value <= 0) value --; + else value ++; + + counts.put(hashOrdered, value); } public void saveCounts(Path file) throws IOException { @@ -146,8 +161,10 @@ public class NgramLexicon { counts.forEach((k, v) -> { try { - dos.writeLong(k); - dos.writeInt(v); + if (v > 0) { + dos.writeLong(k); + dos.writeInt(v); + } } catch (IOException e) { throw new RuntimeException(e); } From f06499213749bedf8cf11347c4dc62607e28c1e0 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 13 Apr 2024 17:07:23 +0200 Subject: [PATCH 40/47] (ngram) Use simple blocking pool instead of FJP; split on underscores in article names. --- .../term-frequency-dict/build.gradle | 1 + .../segmentation/NgramExtractorMain.java | 63 ++++++++++--------- 2 files changed, 36 insertions(+), 28 deletions(-) diff --git a/code/libraries/term-frequency-dict/build.gradle b/code/libraries/term-frequency-dict/build.gradle index 67fb44ae..3a9a4d8d 100644 --- a/code/libraries/term-frequency-dict/build.gradle +++ b/code/libraries/term-frequency-dict/build.gradle @@ -23,6 +23,7 @@ dependencies { implementation project(':code:common:config') implementation project(':code:libraries:easy-lsh') implementation project(':code:libraries:array') + implementation project(':code:libraries:blocking-thread-pool') implementation libs.bundles.slf4j implementation libs.notnull diff --git a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java index 02e2a881..270117da 100644 --- a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java +++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java @@ -2,6 +2,7 @@ package nu.marginalia.segmentation; import it.unimi.dsi.fastutil.longs.*; import nu.marginalia.hash.MurmurHash3_128; +import nu.marginalia.util.SimpleBlockingThreadPool; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.openzim.ZIMTypes.ZIMFile; @@ -11,12 +12,12 @@ import java.io.IOException; import java.nio.file.Path; import java.util.ArrayList; import java.util.List; -import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; public class NgramExtractorMain { - static MurmurHash3_128 hash = new MurmurHash3_128(); - - public static void main(String... args) { + public static void main(String... 
args) throws IOException, InterruptedException { + dumpCounts(Path.of("/home/vlofgren/Exports/wikipedia_en_all_nopic_2024-02.zim"), + Path.of("/tmp/ngram-counts.bin")); } private static List getNgramTitleTerms(String title) { @@ -102,36 +103,42 @@ public class NgramExtractorMain { var orderedHasher = HasherGroup.ordered(); - try (var executor = Executors.newWorkStealingPool()) { - reader.forEachArticles((title, body) -> { - executor.submit(() -> { - LongArrayList orderedHashesTitle = new LongArrayList(); - LongArrayList orderedHashesBody = new LongArrayList(); + var pool = new SimpleBlockingThreadPool("ngram-extractor", + Math.clamp(2, Runtime.getRuntime().availableProcessors(), 32), + 32 + ); - for (var sent : getNgramTitleTerms(title)) { - String[] terms = BasicSentenceExtractor.getStemmedParts(sent); + reader.forEachArticles((title, body) -> { + pool.submitQuietly(() -> { + LongArrayList orderedHashesTitle = new LongArrayList(); + LongArrayList orderedHashesBody = new LongArrayList(); - orderedHashesTitle.add(orderedHasher.rollingHash(terms)); + String normalizedTitle = title.replace('_', ' '); + + for (var sent : getNgramTitleTerms(normalizedTitle)) { + String[] terms = BasicSentenceExtractor.getStemmedParts(sent); + orderedHashesTitle.add(orderedHasher.rollingHash(terms)); + } + + for (var sent : getNgramBodyTerms(Jsoup.parse(body))) { + String[] terms = BasicSentenceExtractor.getStemmedParts(sent); + orderedHashesBody.add(orderedHasher.rollingHash(terms)); + } + + synchronized (lexicon) { + for (var hash : orderedHashesTitle) { + lexicon.incOrderedTitle(hash); } - - for (var sent : getNgramBodyTerms(Jsoup.parse(body))) { - String[] terms = BasicSentenceExtractor.getStemmedParts(sent); - - orderedHashesBody.add(orderedHasher.rollingHash(terms)); + for (var hash : orderedHashesBody) { + lexicon.incOrderedBody(hash); } + } + }); - synchronized (lexicon) { - for (var hash : orderedHashesTitle) { - lexicon.incOrderedTitle(hash); - } - for (var hash : orderedHashesBody) { - lexicon.incOrderedBody(hash); - } - } - }); + }, p -> true); - }, p -> true); - } + pool.shutDown(); + pool.awaitTermination(10, TimeUnit.DAYS); lexicon.saveCounts(countsOutputFile); } From 1329d4abd86f2e6475cbea05a8083c3dd0b41674 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 13 Apr 2024 17:51:02 +0200 Subject: [PATCH 41/47] (ngram) Correct size value in ngram lexicon generation, trim the terms better --- .../segmentation/NgramExtractorMain.java | 17 +++++++++---- .../marginalia/segmentation/NgramLexicon.java | 24 ++++++++++++++----- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java index 270117da..f6ba5b08 100644 --- a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java +++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java @@ -1,7 +1,6 @@ package nu.marginalia.segmentation; import it.unimi.dsi.fastutil.longs.*; -import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.util.SimpleBlockingThreadPool; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; @@ -16,8 +15,6 @@ import java.util.concurrent.TimeUnit; public class NgramExtractorMain { public static void main(String... 
args) throws IOException, InterruptedException { - dumpCounts(Path.of("/home/vlofgren/Exports/wikipedia_en_all_nopic_2024-02.zim"), - Path.of("/tmp/ngram-counts.bin")); } private static List getNgramTitleTerms(String title) { @@ -64,7 +61,6 @@ public class NgramExtractorMain { private static List cleanTerms(List terms) { // Trim the discovered terms terms.replaceAll(s -> { - // Remove trailing parentheses and their contents if (s.endsWith(")")) { int idx = s.lastIndexOf('('); @@ -73,6 +69,10 @@ public class NgramExtractorMain { } } + return s; + }); + + terms.replaceAll(s -> { // Remove leading "list of " if (s.startsWith("list of ")) { return s.substring("list of ".length()); @@ -81,6 +81,15 @@ public class NgramExtractorMain { return s; }); + terms.replaceAll(s -> { + // Remove trailing punctuation + if (s.endsWith(".") || s.endsWith(",") || s.endsWith(":") || s.endsWith(";")) { + return s.substring(0, s.length() - 1); + } + + return s; + }); + // Remove terms that are too short or too long terms.removeIf(s -> { if (!s.contains(" ")) diff --git a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java index e831e25b..9b59a84f 100644 --- a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java +++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java @@ -21,6 +21,7 @@ import java.util.List; public class NgramLexicon { private final Long2IntOpenCustomHashMap counts; + private int size; private static final HasherGroup orderedHasher = HasherGroup.ordered(); @Inject @@ -31,9 +32,15 @@ public class NgramLexicon { (int) size, new KeyIsAlreadyHashStrategy() ); + counts.defaultReturnValue(0); - for (int i = 0; i < size; i++) { - counts.put(dis.readLong(), dis.readInt()); + try { + for (int i = 0; i < size; i++) { + counts.put(dis.readLong(), dis.readInt()); + } + } + catch (IOException ex) { + ex.printStackTrace(); } } catch (IOException e) { throw new RuntimeException(e); @@ -137,8 +144,12 @@ public class NgramLexicon { public void incOrderedTitle(long hashOrdered) { int value = counts.get(hashOrdered); - if (value < 0) value = -value + 1; - else value ++; + if (value <= 0) { + size ++; + value = -value; + } + + value ++; counts.put(hashOrdered, value); } @@ -147,7 +158,7 @@ public class NgramLexicon { int value = counts.get(hashOrdered); if (value <= 0) value --; - else value ++; + else value++; counts.put(hashOrdered, value); } @@ -157,7 +168,8 @@ public class NgramLexicon { StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE))) { - dos.writeInt(counts.size()); + + dos.writeInt(size); counts.forEach((k, v) -> { try { From fda1c05164fd07d2fe6b4ac2dc66057e217aea70 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 13 Apr 2024 18:05:30 +0200 Subject: [PATCH 42/47] (ngram) Correct |s|^|s|-normalization to use length and not count --- .../functions/searchquery/query_parser/QueryExpansion.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java index efdaf328..d4e324fa 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java +++ 
b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java @@ -161,7 +161,7 @@ public class QueryExpansion { double score = 0.; for (var part : parts) { // |s|^|s|-normalization per M Hagen et al - double normFactor = Math.pow(part.count(), part.count()); + double normFactor = Math.pow(part.length(), part.length()); score += normFactor * part.count(); } From be55f3f937dc0c86e22b51fe03ff44514f0061aa Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 13 Apr 2024 19:33:47 +0200 Subject: [PATCH 43/47] (zim) Fix title extractor --- .../java/org/openzim/ZIMTypes/ZIMReader.java | 23 ++----------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/third-party/openzim/src/main/java/org/openzim/ZIMTypes/ZIMReader.java b/third-party/openzim/src/main/java/org/openzim/ZIMTypes/ZIMReader.java index e2fcaf6e..e9b5cf47 100644 --- a/third-party/openzim/src/main/java/org/openzim/ZIMTypes/ZIMReader.java +++ b/third-party/openzim/src/main/java/org/openzim/ZIMTypes/ZIMReader.java @@ -275,9 +275,7 @@ public class ZIMReader { } - - // Gives the minimum required information needed for the given articleName - public DirectoryEntry forEachTitles(Consumer aeConsumer, Consumer reConsumer) + public DirectoryEntry forEachTitles(Consumer titleConsumer) throws IOException { int numberOfArticles = mFile.getArticleCount(); @@ -287,26 +285,9 @@ public class ZIMReader { System.err.println(numberOfArticles); long start = System.currentTimeMillis(); - Map> data = new TreeMap<>(); - - System.err.println("Indexing"); - for (long i = beg; i < end; i+=4) { var entry = getDirectoryInfoAtTitlePosition(i); - - if (((i-beg)%100_000) == 0) { - System.err.printf("%f%%\n", ((i-beg) * 100.) / (end-beg)); - } - - if (entry.mimeType == targetMime && entry instanceof ArticleEntry) { - aeConsumer.accept((ArticleEntry) entry); - } - else if (entry.mimeType == 65535 && entry instanceof RedirectEntry) { - - reConsumer.accept((RedirectEntry) entry); - - } - + titleConsumer.accept(entry.title); } return null; From 52f0c0d33649ea848f2536320ddf2a4a6b27727d Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 13 Apr 2024 19:34:16 +0200 Subject: [PATCH 44/47] (ngram) Grab titles separately when extracting ngrams from wiki data --- .../segmentation/NgramExtractorMain.java | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java index f6ba5b08..b0eb6916 100644 --- a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java +++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java @@ -117,10 +117,9 @@ public class NgramExtractorMain { 32 ); - reader.forEachArticles((title, body) -> { + reader.forEachTitles((title) -> { pool.submitQuietly(() -> { LongArrayList orderedHashesTitle = new LongArrayList(); - LongArrayList orderedHashesBody = new LongArrayList(); String normalizedTitle = title.replace('_', ' '); @@ -128,6 +127,18 @@ public class NgramExtractorMain { String[] terms = BasicSentenceExtractor.getStemmedParts(sent); orderedHashesTitle.add(orderedHasher.rollingHash(terms)); } + synchronized (lexicon) { + for (var hash : orderedHashesTitle) { + lexicon.incOrderedTitle(hash); + } + } + }); + + }); + + reader.forEachArticles((title, body) -> { + pool.submitQuietly(() -> { + LongArrayList 
orderedHashesBody = new LongArrayList(); for (var sent : getNgramBodyTerms(Jsoup.parse(body))) { String[] terms = BasicSentenceExtractor.getStemmedParts(sent); @@ -135,9 +146,6 @@ public class NgramExtractorMain { } synchronized (lexicon) { - for (var hash : orderedHashesTitle) { - lexicon.incOrderedTitle(hash); - } for (var hash : orderedHashesBody) { lexicon.incOrderedBody(hash); } From b6d365bacd7715b1744a7c74ff082a0d3f8e64ee Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 15 Apr 2024 16:04:07 +0200 Subject: [PATCH 45/47] (index) Clean up data model The change set cleans up the data model for the term-level data. This used to contain a bunch of fields with document-level metadata. This data-duplication means a larger memory footprint and worse memory locality. The ranking code is also modified to not accept SearchResultKeywordScores, but rather CompiledQueryLong and CqDataInts containing only the term metadata and the frequency information needed for ranking. This is again an effort to improve memory locality. --- .../nu/marginalia/model/idx/WordFlags.java | 5 + .../api/searchquery/QueryProtobufCodec.java | 6 +- .../model/compiled/CompiledQuery.java | 4 + .../model/compiled/CompiledQueryInt.java | 44 ++++++ .../model/compiled/CompiledQueryLong.java | 8 +- .../searchquery/model/compiled/CqData.java | 11 +- .../searchquery/model/compiled/CqDataInt.java | 31 +++++ .../aggregate/CompiledQueryAggregates.java | 17 ++- .../aggregate/CqBooleanAggregate.java | 6 + .../aggregate/CqDoubleSumOperator.java | 6 + .../aggregate/CqIntMaxMinOperator.java | 6 + .../aggregate/CqLongBitmaskOperator.java | 5 + .../aggregate/CqPositionsOperator.java | 6 + .../model/results/ResultRankingContext.java | 30 ++--- .../model/results/SearchResultItem.java | 11 +- .../results/SearchResultKeywordScore.java | 39 +----- .../api/src/main/protobuf/query-api.proto | 8 +- .../nu/marginalia/index/IndexGrpcService.java | 27 ++-- .../results/IndexResultValuationContext.java | 58 ++++---- .../results/IndexResultValuatorService.java | 38 ++++-- .../ranking/results/ResultValuator.java | 32 ++--- .../ranking/results/factors/Bm25Factor.java | 113 ---------------- .../results/factors/Bm25FullGraphVisitor.java | 81 +++++++++++ .../results/factors/Bm25PrioGraphVisitor.java | 127 ++++++++++++++++++ .../results/factors/TermCoherenceFactor.java | 8 +- ...IndexQueryServiceIntegrationSmokeTest.java | 4 +- .../IndexResultDomainDeduplicatorTest.java | 2 +- .../ranking/results/ResultValuatorTest.java | 49 +++---- .../factors/TermCoherenceFactorTest.java | 19 ++- .../segmentation/NgramLexiconTest.java | 2 +- .../search/model/ClusteredUrlDetails.java | 2 +- 31 files changed, 520 insertions(+), 285 deletions(-) create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryInt.java create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataInt.java delete mode 100644 code/index/java/nu/marginalia/ranking/results/factors/Bm25Factor.java create mode 100644 code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java create mode 100644 code/index/java/nu/marginalia/ranking/results/factors/Bm25PrioGraphVisitor.java diff --git a/code/common/model/java/nu/marginalia/model/idx/WordFlags.java b/code/common/model/java/nu/marginalia/model/idx/WordFlags.java index dc627715..db54df77 100644 --- a/code/common/model/java/nu/marginalia/model/idx/WordFlags.java +++ b/code/common/model/java/nu/marginalia/model/idx/WordFlags.java @@ 
-50,6 +50,10 @@ public enum WordFlags { return (asBit() & value) > 0; } + public boolean isAbsent(long value) { + return (asBit() & value) == 0; + } + public static EnumSet decode(long encodedValue) { EnumSet ret = EnumSet.noneOf(WordFlags.class); @@ -61,4 +65,5 @@ public enum WordFlags { return ret; } + } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java index b705917e..5a43df1b 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java @@ -134,6 +134,8 @@ public class QueryProtobufCodec { return new SearchResultItem( rawItem.getCombinedId(), + rawItem.getEncodedDocMetadata(), + rawItem.getHtmlFeatures(), keywordScores, rawItem.getResultsFromDomain(), Double.NaN // Not set @@ -144,9 +146,7 @@ public class QueryProtobufCodec { return new SearchResultKeywordScore( keywordScores.getKeyword(), -1, // termId is internal to index service - keywordScores.getEncodedWordMetadata(), - keywordScores.getEncodedDocMetadata(), - keywordScores.getHtmlFeatures() + keywordScores.getEncodedWordMetadata() ); } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java index 3ae850a3..356a1d86 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java @@ -46,6 +46,10 @@ public class CompiledQuery implements Iterable { return new CompiledQueryLong(root, data.mapToLong(mapper)); } + public CompiledQueryLong mapToInt(ToIntFunction mapper) { + return new CompiledQueryLong(root, data.mapToInt(mapper)); + } + public CqExpression root() { return root; } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryInt.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryInt.java new file mode 100644 index 00000000..9e26c35c --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryInt.java @@ -0,0 +1,44 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import java.util.stream.IntStream; + + +/** A compiled index service query */ +public class CompiledQueryInt { + private final CqExpression root; + private final CqDataInt data; + + public CompiledQueryInt(CqExpression root, CqDataInt data) { + this.root = root; + this.data = data; + } + + + public CqExpression root() { + return root; + } + + public IntStream stream() { + return data.stream(); + } + + public IntStream indices() { + return IntStream.range(0, data.size()); + } + + public long at(int index) { + return data.get(index); + } + + public int[] copyData() { + return data.copyData(); + } + + public boolean isEmpty() { + return data.size() == 0; + } + + public int size() { + return data.size(); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java index 94fa0e8b..718aaca7 100644 --- 
a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java @@ -9,8 +9,8 @@ import java.util.stream.LongStream; /** A compiled index service query */ public class CompiledQueryLong implements Iterable { - private final CqExpression root; - private final CqDataLong data; + public final CqExpression root; + public final CqDataLong data; public CompiledQueryLong(CqExpression root, CqDataLong data) { this.root = root; @@ -47,4 +47,8 @@ public class CompiledQueryLong implements Iterable { public boolean isEmpty() { return data.size() == 0; } + + public int size() { + return data.size(); + } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java index b1565dc0..145f3f0f 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java @@ -3,7 +3,7 @@ package nu.marginalia.api.searchquery.model.compiled; import java.lang.reflect.Array; import java.util.Arrays; import java.util.function.Function; -import java.util.function.ToDoubleFunction; +import java.util.function.ToIntFunction; import java.util.function.ToLongFunction; import java.util.stream.Stream; @@ -33,6 +33,15 @@ public class CqData { return new CqDataLong(newData); } + public CqDataLong mapToInt(ToIntFunction mapper) { + long[] newData = new long[data.length]; + for (int i = 0; i < data.length; i++) { + newData[i] = mapper.applyAsInt((T) data[i]); + } + + return new CqDataLong(newData); + } + public T get(int i) { return data[i]; } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataInt.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataInt.java new file mode 100644 index 00000000..24991686 --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataInt.java @@ -0,0 +1,31 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import java.util.Arrays; +import java.util.stream.IntStream; + +public class CqDataInt { + private final int[] data; + + public CqDataInt(int[] data) { + this.data = data; + } + + public int get(int i) { + return data[i]; + } + public int get(CqExpression.Word w) { + return data[w.idx()]; + } + + public IntStream stream() { + return Arrays.stream(data); + } + + public int size() { + return data.length; + } + + public int[] copyData() { + return Arrays.copyOf(data, data.length); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java index 0ab0647d..7e8ca8ec 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java @@ -17,6 +17,9 @@ public class CompiledQueryAggregates { static public boolean booleanAggregate(CompiledQuery query, Predicate predicate) { return query.root.visit(new CqBooleanAggregate(query, predicate)); } + static 
public boolean booleanAggregate(CompiledQueryLong query, LongPredicate predicate) { + return query.root.visit(new CqBooleanAggregate(query, predicate)); + } /** Compiled query aggregate that for a 64b bitmask that treats or-branches as logical OR, @@ -25,13 +28,20 @@ public class CompiledQueryAggregates { public static long longBitmaskAggregate(CompiledQuery query, ToLongFunction operator) { return query.root.visit(new CqLongBitmaskOperator(query, operator)); } - + public static long longBitmaskAggregate(CompiledQueryLong query, LongUnaryOperator operator) { + return query.root.visit(new CqLongBitmaskOperator(query, operator)); + } /** Apply the operator to each leaf node, then return the highest minimum value found along any path */ public static int intMaxMinAggregate(CompiledQuery query, ToIntFunction operator) { return query.root.visit(new CqIntMaxMinOperator(query, operator)); } + /** Apply the operator to each leaf node, then return the highest minimum value found along any path */ + public static int intMaxMinAggregate(CompiledQueryLong query, LongToIntFunction operator) { + return query.root.visit(new CqIntMaxMinOperator(query, operator)); + } + /** Apply the operator to each leaf node, and then return the highest sum of values possible * through each branch in the compiled query. * @@ -49,4 +59,9 @@ public class CompiledQueryAggregates { public static LongSet positionsAggregate(CompiledQuery query, ToLongFunction operator) { return query.root().visit(new CqPositionsOperator(query, operator)); } + + /** Using the bitwise AND operator, aggregate all possible combined values of the long generated by the provided operator */ + public static LongSet positionsAggregate(CompiledQueryLong query, LongUnaryOperator operator) { + return query.root().visit(new CqPositionsOperator(query, operator)); + } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqBooleanAggregate.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqBooleanAggregate.java index 05ebf4c7..2a87ec79 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqBooleanAggregate.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqBooleanAggregate.java @@ -1,10 +1,12 @@ package nu.marginalia.api.searchquery.model.compiled.aggregate; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CqExpression; import java.util.List; import java.util.function.IntPredicate; +import java.util.function.LongPredicate; import java.util.function.Predicate; public class CqBooleanAggregate implements CqExpression.BoolVisitor { @@ -15,6 +17,10 @@ public class CqBooleanAggregate implements CqExpression.BoolVisitor { this.predicate = idx -> objPred.test(query.at(idx)); } + public CqBooleanAggregate(CompiledQueryLong query, LongPredicate longPredicate) { + this.predicate = idx -> longPredicate.test(query.at(idx)); + } + @Override public boolean onAnd(List parts) { for (var part : parts) { diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqDoubleSumOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqDoubleSumOperator.java index 23d1904e..082de29e 100644 --- 
a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqDoubleSumOperator.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqDoubleSumOperator.java @@ -1,10 +1,12 @@ package nu.marginalia.api.searchquery.model.compiled.aggregate; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CqExpression; import java.util.List; import java.util.function.IntToDoubleFunction; +import java.util.function.LongToDoubleFunction; import java.util.function.ToDoubleFunction; public class CqDoubleSumOperator implements CqExpression.DoubleVisitor { @@ -15,6 +17,10 @@ public class CqDoubleSumOperator implements CqExpression.DoubleVisitor { this.operator = idx -> operator.applyAsDouble(query.at(idx)); } + public CqDoubleSumOperator(IntToDoubleFunction operator) { + this.operator = operator; + } + @Override public double onAnd(List parts) { double value = 0; diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java index b3ec86bb..621dff73 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java @@ -1,10 +1,12 @@ package nu.marginalia.api.searchquery.model.compiled.aggregate; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CqExpression; import java.util.List; import java.util.function.IntUnaryOperator; +import java.util.function.LongToIntFunction; import java.util.function.ToIntFunction; public class CqIntMaxMinOperator implements CqExpression.IntVisitor { @@ -16,6 +18,10 @@ public class CqIntMaxMinOperator implements CqExpression.IntVisitor { this.operator = idx -> operator.applyAsInt(query.at(idx)); } + public CqIntMaxMinOperator(CompiledQueryLong query, LongToIntFunction operator) { + this.operator = idx -> operator.applyAsInt(query.at(idx)); + } + @Override public int onAnd(List parts) { int value = parts.getFirst().visit(this); diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqLongBitmaskOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqLongBitmaskOperator.java index d9a4804b..b64029c1 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqLongBitmaskOperator.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqLongBitmaskOperator.java @@ -1,10 +1,12 @@ package nu.marginalia.api.searchquery.model.compiled.aggregate; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CqExpression; import java.util.List; import java.util.function.IntToLongFunction; +import java.util.function.LongUnaryOperator; import java.util.function.ToLongFunction; public class CqLongBitmaskOperator implements CqExpression.LongVisitor { @@ 
-14,6 +16,9 @@ public class CqLongBitmaskOperator implements CqExpression.LongVisitor { public CqLongBitmaskOperator(CompiledQuery query, ToLongFunction operator) { this.operator = idx-> operator.applyAsLong(query.at(idx)); } + public CqLongBitmaskOperator(CompiledQueryLong query, LongUnaryOperator operator) { + this.operator = idx-> operator.applyAsLong(query.at(idx)); + } @Override public long onAnd(List parts) { diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java index 19db2d4b..715c4cb2 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java @@ -4,10 +4,12 @@ import it.unimi.dsi.fastutil.longs.LongArraySet; import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import it.unimi.dsi.fastutil.longs.LongSet; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CqExpression; import java.util.List; import java.util.function.IntToLongFunction; +import java.util.function.LongUnaryOperator; import java.util.function.ToLongFunction; public class CqPositionsOperator implements CqExpression.ObjectVisitor { @@ -17,6 +19,10 @@ public class CqPositionsOperator implements CqExpression.ObjectVisitor this.operator = idx -> operator.applyAsLong(query.at(idx)); } + public CqPositionsOperator(CompiledQueryLong query, LongUnaryOperator operator) { + this.operator = idx -> operator.applyAsLong(query.at(idx)); + } + @Override public LongSet onAnd(List parts) { LongSet ret = new LongArraySet(); diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java index f0ad172f..9052345a 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java @@ -1,38 +1,34 @@ package nu.marginalia.api.searchquery.model.results; -import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; import lombok.ToString; - -import java.util.Map; +import nu.marginalia.api.searchquery.model.compiled.CqDataInt; @ToString public class ResultRankingContext { private final int docCount; public final ResultRankingParameters params; - private final Object2IntOpenHashMap fullCounts = new Object2IntOpenHashMap<>(10, 0.5f); - private final Object2IntOpenHashMap priorityCounts = new Object2IntOpenHashMap<>(10, 0.5f); + /** CqDataInt associated with frequency information of the terms in the query + * in the full index. The dataset is indexed by the compiled query. */ + public final CqDataInt fullCounts; + + /** CqDataInt associated with frequency information of the terms in the query + * in the full index. The dataset is indexed by the compiled query. 
*/ + public final CqDataInt priorityCounts; public ResultRankingContext(int docCount, ResultRankingParameters params, - Map fullCounts, - Map prioCounts - ) { + CqDataInt fullCounts, + CqDataInt prioCounts) + { this.docCount = docCount; this.params = params; - this.fullCounts.putAll(fullCounts); - this.priorityCounts.putAll(prioCounts); + this.fullCounts = fullCounts; + this.priorityCounts = prioCounts; } public int termFreqDocCount() { return docCount; } - public int frequency(String keyword) { - return fullCounts.getOrDefault(keyword, 1); - } - - public int priorityFrequency(String keyword) { - return priorityCounts.getOrDefault(keyword, 1); - } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java index 8f50c9fb..7cd95b96 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java @@ -15,15 +15,24 @@ public class SearchResultItem implements Comparable { * probably not what you want, use getDocumentId() instead */ public final long combinedId; + /** Encoded document metadata */ + public final long encodedDocMetadata; + + /** Encoded html features of document */ + + public final int htmlFeatures; + /** How did the subqueries match against the document ? */ public final List keywordScores; /** How many other potential results existed in the same domain */ public int resultsFromDomain; - public SearchResultItem(long combinedId) { + public SearchResultItem(long combinedId, long encodedDocMetadata, int htmlFeatures) { this.combinedId = combinedId; + this.encodedDocMetadata = encodedDocMetadata; this.keywordScores = new ArrayList<>(); + this.htmlFeatures = htmlFeatures; } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java index a0fd2156..212b2302 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java @@ -2,7 +2,6 @@ package nu.marginalia.api.searchquery.model.results; import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.WordMetadata; -import nu.marginalia.model.idx.DocumentMetadata; import java.util.Objects; @@ -10,34 +9,20 @@ public final class SearchResultKeywordScore { public final long termId; public final String keyword; private final long encodedWordMetadata; - private final long encodedDocMetadata; - - private final int htmlFeatures; public SearchResultKeywordScore(String keyword, long termId, - long encodedWordMetadata, - long encodedDocMetadata, - int htmlFeatures) { + long encodedWordMetadata) { this.termId = termId; this.keyword = keyword; this.encodedWordMetadata = encodedWordMetadata; - this.encodedDocMetadata = encodedDocMetadata; - this.htmlFeatures = htmlFeatures; } public boolean hasTermFlag(WordFlags flag) { return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit()); } - public int positionCount() { - return Long.bitCount(positions()); - } - @Deprecated // FIXME 2024-04-06 - public int subquery() { - return -1; - } public long positions() { return 
WordMetadata.decodePositions(encodedWordMetadata); } @@ -46,44 +31,28 @@ public final class SearchResultKeywordScore { return keyword.contains(":") || hasTermFlag(WordFlags.Synthetic); } - public boolean isKeywordRegular() { - return !keyword.contains(":") - && !hasTermFlag(WordFlags.Synthetic); - } - public long encodedWordMetadata() { return encodedWordMetadata; } - public long encodedDocMetadata() { - return encodedDocMetadata; - } - - public int htmlFeatures() { - return htmlFeatures; - } - @Override public boolean equals(Object obj) { if (obj == this) return true; if (obj == null || obj.getClass() != this.getClass()) return false; var that = (SearchResultKeywordScore) obj; - return Objects.equals(this.keyword, that.keyword) && - this.encodedWordMetadata == that.encodedWordMetadata && - this.encodedDocMetadata == that.encodedDocMetadata; + return Objects.equals(this.termId, that.termId); } @Override public int hashCode() { - return Objects.hash(keyword, encodedWordMetadata, encodedDocMetadata); + return Objects.hash(termId); } @Override public String toString() { return "SearchResultKeywordScore[" + "keyword=" + keyword + ", " + - "encodedWordMetadata=" + new WordMetadata(encodedWordMetadata) + ", " + - "encodedDocMetadata=" + new DocumentMetadata(encodedDocMetadata) + ']'; + "encodedWordMetadata=" + new WordMetadata(encodedWordMetadata) + ']'; } } diff --git a/code/functions/search-query/api/src/main/protobuf/query-api.proto b/code/functions/search-query/api/src/main/protobuf/query-api.proto index df25c494..3094699b 100644 --- a/code/functions/search-query/api/src/main/protobuf/query-api.proto +++ b/code/functions/search-query/api/src/main/protobuf/query-api.proto @@ -98,16 +98,16 @@ message RpcDecoratedResultItem { message RpcRawResultItem { int64 combinedId = 1; // raw ID with bit-encoded ranking information still present int32 resultsFromDomain = 2; // number of other results from the same domain - repeated RpcResultKeywordScore keywordScores = 3; + int64 encodedDocMetadata = 3; // bit encoded document metadata + int32 htmlFeatures = 4; // bitmask encoding features of the document + repeated RpcResultKeywordScore keywordScores = 5; } /* Information about how well a keyword matches a query */ message RpcResultKeywordScore { string keyword = 1; // the keyword int64 encodedWordMetadata = 2; // bit encoded word metadata - int64 encodedDocMetadata = 3; // bit encoded document metadata - bool hasPriorityTerms = 4; // true if this word is important to the document - int32 htmlFeatures = 5; // bit encoded document features + bool hasPriorityTerms = 3; // true if this word is important to the document } /* Query execution parameters */ diff --git a/code/index/java/nu/marginalia/index/IndexGrpcService.java b/code/index/java/nu/marginalia/index/IndexGrpcService.java index 36b611ff..fa0a8343 100644 --- a/code/index/java/nu/marginalia/index/IndexGrpcService.java +++ b/code/index/java/nu/marginalia/index/IndexGrpcService.java @@ -11,6 +11,7 @@ import lombok.SneakyThrows; import nu.marginalia.api.searchquery.*; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.CqDataInt; import nu.marginalia.api.searchquery.model.query.SearchSpecification; import nu.marginalia.api.searchquery.model.results.*; import nu.marginalia.array.buffer.LongQueryBuffer; @@ -135,14 +136,14 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { var rawItem = 
RpcRawResultItem.newBuilder(); rawItem.setCombinedId(rawResult.combinedId); rawItem.setResultsFromDomain(rawResult.resultsFromDomain); + rawItem.setHtmlFeatures(rawResult.htmlFeatures); + rawItem.setEncodedDocMetadata(rawResult.encodedDocMetadata); for (var score : rawResult.keywordScores) { rawItem.addKeywordScores( RpcResultKeywordScore.newBuilder() - .setEncodedDocMetadata(score.encodedDocMetadata()) .setEncodedWordMetadata(score.encodedWordMetadata()) .setKeyword(score.keyword) - .setHtmlFeatures(score.htmlFeatures()) ); } @@ -203,9 +204,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { return new SearchResultSet(List.of()); } - ResultRankingContext rankingContext = createRankingContext(params.rankingParams, - params.compiledQuery, - params.compiledQueryIds); + ResultRankingContext rankingContext = createRankingContext(params.rankingParams, params.compiledQueryIds); var queryExecution = new QueryExecution(rankingContext, params.fetchSize); @@ -414,22 +413,22 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { } private ResultRankingContext createRankingContext(ResultRankingParameters rankingParams, - CompiledQuery query, CompiledQueryLong compiledQueryIds) { - Map termToId = new HashMap<>(query.size()); - query.indices().forEach(id -> termToId.put(query.at(id), compiledQueryIds.at(id))); - final Map termFrequencies = new HashMap<>(termToId.size()); - final Map prioFrequencies = new HashMap<>(termToId.size()); + int[] full = new int[compiledQueryIds.size()]; + int[] prio = new int[compiledQueryIds.size()]; - termToId.forEach((key, id) -> termFrequencies.put(key, index.getTermFrequency(id))); - termToId.forEach((key, id) -> prioFrequencies.put(key, index.getTermFrequencyPrio(id))); + for (int idx = 0; idx < compiledQueryIds.size(); idx++) { + long id = compiledQueryIds.at(idx); + full[idx] = index.getTermFrequency(id); + prio[idx] = index.getTermFrequencyPrio(id); + } return new ResultRankingContext(index.getTotalDocCount(), rankingParams, - termFrequencies, - prioFrequencies); + new CqDataInt(full), + new CqDataInt(prio)); } } diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java index 3777cf4f..89b4c543 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java @@ -1,7 +1,6 @@ package nu.marginalia.index.results; -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; -import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.*; import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.SearchResultItem; @@ -70,39 +69,42 @@ public class IndexResultValuationContext { long docMetadata = statefulIndex.getDocumentMetadata(docId); int htmlFeatures = statefulIndex.getHtmlFeatures(docId); - SearchResultItem searchResult = new SearchResultItem(docId); + SearchResultItem searchResult = new SearchResultItem(docId, docMetadata, htmlFeatures); + + long[] wordMetas = new long[compiledQuery.size()]; + SearchResultKeywordScore[] scores = new SearchResultKeywordScore[compiledQuery.size()]; + + for (int i = 0; i < wordMetas.length; i++) { + final long termId = compiledQueryIds.at(i); + final String term = 
compiledQuery.at(i); + + wordMetas[i] = termMetadataForCombinedDocumentIds.getTermMetadata(termId, combinedId); + scores[i] = new SearchResultKeywordScore(term, termId, wordMetas[i]); + } - SearchResultKeywordScore[] scores = compiledQuery.indices().mapToObj(idx -> - new SearchResultKeywordScore( - compiledQuery.at(idx), - compiledQueryIds.at(idx), - termMetadataForCombinedDocumentIds.getTermMetadata( - compiledQueryIds.at(idx), combinedId - ), - docMetadata, - htmlFeatures) - ) - .toArray(SearchResultKeywordScore[]::new); // DANGER: IndexResultValuatorService assumes that searchResult.keywordScores has this specific order, as it needs // to be able to re-construct its own CompiledQuery for re-ranking the results. This is // a very flimsy assumption. searchResult.keywordScores.addAll(List.of(scores)); - CompiledQuery queryGraphScores = new CompiledQuery<>(compiledQuery.root, scores); + CompiledQueryLong wordMetasQuery = new CompiledQueryLong(compiledQuery.root, new CqDataLong(wordMetas)); - boolean allSynthetic = !CompiledQueryAggregates.booleanAggregate(queryGraphScores, score -> !score.hasTermFlag(WordFlags.Synthetic)); - int flagsCount = CompiledQueryAggregates.intMaxMinAggregate(queryGraphScores, score -> Long.bitCount(score.encodedWordMetadata() & flagsFilterMask)); - int positionsCount = CompiledQueryAggregates.intMaxMinAggregate(queryGraphScores, SearchResultKeywordScore::positionCount); + boolean allSynthetic = !CompiledQueryAggregates.booleanAggregate(wordMetasQuery, WordFlags.Synthetic::isAbsent); + int flagsCount = CompiledQueryAggregates.intMaxMinAggregate(wordMetasQuery, wordMeta -> Long.bitCount(wordMeta & flagsFilterMask)); + int positionsCount = CompiledQueryAggregates.intMaxMinAggregate(wordMetasQuery, wordMeta -> Long.bitCount(WordMetadata.decodePositions(wordMeta))); - if (!meetsQueryStrategyRequirements(queryGraphScores, queryParams.queryStrategy())) { + if (!meetsQueryStrategyRequirements(wordMetasQuery, queryParams.queryStrategy())) { return null; } if (flagsCount == 0 && !allSynthetic && positionsCount == 0) return null; - double score = searchResultValuator.calculateSearchResultValue(queryGraphScores, + double score = searchResultValuator.calculateSearchResultValue( + wordMetasQuery, + docMetadata, + htmlFeatures, 5000, // use a dummy value here as it's not present in the index rankingContext); @@ -111,7 +113,7 @@ public class IndexResultValuationContext { return searchResult; } - private boolean meetsQueryStrategyRequirements(CompiledQuery queryGraphScores, + private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores, QueryStrategy queryStrategy) { if (queryStrategy == QueryStrategy.AUTO || @@ -124,24 +126,24 @@ public class IndexResultValuationContext { docs -> meetsQueryStrategyRequirements(docs, queryParams.queryStrategy())); } - private boolean meetsQueryStrategyRequirements(SearchResultKeywordScore termScore, QueryStrategy queryStrategy) { + private boolean meetsQueryStrategyRequirements(long wordMeta, QueryStrategy queryStrategy) { if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SITE) { - return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.Site.asBit()); + return WordFlags.Site.isPresent(wordMeta); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SUBJECT) { - return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.Subjects.asBit()); + return WordFlags.Subjects.isPresent(wordMeta); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) { - return 
WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.Title.asBit()); + return WordFlags.Title.isPresent(wordMeta); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_URL) { - return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.UrlPath.asBit()); + return WordFlags.UrlPath.isPresent(wordMeta); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_DOMAIN) { - return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.UrlDomain.asBit()); + return WordFlags.UrlDomain.isPresent(wordMeta); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_LINK) { - return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.ExternalLink.asBit()); + return WordFlags.ExternalLink.isPresent(wordMeta); } return true; } diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java b/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java index a84e5f4f..2fa44c31 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java @@ -6,16 +6,19 @@ import gnu.trove.list.TLongList; import gnu.trove.list.array.TLongArrayList; import it.unimi.dsi.fastutil.longs.LongSet; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.CqDataInt; +import nu.marginalia.api.searchquery.model.compiled.CqDataLong; import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.SearchResultItem; -import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.model.SearchParameters; import nu.marginalia.index.results.model.ids.CombinedDocIdList; import nu.marginalia.linkdb.docs.DocumentDbReader; import nu.marginalia.linkdb.model.DocdbUrlDetail; +import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.ranking.results.ResultValuator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -126,22 +129,31 @@ public class IndexResultValuatorService { continue; } - // Reconstruct the SearchResultKeywordScore-compiledquery for re-valuation + // Reconstruct the compiledquery for re-valuation // // CAVEAT: This hinges on a very fragile that IndexResultValuationContext puts them in the same // order as the data for the CompiledQuery. 
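            // (Illustrative sketch only, using the field names seen elsewhere in this
            //  patch: the ordering invariant the CAVEAT above depends on is roughly
            //
            //      for (int i = 0; i < compiledQuery.size(); i++) {
            //          assert result.keywordScores.get(i).termId == compiledQueryIds.at(i);
            //      }
            //
            //  i.e. IndexResultValuationContext must append keyword scores in the same
            //  order as the compiled query's terms; identifiers here are illustrative.)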
- CompiledQuery resultQuery = - new CompiledQuery<>(compiledQuery.root, result.keywordScores.toArray(SearchResultKeywordScore[]::new)); + long[] wordMetas = new long[compiledQuery.size()]; + for (int i = 0; i < compiledQuery.size(); i++) { + var score = result.keywordScores.get(i); + wordMetas[i] = score.encodedWordMetadata(); + } - resultItems.add(createCombinedItem(result, docData, resultQuery, rankingContext)); + CompiledQueryLong metaQuery = new CompiledQueryLong(compiledQuery.root, new CqDataLong(wordMetas)); + + resultItems.add(createCombinedItem( + result, + docData, + metaQuery, + rankingContext)); } return resultItems; } private DecoratedSearchResultItem createCombinedItem(SearchResultItem result, DocdbUrlDetail docData, - CompiledQuery resultQuery, + CompiledQueryLong wordMetas, ResultRankingContext rankingContext) { return new DecoratedSearchResultItem( result, @@ -154,13 +166,19 @@ public class IndexResultValuatorService { docData.pubYear(), docData.dataHash(), docData.wordsTotal(), - bestPositions(resultQuery), - resultValuator.calculateSearchResultValue(resultQuery, docData.wordsTotal(), rankingContext) + bestPositions(wordMetas), + + resultValuator.calculateSearchResultValue(wordMetas, + result.encodedDocMetadata, + result.htmlFeatures, + docData.wordsTotal(), + rankingContext) ); } - private long bestPositions(CompiledQuery resultQuery) { - LongSet positionsSet = CompiledQueryAggregates.positionsAggregate(resultQuery, SearchResultKeywordScore::positions); + private long bestPositions(CompiledQueryLong wordMetas) { + LongSet positionsSet = CompiledQueryAggregates.positionsAggregate(wordMetas, WordMetadata::decodePositions); + int bestPc = 0; long bestPositions = 0; diff --git a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java index 862978c9..4d257349 100644 --- a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java +++ b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java @@ -1,9 +1,8 @@ package nu.marginalia.ranking.results; -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; -import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.idx.DocumentFlags; @@ -15,36 +14,32 @@ import com.google.inject.Singleton; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.ArrayList; -import java.util.List; - @Singleton public class ResultValuator { final static double scalingFactor = 500.; - private final Bm25Factor bm25Factor; private final TermCoherenceFactor termCoherenceFactor; private static final Logger logger = LoggerFactory.getLogger(ResultValuator.class); @Inject - public ResultValuator(Bm25Factor bm25Factor, - TermCoherenceFactor termCoherenceFactor) { - this.bm25Factor = bm25Factor; + public ResultValuator(TermCoherenceFactor termCoherenceFactor) { this.termCoherenceFactor = termCoherenceFactor; } - public double calculateSearchResultValue(CompiledQuery scores, + public double calculateSearchResultValue(CompiledQueryLong wordMeta, + long documentMetadata, + int features, int length, ResultRankingContext ctx) { - if (scores.size() == 0) + if (wordMeta.isEmpty()) return 
Double.MAX_VALUE; - if (length < 0) - length = 5000; - long documentMetadata = scores.at(0).encodedDocMetadata(); - int features = scores.at(0).htmlFeatures(); + if (length < 0) { + length = 5000; + } + var rankingParams = ctx.params; int rank = DocumentMetadata.decodeRank(documentMetadata); @@ -79,9 +74,10 @@ public class ResultValuator { + temporalBias + flagsPenalty; - double bestTcf = rankingParams.tcfWeight * termCoherenceFactor.calculate(scores); - double bestBM25F = rankingParams.bm25FullWeight * bm25Factor.calculateBm25(rankingParams.prioParams, scores, length, ctx); - double bestBM25P = rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, scores, ctx); + double bestTcf = rankingParams.tcfWeight * termCoherenceFactor.calculate(wordMeta); + + double bestBM25F = rankingParams.bm25FullWeight * wordMeta.root.visit(new Bm25FullGraphVisitor(rankingParams.fullParams, wordMeta.data, length, ctx)); + double bestBM25P = rankingParams.bm25PrioWeight * wordMeta.root.visit(new Bm25PrioGraphVisitor(rankingParams.prioParams, wordMeta.data, ctx)); double overallPartPositive = Math.max(0, overallPart); double overallPartNegative = -Math.min(0, overallPart); diff --git a/code/index/java/nu/marginalia/ranking/results/factors/Bm25Factor.java b/code/index/java/nu/marginalia/ranking/results/factors/Bm25Factor.java deleted file mode 100644 index bc13671e..00000000 --- a/code/index/java/nu/marginalia/ranking/results/factors/Bm25Factor.java +++ /dev/null @@ -1,113 +0,0 @@ -package nu.marginalia.ranking.results.factors; - -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; -import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; -import nu.marginalia.api.searchquery.model.results.Bm25Parameters; -import nu.marginalia.api.searchquery.model.results.ResultRankingContext; -import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; -import nu.marginalia.model.idx.WordFlags; - -public class Bm25Factor { - private static final int AVG_LENGTH = 5000; - - /** This is an estimation of BM-25. - * - * @see Bm25Parameters - */ - public double calculateBm25(Bm25Parameters bm25Parameters, CompiledQuery scores, int length, ResultRankingContext ctx) { - final int docCount = ctx.termFreqDocCount(); - - return CompiledQueryAggregates.doubleSumAggregate(scores, keyword -> { - double count = keyword.positionCount(); - - int freq = ctx.frequency(keyword.keyword); - - return invFreq(docCount, freq) * f(bm25Parameters.k(), bm25Parameters.b(), count, length); - }); - } - - /** Bm25 calculation, except instead of counting positions in the document, - * the number of relevance signals for the term is counted instead. 
- */ - public double calculateBm25Prio(Bm25Parameters bm25Parameters, CompiledQuery scores, ResultRankingContext ctx) { - final int docCount = ctx.termFreqDocCount(); - - return CompiledQueryAggregates.doubleSumAggregate(scores, keyword -> { - double count = evaluatePriorityScore(keyword); - - int freq = ctx.priorityFrequency(keyword.keyword); - - // note we override b to zero for priority terms as they are independent of document length - return invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0); - }); - - } - - private static double evaluatePriorityScore(SearchResultKeywordScore keyword) { - int pcount = keyword.positionCount(); - - double qcount = 0.; - - if ((keyword.encodedWordMetadata() & WordFlags.ExternalLink.asBit()) != 0) { - - qcount += 2.5; - - if ((keyword.encodedWordMetadata() & WordFlags.UrlDomain.asBit()) != 0) - qcount += 2.5; - else if ((keyword.encodedWordMetadata() & WordFlags.UrlPath.asBit()) != 0) - qcount += 1.5; - - if ((keyword.encodedWordMetadata() & WordFlags.Site.asBit()) != 0) - qcount += 1.25; - if ((keyword.encodedWordMetadata() & WordFlags.SiteAdjacent.asBit()) != 0) - qcount += 1.25; - } - else { - if ((keyword.encodedWordMetadata() & WordFlags.UrlDomain.asBit()) != 0) - qcount += 3; - else if ((keyword.encodedWordMetadata() & WordFlags.UrlPath.asBit()) != 0) - qcount += 1; - - if ((keyword.encodedWordMetadata() & WordFlags.Site.asBit()) != 0) - qcount += 0.5; - if ((keyword.encodedWordMetadata() & WordFlags.SiteAdjacent.asBit()) != 0) - qcount += 0.5; - } - - if ((keyword.encodedWordMetadata() & WordFlags.Title.asBit()) != 0) - qcount += 1.5; - - if (pcount > 2) { - if ((keyword.encodedWordMetadata() & WordFlags.Subjects.asBit()) != 0) - qcount += 1.25; - if ((keyword.encodedWordMetadata() & WordFlags.NamesWords.asBit()) != 0) - qcount += 0.25; - if ((keyword.encodedWordMetadata() & WordFlags.TfIdfHigh.asBit()) != 0) - qcount += 0.5; - } - - return qcount; - } - - /** - * - * @param docCount Number of documents - * @param freq Number of matching documents - */ - private double invFreq(int docCount, int freq) { - return Math.log(1.0 + (docCount - freq + 0.5) / (freq + 0.5)); - } - - /** - * - * @param k determines the size of the impact of a single term - * @param b determines the magnitude of the length normalization - * @param count number of occurrences in the document - * @param length document length - */ - private double f(double k, double b, double count, int length) { - final double lengthRatio = (double) length / AVG_LENGTH; - - return (count * (k + 1)) / (count + k * (1 - b + b * lengthRatio)); - } -} diff --git a/code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java b/code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java new file mode 100644 index 00000000..9c46261d --- /dev/null +++ b/code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java @@ -0,0 +1,81 @@ +package nu.marginalia.ranking.results.factors; + +import nu.marginalia.api.searchquery.model.compiled.CqDataInt; +import nu.marginalia.api.searchquery.model.compiled.CqDataLong; +import nu.marginalia.api.searchquery.model.compiled.CqExpression; +import nu.marginalia.api.searchquery.model.results.Bm25Parameters; +import nu.marginalia.api.searchquery.model.results.ResultRankingContext; +import nu.marginalia.model.idx.WordMetadata; + +import java.util.List; + +public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor { + private static final long AVG_LENGTH = 5000; + + private final CqDataLong 
wordMetaData; + private final CqDataInt frequencies; + private final Bm25Parameters bm25Parameters; + + private final int docCount; + private final int length; + + public Bm25FullGraphVisitor(Bm25Parameters bm25Parameters, + CqDataLong wordMetaData, + int length, + ResultRankingContext ctx) { + this.length = length; + this.bm25Parameters = bm25Parameters; + this.docCount = ctx.termFreqDocCount(); + this.wordMetaData = wordMetaData; + this.frequencies = ctx.fullCounts; + } + + @Override + public double onAnd(List parts) { + double value = 0; + for (var part : parts) { + value += part.visit(this); + } + return value; + } + + @Override + public double onOr(List parts) { + double value = 0; + for (var part : parts) { + value = Math.max(value, part.visit(this)); + } + return value; + } + + @Override + public double onLeaf(int idx) { + double count = Long.bitCount(WordMetadata.decodePositions(wordMetaData.get(idx))); + + int freq = frequencies.get(idx); + + return invFreq(docCount, freq) * f(bm25Parameters.k(), bm25Parameters.b(), count, length); + } + + /** + * + * @param docCount Number of documents + * @param freq Number of matching documents + */ + private double invFreq(int docCount, int freq) { + return Math.log(1.0 + (docCount - freq + 0.5) / (freq + 0.5)); + } + + /** + * + * @param k determines the size of the impact of a single term + * @param b determines the magnitude of the length normalization + * @param count number of occurrences in the document + * @param length document length + */ + private double f(double k, double b, double count, int length) { + final double lengthRatio = (double) length / AVG_LENGTH; + + return (count * (k + 1)) / (count + k * (1 - b + b * lengthRatio)); + } +} diff --git a/code/index/java/nu/marginalia/ranking/results/factors/Bm25PrioGraphVisitor.java b/code/index/java/nu/marginalia/ranking/results/factors/Bm25PrioGraphVisitor.java new file mode 100644 index 00000000..1fb26f6b --- /dev/null +++ b/code/index/java/nu/marginalia/ranking/results/factors/Bm25PrioGraphVisitor.java @@ -0,0 +1,127 @@ +package nu.marginalia.ranking.results.factors; + +import nu.marginalia.api.searchquery.model.compiled.CqDataInt; +import nu.marginalia.api.searchquery.model.compiled.CqDataLong; +import nu.marginalia.api.searchquery.model.compiled.CqExpression; +import nu.marginalia.api.searchquery.model.results.Bm25Parameters; +import nu.marginalia.api.searchquery.model.results.ResultRankingContext; +import nu.marginalia.model.idx.WordFlags; +import nu.marginalia.model.idx.WordMetadata; + +import java.util.List; + +public class Bm25PrioGraphVisitor implements CqExpression.DoubleVisitor { + private static final long AVG_LENGTH = 5000; + + private final CqDataLong wordMetaData; + private final CqDataInt frequencies; + private final Bm25Parameters bm25Parameters; + + private final int docCount; + + public Bm25PrioGraphVisitor(Bm25Parameters bm25Parameters, + CqDataLong wordMetaData, + ResultRankingContext ctx) { + this.bm25Parameters = bm25Parameters; + this.docCount = ctx.termFreqDocCount(); + this.wordMetaData = wordMetaData; + this.frequencies = ctx.fullCounts; + } + + @Override + public double onAnd(List parts) { + double value = 0; + for (var part : parts) { + value += part.visit(this); + } + return value; + } + + @Override + public double onOr(List parts) { + double value = 0; + for (var part : parts) { + value = Math.max(value, part.visit(this)); + } + return value; + } + + @Override + public double onLeaf(int idx) { + double count = 
evaluatePriorityScore(wordMetaData.get(idx)); + + int freq = frequencies.get(idx); + + // note we override b to zero for priority terms as they are independent of document length + return invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0); + } + + private static double evaluatePriorityScore(long wordMeta) { + int pcount = Long.bitCount(WordMetadata.decodePositions(wordMeta)); + + double qcount = 0.; + + if ((wordMeta & WordFlags.ExternalLink.asBit()) != 0) { + + qcount += 2.5; + + if ((wordMeta & WordFlags.UrlDomain.asBit()) != 0) + qcount += 2.5; + else if ((wordMeta & WordFlags.UrlPath.asBit()) != 0) + qcount += 1.5; + + if ((wordMeta & WordFlags.Site.asBit()) != 0) + qcount += 1.25; + if ((wordMeta & WordFlags.SiteAdjacent.asBit()) != 0) + qcount += 1.25; + } + else { + if ((wordMeta & WordFlags.UrlDomain.asBit()) != 0) + qcount += 3; + else if ((wordMeta & WordFlags.UrlPath.asBit()) != 0) + qcount += 1; + + if ((wordMeta & WordFlags.Site.asBit()) != 0) + qcount += 0.5; + if ((wordMeta & WordFlags.SiteAdjacent.asBit()) != 0) + qcount += 0.5; + } + + if ((wordMeta & WordFlags.Title.asBit()) != 0) + qcount += 1.5; + + if (pcount > 2) { + if ((wordMeta & WordFlags.Subjects.asBit()) != 0) + qcount += 1.25; + if ((wordMeta & WordFlags.NamesWords.asBit()) != 0) + qcount += 0.25; + if ((wordMeta & WordFlags.TfIdfHigh.asBit()) != 0) + qcount += 0.5; + } + + return qcount; + } + + + /** + * + * @param docCount Number of documents + * @param freq Number of matching documents + */ + private double invFreq(int docCount, int freq) { + return Math.log(1.0 + (docCount - freq + 0.5) / (freq + 0.5)); + } + + /** + * + * @param k determines the size of the impact of a single term + * @param b determines the magnitude of the length normalization + * @param count number of occurrences in the document + * @param length document length + */ + private double f(double k, double b, double count, int length) { + final double lengthRatio = (double) length / AVG_LENGTH; + + return (count * (k + 1)) / (count + k * (1 - b + b * lengthRatio)); + } +} diff --git a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java b/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java index 71159c58..e617549d 100644 --- a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java +++ b/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java @@ -1,16 +1,16 @@ package nu.marginalia.ranking.results.factors; -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; -import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.model.idx.WordMetadata; /** Rewards documents where terms appear frequently within the same sentences */ public class TermCoherenceFactor { - public double calculate(CompiledQuery scores) { - long mask = CompiledQueryAggregates.longBitmaskAggregate(scores, score -> score.positions() & WordMetadata.POSITIONS_MASK); + public double calculate(CompiledQueryLong wordMetadataQuery) { + long mask = CompiledQueryAggregates.longBitmaskAggregate(wordMetadataQuery, + score -> score >>> WordMetadata.POSITIONS_SHIFT); return bitsSetFactor(mask); } diff --git a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java index 
301b5e19..7b0a6a24 100644 --- a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java +++ b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java @@ -215,9 +215,7 @@ public class IndexQueryServiceIntegrationSmokeTest { Set years = new HashSet<>(); for (var res : rsp.results) { - for (var score : res.rawIndexResult.getKeywordScores()) { - years.add(DocumentMetadata.decodeYear(score.encodedDocMetadata())); - } + years.add(DocumentMetadata.decodeYear(res.rawIndexResult.encodedDocMetadata)); } assertEquals(Set.of(1998), years); diff --git a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java index 948c5857..c605a0a8 100644 --- a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java +++ b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java @@ -25,7 +25,7 @@ class IndexResultDomainDeduplicatorTest { } SearchResultItem forId(int domain, int ordinal) { - return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), List.of(), 4, Double.NaN); + return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(), 4, Double.NaN); } } \ No newline at end of file diff --git a/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java b/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java index 243ae90d..a1b66b04 100644 --- a/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java +++ b/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java @@ -1,6 +1,8 @@ package nu.marginalia.ranking.results; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.CqDataInt; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; @@ -31,30 +33,27 @@ class ResultValuatorTest { when(dict.docCount()).thenReturn(100_000); valuator = new ResultValuator( - new Bm25Factor(), new TermCoherenceFactor() ); } - CompiledQuery titleOnlyLowCountSet = CompiledQuery.just( - new SearchResultKeywordScore("bob", 1, - wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title)), - docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)), - 0) - ); - CompiledQuery highCountNoTitleSet = CompiledQuery.just( - new SearchResultKeywordScore("bob", 1, - wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh)), - docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)), - 0) - ); - CompiledQuery highCountSubjectSet = CompiledQuery.just( + CqDataInt frequencyData = new CqDataInt(new int[] { 10 }); + + CompiledQueryLong titleOnlyLowCountSet = CompiledQuery.just( new SearchResultKeywordScore("bob", 1, - wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects)), - docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)), - 0) - ); + wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title))) + ).mapToLong(SearchResultKeywordScore::encodedWordMetadata); + + CompiledQueryLong highCountNoTitleSet = CompiledQuery.just( + new SearchResultKeywordScore("bob", 1, + wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh))) + 
).mapToLong(SearchResultKeywordScore::encodedWordMetadata);; + + CompiledQueryLong highCountSubjectSet = CompiledQuery.just( + new SearchResultKeywordScore("bob", 1, + wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects))) + ).mapToLong(SearchResultKeywordScore::encodedWordMetadata);; @Test @@ -63,12 +62,16 @@ class ResultValuatorTest { when(dict.getTermFreq("bob")).thenReturn(10); ResultRankingContext context = new ResultRankingContext(100000, ResultRankingParameters.sensibleDefaults(), - Map.of("bob", 10), Collections.emptyMap()); + frequencyData, + frequencyData); - double titleOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, 10_000, context); - double titleLongOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, 10_000, context); - double highCountNoTitle = valuator.calculateSearchResultValue(highCountNoTitleSet, 10_000, context); - double highCountSubject = valuator.calculateSearchResultValue(highCountSubjectSet, 10_000, context); + long docMeta = docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)); + int features = 0; + + double titleOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, docMeta, features, 10_000, context); + double titleLongOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, docMeta, features, 10_000, context); + double highCountNoTitle = valuator.calculateSearchResultValue(highCountNoTitleSet, docMeta, features, 10_000, context); + double highCountSubject = valuator.calculateSearchResultValue(highCountSubjectSet, docMeta, features, 10_000, context); System.out.println(titleOnlyLowCount); System.out.println(titleLongOnlyLowCount); diff --git a/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java b/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java index 028896d9..d0abe443 100644 --- a/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java +++ b/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java @@ -18,14 +18,23 @@ class TermCoherenceFactorTest { @Test public void testAllBitsSet() { var allPositionsSet = createSet( - WordMetadata.POSITIONS_MASK, WordMetadata.POSITIONS_MASK + ~0L, + ~0L ); - long mask = CompiledQueryAggregates.longBitmaskAggregate(allPositionsSet, score -> score.positions() & WordMetadata.POSITIONS_MASK); + long mask = CompiledQueryAggregates.longBitmaskAggregate( + allPositionsSet, + SearchResultKeywordScore::positions + ); assertEquals(1.0, termCoherenceFactor.bitsSetFactor(mask), 0.01); - assertEquals(1.0, termCoherenceFactor.calculate(allPositionsSet)); + assertEquals(1.0, + termCoherenceFactor.calculate( + allPositionsSet.mapToLong(SearchResultKeywordScore::encodedWordMetadata) + ) + ); + } @Test @@ -38,7 +47,7 @@ class TermCoherenceFactorTest { assertEquals(0, termCoherenceFactor.bitsSetFactor(mask), 0.01); - assertEquals(0, termCoherenceFactor.calculate(allPositionsSet)); + assertEquals(0, termCoherenceFactor.calculate(allPositionsSet.mapToLong(SearchResultKeywordScore::encodedWordMetadata))); } @Test @SuppressWarnings("unchecked") @@ -90,7 +99,7 @@ class TermCoherenceFactorTest { for (int i = 0; i < positionMasks.length; i++) { keywords.add(new SearchResultKeywordScore("", 0, - new WordMetadata(positionMasks[i], (byte) 0).encode(), 0, 0)); + new WordMetadata(positionMasks[i] & WordMetadata.POSITIONS_MASK, (byte) 0).encode())); } return 
CompiledQuery.just(keywords.toArray(SearchResultKeywordScore[]::new)); diff --git a/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java index f5068d07..df24ec10 100644 --- a/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java +++ b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java @@ -15,7 +15,7 @@ class NgramLexiconTest { } void addNgram(String... ngram) { - lexicon.incOrdered(HasherGroup.ordered().rollingHash(ngram)); + lexicon.incOrderedTitle(HasherGroup.ordered().rollingHash(ngram)); } @Test diff --git a/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java b/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java index a67582bd..faba9eb7 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java +++ b/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java @@ -38,7 +38,7 @@ public class ClusteredUrlDetails implements Comparable { for (var keywordScore : urlDetail.resultItem.keywordScores) { if (keywordScore.isKeywordSpecial()) continue; - if (keywordScore.positionCount() == 0) + if (keywordScore.positions() == 0) continue; if (keywordScore.hasTermFlag(WordFlags.Title)) From 599e719ad44bb3c9939be3f583a490815998e7f0 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 15 Apr 2024 16:44:08 +0200 Subject: [PATCH 46/47] (index) Fix priority search terms This functionality fell into disrepair some while ago. It's supposed to allow non-mandatory search terms that boost the ranking if they are present in the document. 
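In outline, the fix below tags a result as having a priority term when any of the priority term ids resolves to non-zero metadata for that document, and then scales the result score by 0.75; since the valuator treats lower scores as better (it returns Double.MAX_VALUE in the degenerate case), the scaling acts as a boost. A condensed sketch of that logic, with a functional stand-in for the index's metadata lookup rather than the real TermMetadataForCombinedDocumentIds:

    import java.util.function.LongBinaryOperator;

    class PrioTermBoostSketch {
        // termMeta.applyAsLong(termId, combinedId) stands in for the metadata lookup;
        // it is assumed to return 0 when the term does not occur in the document.
        static boolean hasPrioTerm(long combinedId, long[] prioTermIds, LongBinaryOperator termMeta) {
            for (long termId : prioTermIds) {
                if (termMeta.applyAsLong(termId, combinedId) != 0) {
                    return true;
                }
            }
            return false;
        }

        // Lower scores rank higher, so scaling by 0.75 rewards documents with a priority hit.
        static double applyBoost(double score, boolean hasPrioTerm) {
            return hasPrioTerm ? 0.75 * score : score;
        }
    }
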
--- .../api/searchquery/QueryProtobufCodec.java | 1 + .../model/results/SearchResultItem.java | 10 +++++++- .../api/src/main/protobuf/query-api.proto | 2 +- .../nu/marginalia/index/IndexGrpcService.java | 1 + .../index/results/IndexMetadataService.java | 23 +++++++++++++++++++ .../results/IndexResultValuationContext.java | 21 +++++++++++++++-- .../index/results/model/QuerySearchTerms.java | 3 +++ .../TermMetadataForCombinedDocumentIds.java | 11 ++++++++- .../IndexResultDomainDeduplicatorTest.java | 2 +- 9 files changed, 68 insertions(+), 6 deletions(-) diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java index 5a43df1b..2907992d 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java @@ -138,6 +138,7 @@ public class QueryProtobufCodec { rawItem.getHtmlFeatures(), keywordScores, rawItem.getResultsFromDomain(), + rawItem.getHasPriorityTerms(), Double.NaN // Not set ); } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java index 7cd95b96..ad8b8cb1 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java @@ -28,11 +28,17 @@ public class SearchResultItem implements Comparable { /** How many other potential results existed in the same domain */ public int resultsFromDomain; - public SearchResultItem(long combinedId, long encodedDocMetadata, int htmlFeatures) { + public boolean hasPrioTerm; + + public SearchResultItem(long combinedId, + long encodedDocMetadata, + int htmlFeatures, + boolean hasPrioTerm) { this.combinedId = combinedId; this.encodedDocMetadata = encodedDocMetadata; this.keywordScores = new ArrayList<>(); this.htmlFeatures = htmlFeatures; + this.hasPrioTerm = hasPrioTerm; } @@ -85,4 +91,6 @@ public class SearchResultItem implements Comparable { return Long.compare(this.combinedId, o.combinedId); } + + } diff --git a/code/functions/search-query/api/src/main/protobuf/query-api.proto b/code/functions/search-query/api/src/main/protobuf/query-api.proto index 3094699b..bae06e66 100644 --- a/code/functions/search-query/api/src/main/protobuf/query-api.proto +++ b/code/functions/search-query/api/src/main/protobuf/query-api.proto @@ -101,13 +101,13 @@ message RpcRawResultItem { int64 encodedDocMetadata = 3; // bit encoded document metadata int32 htmlFeatures = 4; // bitmask encoding features of the document repeated RpcResultKeywordScore keywordScores = 5; + bool hasPriorityTerms = 6; // true if this word is important to the document } /* Information about how well a keyword matches a query */ message RpcResultKeywordScore { string keyword = 1; // the keyword int64 encodedWordMetadata = 2; // bit encoded word metadata - bool hasPriorityTerms = 3; // true if this word is important to the document } /* Query execution parameters */ diff --git a/code/index/java/nu/marginalia/index/IndexGrpcService.java b/code/index/java/nu/marginalia/index/IndexGrpcService.java index fa0a8343..4810d625 100644 --- a/code/index/java/nu/marginalia/index/IndexGrpcService.java +++ 
b/code/index/java/nu/marginalia/index/IndexGrpcService.java @@ -138,6 +138,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { rawItem.setResultsFromDomain(rawResult.resultsFromDomain); rawItem.setHtmlFeatures(rawResult.htmlFeatures); rawItem.setEncodedDocMetadata(rawResult.encodedDocMetadata); + rawItem.setHasPriorityTerms(rawResult.hasPrioTerm); for (var score : rawResult.keywordScores) { rawItem.addKeywordScores( diff --git a/code/index/java/nu/marginalia/index/results/IndexMetadataService.java b/code/index/java/nu/marginalia/index/results/IndexMetadataService.java index 977a87e7..ce23c3f2 100644 --- a/code/index/java/nu/marginalia/index/results/IndexMetadataService.java +++ b/code/index/java/nu/marginalia/index/results/IndexMetadataService.java @@ -43,6 +43,7 @@ public class IndexMetadataService { public QuerySearchTerms getSearchTerms(CompiledQuery compiledQuery, SearchQuery searchQuery) { LongArrayList termIdsList = new LongArrayList(); + LongArrayList termIdsPrio = new LongArrayList(); TObjectLongHashMap termToId = new TObjectLongHashMap<>(10, 0.75f, -1); @@ -52,8 +53,30 @@ public class IndexMetadataService { termToId.put(word, id); } + for (var term : searchQuery.searchTermsAdvice) { + if (termToId.containsKey(term)) { + continue; + } + + long id = SearchTermsUtil.getWordId(term); + termIdsList.add(id); + termToId.put(term, id); + } + + for (var term : searchQuery.searchTermsPriority) { + if (termToId.containsKey(term)) { + continue; + } + + long id = SearchTermsUtil.getWordId(term); + termIdsList.add(id); + termIdsPrio.add(id); + termToId.put(term, id); + } + return new QuerySearchTerms(termToId, new TermIdList(termIdsList), + new TermIdList(termIdsPrio), new TermCoherenceGroupList( searchQuery.searchTermCoherences.stream().map(TermCoherenceGroup::new).toList() ) diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java index 89b4c543..a9d6b4a6 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java @@ -52,7 +52,8 @@ public class IndexResultValuationContext { this.searchTerms = metadataService.getSearchTerms(params.compiledQuery, params.query); - this.termMetadataForCombinedDocumentIds = metadataService.getTermMetadataForDocuments(ids, searchTerms.termIdsAll); + this.termMetadataForCombinedDocumentIds = metadataService.getTermMetadataForDocuments(ids, + searchTerms.termIdsAll); } private final long flagsFilterMask = @@ -69,7 +70,10 @@ public class IndexResultValuationContext { long docMetadata = statefulIndex.getDocumentMetadata(docId); int htmlFeatures = statefulIndex.getHtmlFeatures(docId); - SearchResultItem searchResult = new SearchResultItem(docId, docMetadata, htmlFeatures); + SearchResultItem searchResult = new SearchResultItem(docId, + docMetadata, + htmlFeatures, + hasPrioTerm(combinedId)); long[] wordMetas = new long[compiledQuery.size()]; SearchResultKeywordScore[] scores = new SearchResultKeywordScore[compiledQuery.size()]; @@ -108,11 +112,24 @@ public class IndexResultValuationContext { 5000, // use a dummy value here as it's not present in the index rankingContext); + if (searchResult.hasPrioTerm) { + score = 0.75 * score; + } + searchResult.setScore(score); return searchResult; } + private boolean hasPrioTerm(long combinedId) { + for (var term : searchTerms.termIdsPrio.array()) { + if 
(termMetadataForCombinedDocumentIds.hasTermMeta(term, combinedId)) { + return true; + } + } + return false; + } + private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores, QueryStrategy queryStrategy) { diff --git a/code/index/java/nu/marginalia/index/results/model/QuerySearchTerms.java b/code/index/java/nu/marginalia/index/results/model/QuerySearchTerms.java index d72e0ea9..bbb7cf30 100644 --- a/code/index/java/nu/marginalia/index/results/model/QuerySearchTerms.java +++ b/code/index/java/nu/marginalia/index/results/model/QuerySearchTerms.java @@ -6,14 +6,17 @@ import nu.marginalia.index.results.model.ids.TermIdList; public class QuerySearchTerms { private final TObjectLongHashMap termToId; public final TermIdList termIdsAll; + public final TermIdList termIdsPrio; public final TermCoherenceGroupList coherences; public QuerySearchTerms(TObjectLongHashMap termToId, TermIdList termIdsAll, + TermIdList termIdsPrio, TermCoherenceGroupList coherences) { this.termToId = termToId; this.termIdsAll = termIdsAll; + this.termIdsPrio = termIdsPrio; this.coherences = coherences; } diff --git a/code/index/java/nu/marginalia/index/results/model/TermMetadataForCombinedDocumentIds.java b/code/index/java/nu/marginalia/index/results/model/TermMetadataForCombinedDocumentIds.java index 9068dd69..3ef2f7ab 100644 --- a/code/index/java/nu/marginalia/index/results/model/TermMetadataForCombinedDocumentIds.java +++ b/code/index/java/nu/marginalia/index/results/model/TermMetadataForCombinedDocumentIds.java @@ -18,12 +18,21 @@ public class TermMetadataForCombinedDocumentIds { public long getTermMetadata(long termId, long combinedId) { var metaByCombinedId = termdocToMeta.get(termId); if (metaByCombinedId == null) { - logger.warn("Missing meta for term {}", termId); return 0; } return metaByCombinedId.get(combinedId); } + public boolean hasTermMeta(long termId, long combinedId) { + var metaByCombinedId = termdocToMeta.get(termId); + + if (metaByCombinedId == null) { + return false; + } + + return metaByCombinedId.get(combinedId) != 0; + } + public record DocumentsWithMetadata(Long2LongOpenHashMap data) { public DocumentsWithMetadata(CombinedDocIdList combinedDocIdsAll, DocMetadataList metadata) { this(new Long2LongOpenHashMap(combinedDocIdsAll.array(), metadata.array())); diff --git a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java index c605a0a8..21f6312e 100644 --- a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java +++ b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java @@ -25,7 +25,7 @@ class IndexResultDomainDeduplicatorTest { } SearchResultItem forId(int domain, int ordinal) { - return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(), 4, Double.NaN); + return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(), 4, Double.NaN, false); } } \ No newline at end of file From 2353c73c5730cc414d6aad59b6188a5cba198d6b Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 16 Apr 2024 12:10:13 +0200 Subject: [PATCH 47/47] (encyclopedia) Index the full articles Previously, in an experimental change, only the first paragraph was indexed, intended to reduce the amount of noisy tangential hits. This was not a good idea, so the change is reverted. 
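The revert itself is a one-line deletion, shown in the hunk below: without the early break, every extracted paragraph part is wrapped and appended, so the whole article body ends up in the document that gets indexed. A minimal sketch of the restored loop, assuming the paragraph parts have already been extracted and the wrapping follows the hunk below:

    class ArticleBodySketch {
        // Build the article body from every paragraph part (no early break after the first one).
        static String renderBody(java.util.List<String> parts) {
            StringBuilder fullHtml = new StringBuilder();
            for (String part : parts) {
                fullHtml.append("<p>").append(part).append("</p>");
            }
            return fullHtml.toString();
        }
    }
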
--- .../encyclopedia/EncyclopediaMarginaliaNuSideloader.java | 1 - 1 file changed, 1 deletion(-) diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java index ca85455e..17c83250 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java @@ -125,7 +125,6 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC fullHtml.append("<p>"); fullHtml.append(part); fullHtml.append("</p>"); - break; // Only take the first part, this improves accuracy a lot } fullHtml.append("");