From 8c559c8121c4949bf97b7aab25e7df564ebe49a3 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 16 Apr 2024 12:37:18 +0200 Subject: [PATCH 01/90] (conf) Add additional logic for discovering system root --- .../config/java/nu/marginalia/WmsaHome.java | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/code/common/config/java/nu/marginalia/WmsaHome.java b/code/common/config/java/nu/marginalia/WmsaHome.java index b61ee4dd..eff2e1c4 100644 --- a/code/common/config/java/nu/marginalia/WmsaHome.java +++ b/code/common/config/java/nu/marginalia/WmsaHome.java @@ -10,7 +10,6 @@ import java.util.stream.Stream; public class WmsaHome { public static UserAgent getUserAgent() { - return new UserAgent( System.getProperty("crawler.userAgentString", "Mozilla/5.0 (compatible; Marginalia-like bot; +https://git.marginalia.nu/))"), System.getProperty("crawler.userAgentIdentifier", "search.marginalia.nu") @@ -40,7 +39,19 @@ public class WmsaHome { .findFirst(); if (retStr.isEmpty()) { - // Check if we are running in a test environment + // Check parent directories for a fingerprint of the project's installation boilerplate + var prodRoot = Stream.iterate(Paths.get("").toAbsolutePath(), f -> f != null && Files.exists(f), Path::getParent) + .filter(p -> Files.exists(p.resolve("conf/properties/system.properties"))) + .filter(p -> Files.exists(p.resolve("model/tfreq-new-algo3.bin"))) + .findAny(); + if (prodRoot.isPresent()) { + return prodRoot.get(); + } + + // Check if we are running in a test environment by looking for fingerprints + // matching the base of the source tree for the project, then looking up the + // run directory which contains a template for the installation we can use as + // though it's the project root for testing purposes var testRoot = Stream.iterate(Paths.get("").toAbsolutePath(), f -> f != null && Files.exists(f), Path::getParent) .filter(p -> Files.exists(p.resolve("run/env"))) @@ -50,8 +61,8 @@ public class WmsaHome { 
return testRoot.orElseThrow(() -> new IllegalStateException(""" Could not find $WMSA_HOME, either set environment - variable, the 'system.homePath' property, - or ensure either /wmssa or /var/lib/wmsa exists + variable, the 'system.homePath' java property, + or ensure either /wmsa or /var/lib/wmsa exists """)); } From d2658d6f84cde648655c1be6e87803c0a8c8b2ae Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 16 Apr 2024 13:25:15 +0200 Subject: [PATCH 02/90] (sys) Add springboard service that can spawn multiple different marginalia services to make distribution easier. --- .../single-service-runner/build.gradle | 33 +++++++++++++ .../java/nu/marginalia/SingleService.java | 49 +++++++++++++++++++ settings.gradle | 1 + 3 files changed, 83 insertions(+) create mode 100644 code/services-core/single-service-runner/build.gradle create mode 100644 code/services-core/single-service-runner/java/nu/marginalia/SingleService.java diff --git a/code/services-core/single-service-runner/build.gradle b/code/services-core/single-service-runner/build.gradle new file mode 100644 index 00000000..bec9eb54 --- /dev/null +++ b/code/services-core/single-service-runner/build.gradle @@ -0,0 +1,33 @@ +plugins { + id 'java' + id 'application' + id 'jvm-test-suite' +} + +application { + mainClass = 'nu.marginalia.SingleService' + applicationName = 'marginalia' +} + +tasks.distZip.enabled = false + +java { + toolchain { + languageVersion.set(JavaLanguageVersion.of(22)) + } +} + +apply from: "$rootProject.projectDir/srcsets.gradle" + +dependencies { + implementation project(':code:services-core:query-service') + implementation project(':code:services-core:index-service') + implementation project(':code:services-core:control-service') + implementation project(':code:services-core:executor-service') + + testImplementation libs.bundles.slf4j.test + testImplementation libs.bundles.junit + testImplementation libs.mockito + +} + diff --git 
a/code/services-core/single-service-runner/java/nu/marginalia/SingleService.java b/code/services-core/single-service-runner/java/nu/marginalia/SingleService.java new file mode 100644 index 00000000..5a793a0d --- /dev/null +++ b/code/services-core/single-service-runner/java/nu/marginalia/SingleService.java @@ -0,0 +1,49 @@ +package nu.marginalia; + +public class SingleService { + enum Service { + IndexService("index", "nu.marginalia.index.IndexMain"), + ControlService("control", "nu.marginalia.control.ControlMain"), + ExecutorService("executor", "nu.marginalia.executor.ExecutorMain"), + QueryService("query", "nu.marginalia.query.QueryMain"), + ; + + public final String name; + public final String className; + + Service(String name, String className) { + this.name = name; + this.className = className; + } + + /** Call the main method of the service class */ + public void run(String[] args) { + try { + // Use reflection to call the main method of the service class to avoid + // loading all classes at startup time, which would invoke a bunch of contradictory + // static initializers + + Class clazz = Class.forName(className); + clazz.getMethod("main", String[].class).invoke(null, (Object) args); + } catch (Exception e) { + e.printStackTrace(); + } + } + } + + public static void main(String... 
args) { + if (args.length == 0) { + System.out.println("Usage: SingleService [args...]"); + } + + String serviceName = args[0]; + String[] serviceArgs = new String[args.length - 1]; + System.arraycopy(args, 1, serviceArgs, 0, serviceArgs.length); + + for (var service : Service.values()) { + if (service.name.equals(serviceName)) { + service.run(serviceArgs); + } + } + } +} diff --git a/settings.gradle b/settings.gradle index cfee1a8b..6571020c 100644 --- a/settings.gradle +++ b/settings.gradle @@ -5,6 +5,7 @@ include 'code:services-core:assistant-service' include 'code:services-core:control-service' include 'code:services-core:query-service' include 'code:services-core:executor-service' +include 'code:services-core:single-service-runner' include 'code:services-application:search-service' include 'code:services-application:api-service' From f434a8b492fbcacecd5cb96c589454e166403845 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 16 Apr 2024 15:25:23 +0200 Subject: [PATCH 03/90] (build) Upgrade jib plugin version --- build.gradle | 2 +- code/services-application/api-service/build.gradle | 2 +- code/services-application/dating-service/build.gradle | 2 +- code/services-application/explorer-service/build.gradle | 2 +- code/services-application/search-service/build.gradle | 2 +- code/services-core/assistant-service/build.gradle | 2 +- code/services-core/control-service/build.gradle | 2 +- code/services-core/executor-service/build.gradle | 4 +--- code/services-core/index-service/build.gradle | 2 +- code/services-core/query-service/build.gradle | 2 +- code/tools/screenshot-capture-tool/build.gradle | 2 +- 11 files changed, 11 insertions(+), 13 deletions(-) diff --git a/build.gradle b/build.gradle index 9559cfc2..2b35ee4e 100644 --- a/build.gradle +++ b/build.gradle @@ -6,7 +6,7 @@ plugins { // This is a workaround for a bug in the Jib plugin that causes it to stall randomly // https://github.com/GoogleContainerTools/jib/issues/3347 - id 'com.google.cloud.tools.jib' 
version '3.4.1' apply(false) + id 'com.google.cloud.tools.jib' version '3.4.2' apply(false) } group 'marginalia' diff --git a/code/services-application/api-service/build.gradle b/code/services-application/api-service/build.gradle index 9fa51a9f..cb851a67 100644 --- a/code/services-application/api-service/build.gradle +++ b/code/services-application/api-service/build.gradle @@ -3,7 +3,7 @@ plugins { id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.1' + id 'com.google.cloud.tools.jib' version '3.4.2' } java { diff --git a/code/services-application/dating-service/build.gradle b/code/services-application/dating-service/build.gradle index a8cfd6e1..b574c1f8 100644 --- a/code/services-application/dating-service/build.gradle +++ b/code/services-application/dating-service/build.gradle @@ -3,7 +3,7 @@ plugins { id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.1' + id 'com.google.cloud.tools.jib' version '3.4.2' } application { diff --git a/code/services-application/explorer-service/build.gradle b/code/services-application/explorer-service/build.gradle index da7e8a2e..cbea1f2c 100644 --- a/code/services-application/explorer-service/build.gradle +++ b/code/services-application/explorer-service/build.gradle @@ -3,7 +3,7 @@ plugins { id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.1' + id 'com.google.cloud.tools.jib' version '3.4.2' } application { diff --git a/code/services-application/search-service/build.gradle b/code/services-application/search-service/build.gradle index d1a64e2e..54622609 100644 --- a/code/services-application/search-service/build.gradle +++ b/code/services-application/search-service/build.gradle @@ -5,7 +5,7 @@ plugins { id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.1' + id 'com.google.cloud.tools.jib' version '3.4.2' } application { diff --git a/code/services-core/assistant-service/build.gradle 
b/code/services-core/assistant-service/build.gradle index 3f51937f..d1550bcb 100644 --- a/code/services-core/assistant-service/build.gradle +++ b/code/services-core/assistant-service/build.gradle @@ -3,7 +3,7 @@ plugins { id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.1' + id 'com.google.cloud.tools.jib' version '3.4.2' } application { diff --git a/code/services-core/control-service/build.gradle b/code/services-core/control-service/build.gradle index 787e3740..56c2be91 100644 --- a/code/services-core/control-service/build.gradle +++ b/code/services-core/control-service/build.gradle @@ -2,7 +2,7 @@ plugins { id 'java' id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.1' + id 'com.google.cloud.tools.jib' version '3.4.2' } java { diff --git a/code/services-core/executor-service/build.gradle b/code/services-core/executor-service/build.gradle index 08d80ff5..77b41a9e 100644 --- a/code/services-core/executor-service/build.gradle +++ b/code/services-core/executor-service/build.gradle @@ -3,7 +3,7 @@ plugins { id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.1' + id 'com.google.cloud.tools.jib' version '3.4.2' } application { @@ -13,8 +13,6 @@ application { tasks.distZip.enabled = false - - java { toolchain { languageVersion.set(JavaLanguageVersion.of(22)) diff --git a/code/services-core/index-service/build.gradle b/code/services-core/index-service/build.gradle index 63b2ca5a..8a07c91a 100644 --- a/code/services-core/index-service/build.gradle +++ b/code/services-core/index-service/build.gradle @@ -3,7 +3,7 @@ plugins { id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.1' + id 'com.google.cloud.tools.jib' version '3.4.2' } application { diff --git a/code/services-core/query-service/build.gradle b/code/services-core/query-service/build.gradle index 7e80ac81..11f159bc 100644 --- a/code/services-core/query-service/build.gradle +++ 
b/code/services-core/query-service/build.gradle @@ -3,7 +3,7 @@ plugins { id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.1' + id 'com.google.cloud.tools.jib' version '3.4.2' } application { diff --git a/code/tools/screenshot-capture-tool/build.gradle b/code/tools/screenshot-capture-tool/build.gradle index 27b7ee89..e2579be7 100644 --- a/code/tools/screenshot-capture-tool/build.gradle +++ b/code/tools/screenshot-capture-tool/build.gradle @@ -3,7 +3,7 @@ plugins { id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.1' + id 'com.google.cloud.tools.jib' version '3.4.2' } java { From cb82927756f26789a0b256a4807fd5a0417e9bb8 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 12 Mar 2024 13:12:50 +0100 Subject: [PATCH 04/90] (WIP) Implement first take of new query segmentation algorithm --- code/functions/search-query/build.gradle | 5 + .../segmentation/BasicSentenceExtractor.java | 16 ++ .../searchquery/segmentation/HasherGroup.java | 61 +++++++ .../segmentation/NgramExporterMain.java | 46 +++++ .../segmentation/NgramExtractorMain.java | 113 ++++++++++++ .../segmentation/NgramLexicon.java | 165 ++++++++++++++++++ .../segmentation/HasherGroupTest.java | 33 ++++ .../segmentation/NgramLexiconTest.java | 53 ++++++ 8 files changed, 492 insertions(+) create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/BasicSentenceExtractor.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/HasherGroup.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExporterMain.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java create mode 100644 
code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/HasherGroupTest.java create mode 100644 code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/NgramLexiconTest.java diff --git a/code/functions/search-query/build.gradle b/code/functions/search-query/build.gradle index dc1f9c4c..7b792b48 100644 --- a/code/functions/search-query/build.gradle +++ b/code/functions/search-query/build.gradle @@ -26,6 +26,9 @@ dependencies { implementation project(':code:libraries:term-frequency-dict') implementation project(':third-party:porterstemmer') + implementation project(':third-party:openzim') + implementation project(':third-party:commons-codec') + implementation project(':code:libraries:language-processing') implementation project(':code:libraries:term-frequency-dict') implementation project(':code:features-convert:keyword-extraction') @@ -36,6 +39,8 @@ dependencies { implementation libs.bundles.grpc implementation libs.notnull implementation libs.guice + implementation libs.jsoup + implementation libs.commons.lang3 implementation libs.trove implementation libs.fastutil implementation libs.bundles.gson diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/BasicSentenceExtractor.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/BasicSentenceExtractor.java new file mode 100644 index 00000000..e65c243d --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/BasicSentenceExtractor.java @@ -0,0 +1,16 @@ +package nu.marginalia.functions.searchquery.segmentation; + +import ca.rmen.porterstemmer.PorterStemmer; +import org.apache.commons.lang3.StringUtils; + +public class BasicSentenceExtractor { + + private static PorterStemmer porterStemmer = new PorterStemmer(); + public static String[] getStemmedParts(String sentence) { + String[] parts = StringUtils.split(sentence, ' '); + for (int i = 0; i < 
parts.length; i++) { + parts[i] = porterStemmer.stemWord(parts[i]); + } + return parts; + } +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/HasherGroup.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/HasherGroup.java new file mode 100644 index 00000000..60bbb4dd --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/HasherGroup.java @@ -0,0 +1,61 @@ +package nu.marginalia.functions.searchquery.segmentation; + +import nu.marginalia.hash.MurmurHash3_128; + +/** A group of hash functions that can be used to hash a sequence of strings, + * that also has an inverse operation that can be used to remove a previously applied + * string from the sequence. */ +sealed interface HasherGroup { + /** Apply a hash to the accumulator */ + long apply(long acc, long add); + + /** Remove a hash that was added n operations ago from the accumulator, add a new one */ + long replace(long acc, long add, long rem, int n); + + /** Create a new hasher group that preserves the order of appleid hash functions */ + static HasherGroup ordered() { + return new OrderedHasher(); + } + + /** Create a new hasher group that does not preserve the order of applied hash functions */ + static HasherGroup unordered() { + return new UnorderedHasher(); + } + + /** Bake the words in the sentence into a hash successively using the group's apply function */ + default long rollingHash(String[] parts) { + long code = 0; + for (String part : parts) { + code = apply(code, hash(part)); + } + return code; + } + + MurmurHash3_128 hash = new MurmurHash3_128(); + /** Calculate the hash of a string */ + static long hash(String term) { + return hash.hashNearlyASCII(term); + } + + final class UnorderedHasher implements HasherGroup { + + public long apply(long acc, long add) { + return acc ^ add; + } + + public long replace(long acc, long add, long rem, int n) { + return acc ^ rem ^ add; 
+ } + } + + final class OrderedHasher implements HasherGroup { + + public long apply(long acc, long add) { + return Long.rotateLeft(acc, 1) ^ add; + } + + public long replace(long acc, long add, long rem, int n) { + return Long.rotateLeft(acc, 1) ^ add ^ Long.rotateLeft(rem, n); + } + } +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExporterMain.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExporterMain.java new file mode 100644 index 00000000..087345f6 --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExporterMain.java @@ -0,0 +1,46 @@ +package nu.marginalia.functions.searchquery.segmentation; + +import nu.marginalia.WmsaHome; +import nu.marginalia.language.sentence.SentenceExtractor; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.Scanner; + +public class NgramExporterMain { + + public static void main(String... 
args) throws IOException { + trial(); + } + + static void trial() throws IOException { + SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); + + NgramLexicon lexicon = new NgramLexicon(); + lexicon.loadCounts(Path.of("/home/vlofgren/ngram-counts.bin")); + + System.out.println("Loaded!"); + + var scanner = new Scanner(System.in); + for (;;) { + System.out.println("Enter a sentence: "); + String line = scanner.nextLine(); + System.out.println("."); + if (line == null) + break; + + String[] terms = BasicSentenceExtractor.getStemmedParts(line); + System.out.println("."); + + for (int i = 2; i< 8; i++) { + lexicon.findSegments(i, terms).forEach(p -> { + System.out.println(STR."\{Arrays.toString(p.project(terms))}: \{p.count()}"); + }); + } + + } + } + + +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java new file mode 100644 index 00000000..0339b2c1 --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java @@ -0,0 +1,113 @@ +package nu.marginalia.functions.searchquery.segmentation; + +import it.unimi.dsi.fastutil.longs.*; +import nu.marginalia.hash.MurmurHash3_128; +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.openzim.ZIMTypes.ZIMFile; +import org.openzim.ZIMTypes.ZIMReader; + +import java.io.IOException; +import java.io.PrintWriter; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.Executors; + +public class NgramExtractorMain { + static MurmurHash3_128 hash = new MurmurHash3_128(); + + public static void main(String... 
args) { + } + + private static List getNgramTerms(Document document) { + List terms = new ArrayList<>(); + + document.select("a[href]").forEach(e -> { + var href = e.attr("href"); + if (href.contains(":")) + return; + if (href.contains("/")) + return; + + var text = e.text().toLowerCase(); + if (!text.contains(" ")) + return; + + terms.add(text); + }); + + return terms; + } + + public static void dumpNgramsList( + Path zimFile, + Path ngramFile + ) throws IOException, InterruptedException { + ZIMReader reader = new ZIMReader(new ZIMFile(zimFile.toString())); + + PrintWriter printWriter = new PrintWriter(Files.newOutputStream(ngramFile, + StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE)); + + LongOpenHashSet known = new LongOpenHashSet(); + + try (var executor = Executors.newWorkStealingPool()) { + reader.forEachArticles((title, body) -> { + executor.submit(() -> { + var terms = getNgramTerms(Jsoup.parse(body)); + synchronized (known) { + for (String term : terms) { + if (known.add(hash.hashNearlyASCII(term))) { + printWriter.println(term); + } + } + } + }); + + }, p -> true); + } + printWriter.close(); + } + + public static void dumpCounts(Path zimInputFile, + Path countsOutputFile) throws IOException, InterruptedException + { + ZIMReader reader = new ZIMReader(new ZIMFile(zimInputFile.toString())); + + NgramLexicon lexicon = new NgramLexicon(); + + var orderedHasher = HasherGroup.ordered(); + var unorderedHasher = HasherGroup.unordered(); + + try (var executor = Executors.newWorkStealingPool()) { + reader.forEachArticles((title, body) -> { + executor.submit(() -> { + LongArrayList orderedHashes = new LongArrayList(); + LongArrayList unorderedHashes = new LongArrayList(); + + for (var sent : getNgramTerms(Jsoup.parse(body))) { + String[] terms = BasicSentenceExtractor.getStemmedParts(sent); + + orderedHashes.add(orderedHasher.rollingHash(terms)); + unorderedHashes.add(unorderedHasher.rollingHash(terms)); + } + + 
synchronized (lexicon) { + for (var hash : orderedHashes) { + lexicon.incOrdered(hash); + } + for (var hash : unorderedHashes) { + lexicon.addUnordered(hash); + } + } + }); + + }, p -> true); + } + + lexicon.saveCounts(countsOutputFile); + } + +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java new file mode 100644 index 00000000..948347bf --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java @@ -0,0 +1,165 @@ +package nu.marginalia.functions.searchquery.segmentation; + +import it.unimi.dsi.fastutil.longs.Long2IntOpenCustomHashMap; +import it.unimi.dsi.fastutil.longs.LongHash; +import it.unimi.dsi.fastutil.longs.LongOpenHashSet; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +class NgramLexicon { + private final Long2IntOpenCustomHashMap counts = new Long2IntOpenCustomHashMap( + 100_000_000, + new KeyIsAlreadyHashStrategy() + ); + private final LongOpenHashSet permutations = new LongOpenHashSet(); + + private static final HasherGroup orderedHasher = HasherGroup.ordered(); + private static final HasherGroup unorderedHasher = HasherGroup.unordered(); + + public List findSegments(int length, String... 
parts) { + // Don't look for ngrams longer than the sentence + if (parts.length < length) return List.of(); + + List positions = new ArrayList<>(); + + // Hash the parts + long[] hashes = new long[parts.length]; + for (int i = 0; i < hashes.length; i++) { + hashes[i] = HasherGroup.hash(parts[i]); + } + + long ordered = 0; + long unordered = 0; + int i = 0; + + // Prepare by combining up to length hashes + for (; i < length; i++) { + ordered = orderedHasher.apply(ordered, hashes[i]); + unordered = unorderedHasher.apply(unordered, hashes[i]); + } + + // Slide the window and look for matches + for (;; i++) { + int ct = counts.get(ordered); + + if (ct > 0) { + positions.add(new SentenceSegment(i - length, length, ct, PositionType.NGRAM)); + } + else if (permutations.contains(unordered)) { + positions.add(new SentenceSegment(i - length, length, 0, PositionType.PERMUTATION)); + } + + if (i >= hashes.length) + break; + + // Remove the oldest hash and add the new one + ordered = orderedHasher.replace(ordered, + hashes[i], + hashes[i - length], + length); + unordered = unorderedHasher.replace(unordered, + hashes[i], + hashes[i - length], + length); + } + + return positions; + } + + public void incOrdered(long hashOrdered) { + counts.addTo(hashOrdered, 1); + } + public void addUnordered(long hashUnordered) { + permutations.add(hashUnordered); + } + + public void loadCounts(Path path) throws IOException { + try (var dis = new DataInputStream(Files.newInputStream(path))) { + long size = dis.readInt(); + + for (int i = 0; i < size; i++) { + counts.put(dis.readLong(), dis.readInt()); + } + } + } + + public void loadPermutations(Path path) throws IOException { + try (var dis = new DataInputStream(Files.newInputStream(path))) { + long size = dis.readInt(); + + for (int i = 0; i < size; i++) { + permutations.add(dis.readLong()); + } + } + } + + public void saveCounts(Path file) throws IOException { + try (var dos = new DataOutputStream(Files.newOutputStream(file, + 
StandardOpenOption.CREATE, + StandardOpenOption.TRUNCATE_EXISTING, + StandardOpenOption.WRITE))) { + dos.writeInt(counts.size()); + + counts.forEach((k, v) -> { + try { + dos.writeLong(k); + dos.writeInt(v); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + } + } + public void savePermutations(Path file) throws IOException { + try (var dos = new DataOutputStream(Files.newOutputStream(file, + StandardOpenOption.CREATE, + StandardOpenOption.TRUNCATE_EXISTING, + StandardOpenOption.WRITE))) { + dos.writeInt(counts.size()); + + permutations.forEach(v -> { + try { + dos.writeLong(v); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + } + } + public void clear() { + permutations.clear(); + counts.clear(); + } + + public record SentenceSegment(int start, int length, int count, PositionType type) { + public String[] project(String... parts) { + return Arrays.copyOfRange(parts, start, start + length); + } + } + + enum PositionType { + NGRAM, PERMUTATION + } + + private static class KeyIsAlreadyHashStrategy implements LongHash.Strategy { + @Override + public int hashCode(long l) { + return (int) l; + } + + @Override + public boolean equals(long l, long l1) { + return l == l1; + } + } + +} + diff --git a/code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/HasherGroupTest.java b/code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/HasherGroupTest.java new file mode 100644 index 00000000..174bd553 --- /dev/null +++ b/code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/HasherGroupTest.java @@ -0,0 +1,33 @@ +package nu.marginalia.functions.searchquery.segmentation; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +class HasherGroupTest { + + @Test + void ordered() { + long a = 5; + long b = 3; + long c = 2; + + var group = HasherGroup.ordered(); + assertNotEquals(group.apply(a, b), group.apply(b, a)); + 
assertEquals(group.apply(b,c), group.replace(group.apply(a, b), c, a, 2)); + } + + @Test + void unordered() { + long a = 5; + long b = 3; + long c = 2; + + var group = HasherGroup.unordered(); + + assertEquals(group.apply(a, b), group.apply(b, a)); + assertEquals(group.apply(b, c), group.replace(group.apply(a, b), c, a, 2)); + } + + +} diff --git a/code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/NgramLexiconTest.java b/code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/NgramLexiconTest.java new file mode 100644 index 00000000..28b9ef2f --- /dev/null +++ b/code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/NgramLexiconTest.java @@ -0,0 +1,53 @@ +package nu.marginalia.functions.searchquery.segmentation; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +class NgramLexiconTest { + NgramLexicon lexicon = new NgramLexicon(); + @BeforeEach + public void setUp() { + lexicon.clear(); + } + + void addNgram(String... 
ngram) { + lexicon.incOrdered(HasherGroup.ordered().rollingHash(ngram)); + lexicon.addUnordered(HasherGroup.unordered().rollingHash(ngram)); + } + + @Test + void findSegments() { + addNgram("hello", "world"); + addNgram("rye", "bread"); + addNgram("rye", "world"); + + String[] sent = { "hello", "world", "rye", "bread" }; + var segments = lexicon.findSegments(2, "hello", "world", "rye", "bread"); + + assertEquals(3, segments.size()); + + for (int i = 0; i < 3; i++) { + var segment = segments.get(i); + switch (i) { + case 0 -> { + assertArrayEquals(new String[]{"hello", "world"}, segment.project(sent)); + assertEquals(1, segment.count()); + assertEquals(NgramLexicon.PositionType.NGRAM, segment.type()); + } + case 1 -> { + assertArrayEquals(new String[]{"world", "rye"}, segment.project(sent)); + assertEquals(0, segment.count()); + assertEquals(NgramLexicon.PositionType.PERMUTATION, segment.type()); + } + case 2 -> { + assertArrayEquals(new String[]{"rye", "bread"}, segment.project(sent)); + assertEquals(1, segment.count()); + assertEquals(NgramLexicon.PositionType.NGRAM, segment.type()); + } + } + } + + } +} \ No newline at end of file From 04879c005dcf0ab6c2cfc2bd5305992c44e37f70 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 18 Mar 2024 13:16:00 +0100 Subject: [PATCH 05/90] (WIP) Improve data extraction from wikipedia data --- .../segmentation/NgramExtractorMain.java | 54 +++++++++++++++++-- .../segmentation/NgramLexicon.java | 2 +- 2 files changed, 51 insertions(+), 5 deletions(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java index 0339b2c1..4cd4b296 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java 
@@ -22,9 +22,15 @@ public class NgramExtractorMain { public static void main(String... args) { } - private static List getNgramTerms(Document document) { + private static List getNgramTerms(String title, Document document) { List terms = new ArrayList<>(); + // Add the title + if (title.contains(" ")) { + terms.add(title.toLowerCase()); + } + + // Grab all internal links document.select("a[href]").forEach(e -> { var href = e.attr("href"); if (href.contains(":")) @@ -39,6 +45,43 @@ public class NgramExtractorMain { terms.add(text); }); + // Grab all italicized text + document.getElementsByTag("i").forEach(e -> { + var text = e.text().toLowerCase(); + if (!text.contains(" ")) + return; + + terms.add(text); + }); + + // Trim the discovered terms + terms.replaceAll(s -> { + + // Remove trailing parentheses and their contents + if (s.endsWith(")")) { + int idx = s.lastIndexOf('('); + if (idx > 0) { + return s.substring(0, idx).trim(); + } + } + + // Remove leading "list of " + if (s.startsWith("list of ")) { + return s.substring("list of ".length()); + } + + return s; + }); + + // Remove terms that are too short or too long + terms.removeIf(s -> { + if (!s.contains(" ")) + return true; + if (s.length() > 64) + return true; + return false; + }); + return terms; } @@ -56,7 +99,7 @@ public class NgramExtractorMain { try (var executor = Executors.newWorkStealingPool()) { reader.forEachArticles((title, body) -> { executor.submit(() -> { - var terms = getNgramTerms(Jsoup.parse(body)); + var terms = getNgramTerms(title, Jsoup.parse(body)); synchronized (known) { for (String term : terms) { if (known.add(hash.hashNearlyASCII(term))) { @@ -72,7 +115,9 @@ public class NgramExtractorMain { } public static void dumpCounts(Path zimInputFile, - Path countsOutputFile) throws IOException, InterruptedException + Path countsOutputFile, + Path permutationsOutputFile + ) throws IOException, InterruptedException { ZIMReader reader = new ZIMReader(new ZIMFile(zimInputFile.toString())); @@ 
-87,7 +132,7 @@ public class NgramExtractorMain { LongArrayList orderedHashes = new LongArrayList(); LongArrayList unorderedHashes = new LongArrayList(); - for (var sent : getNgramTerms(Jsoup.parse(body))) { + for (var sent : getNgramTerms(title, Jsoup.parse(body))) { String[] terms = BasicSentenceExtractor.getStemmedParts(sent); orderedHashes.add(orderedHasher.rollingHash(terms)); @@ -108,6 +153,7 @@ public class NgramExtractorMain { } lexicon.saveCounts(countsOutputFile); + lexicon.savePermutations(permutationsOutputFile); } } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java index 948347bf..f8044e12 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java @@ -14,7 +14,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -class NgramLexicon { +public class NgramLexicon { private final Long2IntOpenCustomHashMap counts = new Long2IntOpenCustomHashMap( 100_000_000, new KeyIsAlreadyHashStrategy() From 760b80659d96308b61d65f99ef498d306d99dcf1 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 18 Mar 2024 13:16:49 +0100 Subject: [PATCH 06/90] (WIP) Partial integration of new query expansion code into the query-serivice --- .../query_parser/QueryVariants.java | 187 +------------- .../variant/ExpansionStrategy.java | 7 + .../query_parser/variant/QueryExpansion.java | 111 ++++++++ .../query_parser/variant/QueryVariant.java | 17 ++ .../query_parser/variant/QueryVariantSet.java | 21 ++ .../query_parser/variant/QueryWord.java | 10 + .../query_parser/variant/VariantStrategy.java | 8 + .../query_parser/variant/model/QWord.java | 47 ++++ .../variant/model/QWordGraph.java | 236 ++++++++++++++++++ 
.../variant/strategy/CombineDashes.java | 40 +++ .../variant/strategy/JoinTerms.java | 58 +++++ .../variant/strategy/SplitWordNum.java | 65 +++++ .../searchquery/svc/QueryFactory.java | 11 +- .../variant/model/QWordGraphTest.java | 33 +++ .../query/svc/QueryFactoryTest.java | 3 +- 15 files changed, 666 insertions(+), 188 deletions(-) create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/ExpansionStrategy.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariant.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariantSet.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryWord.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/VariantStrategy.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWord.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraph.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/CombineDashes.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/JoinTerms.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/SplitWordNum.java create mode 100644 code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraphTest.java diff --git 
a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryVariants.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryVariants.java index 9732e53f..10648486 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryVariants.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryVariants.java @@ -1,17 +1,14 @@ package nu.marginalia.functions.searchquery.query_parser; -import ca.rmen.porterstemmer.PorterStemmer; -import lombok.AllArgsConstructor; -import lombok.EqualsAndHashCode; -import lombok.Getter; -import lombok.ToString; import nu.marginalia.functions.searchquery.query_parser.token.Token; import nu.marginalia.functions.searchquery.query_parser.token.TokenType; +import nu.marginalia.functions.searchquery.query_parser.variant.QueryVariant; +import nu.marginalia.functions.searchquery.query_parser.variant.QueryVariantSet; +import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord; import nu.marginalia.util.language.EnglishDictionary; import nu.marginalia.LanguageModels; import nu.marginalia.keyword.KeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; -import nu.marginalia.util.ngrams.NGramBloomFilter; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.language.model.DocumentSentence; import nu.marginalia.language.model.WordSpan; @@ -22,17 +19,13 @@ import java.util.regex.Pattern; public class QueryVariants { private final KeywordExtractor keywordExtractor; private final TermFrequencyDict dict; - private final PorterStemmer ps = new PorterStemmer(); - private final NGramBloomFilter nGramBloomFilter; private final EnglishDictionary englishDictionary; private final ThreadLocal sentenceExtractor; public QueryVariants(LanguageModels lm, TermFrequencyDict dict, - NGramBloomFilter nGramBloomFilter, EnglishDictionary englishDictionary) { - 
this.nGramBloomFilter = nGramBloomFilter; this.englishDictionary = englishDictionary; this.keywordExtractor = new KeywordExtractor(); this.sentenceExtractor = ThreadLocal.withInitial(() -> new SentenceExtractor(lm)); @@ -40,33 +33,6 @@ public class QueryVariants { } - final Pattern numWordBoundary = Pattern.compile("[0-9][a-zA-Z]|[a-zA-Z][0-9]"); - final Pattern dashBoundary = Pattern.compile("-"); - - @AllArgsConstructor - private static class Word { - public final String stemmed; - public final String word; - public final String wordOriginal; - } - - @AllArgsConstructor @Getter @ToString @EqualsAndHashCode - public static class QueryVariant { - public final List terms; - public final double value; - } - - @Getter @ToString - public static class QueryVariantSet { - final List faithful = new ArrayList<>(); - final List alternative = new ArrayList<>(); - - final List nonLiterals = new ArrayList<>(); - - public boolean isEmpty() { - return faithful.isEmpty() && alternative.isEmpty() && nonLiterals.isEmpty(); - } - } public QueryVariantSet getQueryVariants(List query) { final JoinedQueryAndNonLiteralTokens joinedQuery = joinQuery(query); @@ -108,19 +74,11 @@ public class QueryVariants { byStart.put(0, elongatedFirstWords); } - final List> goodSpans = getWordSpans(byStart, sentence, livingSpans); + final List> goodSpans = getWordSpans(byStart, sentence, livingSpans); List> faithfulQueries = new ArrayList<>(); List> alternativeQueries = new ArrayList<>(); - for (var ls : goodSpans) { - faithfulQueries.addAll(createTokens(ls)); - } - - for (var span : goodSpans) { - alternativeQueries.addAll(joinTerms(span)); - } - for (var ls : goodSpans) { var last = ls.get(ls.size() - 1); @@ -174,105 +132,8 @@ public class QueryVariants { return ret; } - private Collection> createTokens(List ls) { - List asTokens = new ArrayList<>(); - List> ret = new ArrayList<>(); - - - boolean dash = false; - boolean num = false; - - for (var span : ls) { - dash |= 
dashBoundary.matcher(span.word).find(); - num |= numWordBoundary.matcher(span.word).find(); - if (ls.size() == 1 || !isOmittableWord(span.word)) { - asTokens.add(span.word); - } - } - ret.add(asTokens); - - if (dash) { - ret.addAll(combineDashWords(ls)); - } - - if (num) { - ret.addAll(splitWordNum(ls)); - } - - return ret; - } - - private boolean isOmittableWord(String word) { - return switch (word) { - case "vs", "or", "and", "versus", "is", "the", "why", "when", "if", "who", "are", "am" -> true; - default -> false; - }; - } - - private Collection> splitWordNum(List ls) { - List asTokens2 = new ArrayList<>(); - - boolean num = false; - - for (var span : ls) { - var wordMatcher = numWordBoundary.matcher(span.word); - var stemmedMatcher = numWordBoundary.matcher(span.stemmed); - - int ws = 0; - int ss = 0; - boolean didSplit = false; - while (wordMatcher.find(ws) && stemmedMatcher.find(ss)) { - ws = wordMatcher.start()+1; - ss = stemmedMatcher.start()+1; - if (nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "_")) - || nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "-"))) - { - String combined = splitAtNumBoundary(span.word, wordMatcher.start(), "_"); - asTokens2.add(combined); - didSplit = true; - num = true; - } - } - - if (!didSplit) { - asTokens2.add(span.word); - } - } - - if (num) { - return List.of(asTokens2); - } - return Collections.emptyList(); - } - - private Collection> combineDashWords(List ls) { - List asTokens2 = new ArrayList<>(); - boolean dash = false; - - for (var span : ls) { - var matcher = dashBoundary.matcher(span.word); - if (matcher.find() && nGramBloomFilter.isKnownNGram(ps.stemWord(dashBoundary.matcher(span.word).replaceAll("")))) { - dash = true; - String combined = dashBoundary.matcher(span.word).replaceAll(""); - asTokens2.add(combined); - } - else { - asTokens2.add(span.word); - } - } - - if (dash) { - return List.of(asTokens2); - } - return 
Collections.emptyList(); - } - - private String splitAtNumBoundary(String in, int splitPoint, String joiner) { - return in.substring(0, splitPoint+1) + joiner + in.substring(splitPoint+1); - } - - private List> getWordSpans(TreeMap> byStart, DocumentSentence sentence, List> livingSpans) { - List> goodSpans = new ArrayList<>(); + private List> getWordSpans(TreeMap> byStart, DocumentSentence sentence, List> livingSpans) { + List> goodSpans = new ArrayList<>(); for (int i = 0; i < 1; i++) { var spans = byStart.get(i); @@ -298,9 +159,9 @@ public class QueryVariants { int end = span.get(span.size()-1).end; if (end == sentence.length()) { - var gs = new ArrayList(span.size()); + var gs = new ArrayList(span.size()); for (var s : span) { - gs.add(new Word(sentence.constructStemmedWordFromSpan(s), sentence.constructWordFromSpan(s), + gs.add(new QueryWord(sentence.constructStemmedWordFromSpan(s), sentence.constructWordFromSpan(s), s.size() == 1 ? sentence.words[s.start] : "")); } goodSpans.add(gs); @@ -325,38 +186,6 @@ public class QueryVariants { return goodSpans; } - private List> joinTerms(List span) { - List> ret = new ArrayList<>(); - - for (int i = 0; i < span.size()-1; i++) { - var a = span.get(i); - var b = span.get(i+1); - - var stemmed = ps.stemWord(a.word + b.word); - - double scoreCombo = dict.getTermFreqStemmed(stemmed); - if (scoreCombo > 10000) { - List asTokens = new ArrayList<>(); - - for (int j = 0; j < i; j++) { - var word = span.get(j).word; - asTokens.add(word); - } - { - var word = a.word + b.word; - asTokens.add(word); - } - for (int j = i+2; j < span.size(); j++) { - var word = span.get(j).word; - asTokens.add(word); - } - - ret.add(asTokens); - } - } - - return ret; - } private JoinedQueryAndNonLiteralTokens joinQuery(List query) { StringJoiner s = new StringJoiner(" "); diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/ExpansionStrategy.java 
b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/ExpansionStrategy.java new file mode 100644 index 00000000..18987aea --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/ExpansionStrategy.java @@ -0,0 +1,7 @@ +package nu.marginalia.functions.searchquery.query_parser.variant; + +import nu.marginalia.functions.searchquery.query_parser.variant.model.QWordGraph; + +public interface ExpansionStrategy { + void expand(QWordGraph graph); +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java new file mode 100644 index 00000000..faac81d4 --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java @@ -0,0 +1,111 @@ +package nu.marginalia.functions.searchquery.query_parser.variant; + +import ca.rmen.porterstemmer.PorterStemmer; +import nu.marginalia.functions.searchquery.query_parser.variant.model.QWord; +import nu.marginalia.functions.searchquery.query_parser.variant.model.QWordGraph; +import nu.marginalia.functions.searchquery.segmentation.NgramLexicon; +import nu.marginalia.term_frequency_dict.TermFrequencyDict; +import org.apache.commons.lang3.StringUtils; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Pattern; + +public class QueryExpansion { + private static final PorterStemmer ps = new PorterStemmer(); + private final TermFrequencyDict dict; + private final NgramLexicon lexicon; + List expansionStrategies = List.of( + this::joinDashes, + this::splitWordNum, + this::joinTerms, + this::createSegments + ); + + public QueryExpansion(TermFrequencyDict dict, + NgramLexicon lexicon + ) { + this.dict = dict; + this.lexicon = lexicon; + } + + public QWordGraph expandQuery(List words) { + + 
QWordGraph graph = new QWordGraph(words); + + for (var strategy : expansionStrategies) { + strategy.expand(graph); + } + + return null; + } + + private static final Pattern dashPattern = Pattern.compile("-"); + private static final Pattern numWordBoundary = Pattern.compile("[0-9][a-zA-Z]|[a-zA-Z][0-9]"); + + // Turn 'lawn-chair' into 'lawnchair' + public void joinDashes(QWordGraph graph) { + for (var qw : graph) { + if (qw.word().contains("-")) { + var joined = StringUtils.join(dashPattern.split(qw.word())); + graph.addVariant(qw, joined); + } + } + } + + + // Turn 'MP3' into 'MP-3' + public void splitWordNum(QWordGraph graph) { + for (var qw : graph) { + var matcher = numWordBoundary.matcher(qw.word()); + if (matcher.matches()) { + var joined = StringUtils.join(dashPattern.split(qw.word()), '-'); + graph.addVariant(qw, joined); + } + } + } + + // Turn 'lawn chair' into 'lawnchair' + public void joinTerms(QWordGraph graph) { + QWord prev = null; + + for (var qw : graph) { + if (prev != null) { + var joinedWord = prev.word() + qw.word(); + var joinedStemmed = ps.stemWord(joinedWord); + + var scoreA = dict.getTermFreqStemmed(prev.stemmed()); + var scoreB = dict.getTermFreqStemmed(qw.stemmed()); + + var scoreCombo = dict.getTermFreqStemmed(joinedStemmed); + + if (scoreCombo > scoreA + scoreB || scoreCombo > 1000) { + graph.addVariantForSpan(prev, qw, joinedWord); + } + } + + prev = qw; + } + } + + public void createSegments(QWordGraph graph) { + List nodes = new ArrayList<>(); + + for (var qw : graph) { + nodes.add(qw); + } + + String[] words = nodes.stream().map(QWord::word).toArray(String[]::new); + + for (int length = 2; length < Math.min(10, words.length); length++) { + for (var segment : lexicon.findSegments(length, words)) { + int start = segment.start(); + int end = segment.start() + segment.length(); + var word = StringUtils.join(words, "_", start, end); + + graph.addVariantForSpan(nodes.get(start), nodes.get(end), word); + } + } + } + +} diff --git 
a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariant.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariant.java new file mode 100644 index 00000000..8d24387b --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariant.java @@ -0,0 +1,17 @@ +package nu.marginalia.functions.searchquery.query_parser.variant; + +import lombok.AllArgsConstructor; +import lombok.EqualsAndHashCode; +import lombok.Getter; +import lombok.ToString; + +import java.util.List; + +@AllArgsConstructor +@Getter +@ToString +@EqualsAndHashCode +public class QueryVariant { + public final List terms; + public final double value; +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariantSet.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariantSet.java new file mode 100644 index 00000000..b01fbd5e --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariantSet.java @@ -0,0 +1,21 @@ +package nu.marginalia.functions.searchquery.query_parser.variant; + +import lombok.Getter; +import lombok.ToString; +import nu.marginalia.functions.searchquery.query_parser.token.Token; + +import java.util.ArrayList; +import java.util.List; + +@Getter +@ToString +public class QueryVariantSet { + public final List faithful = new ArrayList<>(); + public final List alternative = new ArrayList<>(); + + public final List nonLiterals = new ArrayList<>(); + + public boolean isEmpty() { + return faithful.isEmpty() && alternative.isEmpty() && nonLiterals.isEmpty(); + } +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryWord.java 
b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryWord.java new file mode 100644 index 00000000..9c158a43 --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryWord.java @@ -0,0 +1,10 @@ +package nu.marginalia.functions.searchquery.query_parser.variant; + +import lombok.AllArgsConstructor; + +@AllArgsConstructor +public class QueryWord { + public final String stemmed; + public final String word; + public final String wordOriginal; +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/VariantStrategy.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/VariantStrategy.java new file mode 100644 index 00000000..2c1a5bfb --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/VariantStrategy.java @@ -0,0 +1,8 @@ +package nu.marginalia.functions.searchquery.query_parser.variant; + +import java.util.Collection; +import java.util.List; + +public interface VariantStrategy { + Collection> constructVariants(List ls); +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWord.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWord.java new file mode 100644 index 00000000..07f65c95 --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWord.java @@ -0,0 +1,47 @@ +package nu.marginalia.functions.searchquery.query_parser.variant.model; + +import ca.rmen.porterstemmer.PorterStemmer; + +public record QWord( + int ord, + boolean variant, + String stemmed, + String word, + String original) +{ + + // These are special words that are not in the input, but are added to the graph, + // note the space around the ^ and $, to avoid collisions with real words + 
private static final String BEG_MARKER = " ^ "; + private static final String END_MARKER = " $ "; + + private static final PorterStemmer ps = new PorterStemmer(); + + public boolean isBeg() { + return word.equals(BEG_MARKER); + } + + public boolean isEnd() { + return word.equals(END_MARKER); + } + + public static QWord beg() { + return new QWord(Integer.MIN_VALUE, false, BEG_MARKER, BEG_MARKER, BEG_MARKER); + } + + public static QWord end() { + return new QWord(Integer.MAX_VALUE, false, END_MARKER, END_MARKER, END_MARKER); + } + + public boolean isOriginal() { + return !variant; + } + + public QWord(int ord, String word) { + this(ord, false, ps.stemWord(word), word, word); + } + + public QWord(int ord, QWord original, String word) { + this(ord, true, ps.stemWord(word), word, original.original); + } +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraph.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraph.java new file mode 100644 index 00000000..f9902733 --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraph.java @@ -0,0 +1,236 @@ +package nu.marginalia.functions.searchquery.query_parser.variant.model; + +import org.jetbrains.annotations.NotNull; + +import java.util.*; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** Graph structure for constructing query variants. The graph should be a directed acyclic graph, + * with a single start node and a single end node, denoted by QWord.beg() and QWord.end() respectively. + *

+ * Naively, every path from the start to the end node should represent a valid query variant, although in + * practice it is desirable to be clever about how to evaluate the paths, to avoid combinatorial explosion. + */ +public class QWordGraph implements Iterable { + + + public record QWordGraphLink(QWord from, QWord to) { + } + + private final List links = new ArrayList<>(); + private final Map> fromTo = new HashMap<>(); + private final Map> toFrom = new HashMap<>(); + + private int wordId = 0; + + public QWordGraph(String... words) { + this(List.of(words)); + } + + public QWordGraph(List words) { + QWord beg = QWord.beg(); + QWord end = QWord.end(); + + var prev = beg; + + for (String s : words) { + var word = new QWord(wordId++, s); + addLink(prev, word); + prev = word; + } + + addLink(prev, end); + } + + public void addVariant(QWord original, String word) { + var siblings = getVariants(original); + if (siblings.stream().anyMatch(w -> w.word().equals(word))) + return; + + var newWord = new QWord(wordId++, original, word); + + for (var prev : getPrev(original)) + addLink(prev, newWord); + for (var next : getNext(original)) + addLink(newWord, next); + } + + public void addVariantForSpan(QWord first, QWord last, String word) { + var newWord = new QWord(wordId++, first, word); + + for (var prev : getPrev(first)) + addLink(prev, newWord); + for (var next : getNext(last)) + addLink(newWord, next); + } + + public List getVariants(QWord original) { + var prevNext = getPrev(original).stream() + .flatMap(prev -> getNext(prev).stream()) + .collect(Collectors.toSet()); + + return getNext(original).stream() + .flatMap(next -> getPrev(next).stream()) + .filter(prevNext::contains) + .collect(Collectors.toList()); + } + + + public void addLink(QWord from, QWord to) { + links.add(new QWordGraphLink(from, to)); + fromTo.computeIfAbsent(from, k -> new ArrayList<>()).add(to); + toFrom.computeIfAbsent(to, k -> new ArrayList<>()).add(from); + } + + public List links() { + return 
Collections.unmodifiableList(links); + } + public List nodes() { + return links.stream() + .flatMap(l -> Stream.of(l.from(), l.to())) + .sorted(Comparator.comparing(QWord::ord)) + .distinct() + .collect(Collectors.toList()); + } + + + public List getNext(QWord word) { + return fromTo.getOrDefault(word, List.of()); + } + public List getNextOriginal(QWord word) { + return fromTo.getOrDefault(word, List.of()) + .stream() + .filter(QWord::isOriginal) + .toList(); + } + + public List getPrev(QWord word) { + return toFrom.getOrDefault(word, List.of()); + } + public List getPrevOriginal(QWord word) { + return toFrom.getOrDefault(word, List.of()) + .stream() + .filter(QWord::isOriginal) + .toList(); + } + + // Returns true if removing the word would disconnect the graph + // so that there is no path from 'begin' to 'end'. This is useful + // in breaking up the graph into smaller component subgraphs, and + // understanding which vertexes can be re-ordered without changing + // the semantics of the encoded query. + public boolean isBypassed(QWord word, QWord begin, QWord end) { + assert word.isOriginal() : "Can only bypass original words"; + + Set edge = new HashSet<>(); + Set visited = new HashSet<>(); + + edge.add(begin); + + while (!edge.isEmpty()) { + Set next = new HashSet<>(); + + for (var w : edge) { + // Skip the word we're trying find a bypassing route for + if (w.ord() == word.ord()) + continue; + + if (Objects.equals(w, end)) + return true; + + next.addAll(getNext(w)); + } + + next.removeAll(visited); + visited.addAll(next); + edge = next; + } + + return false; + } + + /** Returns a set of all nodes that are between 'begin' and 'end' in the graph, + * including the terminal nodes. This is useful for breaking up the graph into + * smaller components that can be evaluated in any order. + *

+ * It is assumed that there is a path from 'begin' to 'end' in the graph, and no + * other paths that bypass 'end'. + *

+ * The nodes are returned in the order they are encountered in a breadth-first search. + */ + public List nodesBetween(QWord begin, QWord end) { + List edge = new ArrayList<>(); + List visited = new ArrayList<>(); + + edge.add(begin); + + while (!edge.isEmpty()) { + List next = new ArrayList<>(); + + for (var w : edge) { + if (Objects.equals(w, end)) + continue; + + assert (!w.isEnd() && end.isEnd()) : "Graph has a path beyond the specified end vertex"; + + next.addAll(getNext(w)); + } + + next.removeAll(visited); + visited.addAll(next); + edge = next; + } + + return visited; + } + + /** Returns a list of subgraphs that are connected on the path from + * 'begin' to 'end'. This is useful for breaking up the graph into + * smaller components that can be evaluated in any order. + *

+ * The subgraphs are specified by their predecessor and successor nodes, + * + */ + public List getSubgraphs(QWord begin, QWord end) { + // Short-circuit for the common and simple case + if (getNext(begin).equals(List.of(end))) + return List.of(new QWordGraphLink(begin, end)); + + List subgraphs = new ArrayList<>(); + + List points = nodesBetween(begin, end) + .stream() + .filter(w -> isBypassed(w, begin, end)) + .toList(); + + for (int i = 0; i < points.size() - 1; i++) { + var a = points.get(i); + var b = points.get(i+1); + + subgraphs.add(new QWordGraphLink(a, b)); + } + + return subgraphs; + } + + + @NotNull + @Override + public Iterator iterator() { + return new Iterator<>() { + QWord pos = QWord.beg(); + + @Override + public boolean hasNext() { + return !pos.isEnd(); + } + + @Override + public QWord next() { + pos = getNextOriginal(pos).get(0); + return pos; + } + }; + } +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/CombineDashes.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/CombineDashes.java new file mode 100644 index 00000000..c24defbe --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/CombineDashes.java @@ -0,0 +1,40 @@ +package nu.marginalia.functions.searchquery.query_parser.variant.strategy; + +import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord; +import nu.marginalia.functions.searchquery.query_parser.variant.VariantStrategy; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.regex.Pattern; + +/** Variant strategy that combines word that have dashes, as sometimes lawn-chair + * gets spelled lawnchair */ +public class CombineDashes implements VariantStrategy { + final Pattern dashBoundary = Pattern.compile("-"); + + public CombineDashes() { + } + + 
@Override + public Collection> constructVariants(List words) { + List asTokens2 = new ArrayList<>(); + boolean dash = false; + + for (var span : words) { + var matcher = dashBoundary.matcher(span.word); + if (matcher.find()) { + String combined = dashBoundary.matcher(span.word).replaceAll(""); + asTokens2.add(combined); + } + + asTokens2.add(span.word); + } + + if (dash) { + return List.of(asTokens2); + } + return Collections.emptyList(); + } +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/JoinTerms.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/JoinTerms.java new file mode 100644 index 00000000..d03a64d1 --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/JoinTerms.java @@ -0,0 +1,58 @@ +package nu.marginalia.functions.searchquery.query_parser.variant.strategy; + +import ca.rmen.porterstemmer.PorterStemmer; +import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord; +import nu.marginalia.functions.searchquery.query_parser.variant.VariantStrategy; +import nu.marginalia.term_frequency_dict.TermFrequencyDict; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +/** Variant strategy that merges tokens that are adjacent, where the combined token + * has a high term frequency. 
That way we match 'lawnchair' with 'lawn chair' */ +public class JoinTerms implements VariantStrategy { + private final TermFrequencyDict dict; + private final PorterStemmer ps; + + public JoinTerms(TermFrequencyDict dict, PorterStemmer ps) { + this.dict = dict; + this.ps = ps; + } + + @Override + public Collection> constructVariants(List span) { + List> ret = new ArrayList<>(); + + for (int i = 0; i < span.size()-1; i++) { + var a = span.get(i); + var b = span.get(i+1); + + var stemmed = ps.stemWord(a.word + b.word); + + double scoreCombo = dict.getTermFreqStemmed(stemmed); + + if (scoreCombo > 10000) { + List asTokens = new ArrayList<>(); + + for (int j = 0; j < i; j++) { + var word = span.get(j).word; + asTokens.add(word); + } + { + var word = a.word + b.word; + asTokens.add(word); + } + for (int j = i+2; j < span.size(); j++) { + var word = span.get(j).word; + asTokens.add(word); + } + + ret.add(asTokens); + } + + } + + return ret; + } +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/SplitWordNum.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/SplitWordNum.java new file mode 100644 index 00000000..ac79476b --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/SplitWordNum.java @@ -0,0 +1,65 @@ +package nu.marginalia.functions.searchquery.query_parser.variant.strategy; + +import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord; +import nu.marginalia.functions.searchquery.query_parser.variant.VariantStrategy; +import nu.marginalia.util.ngrams.NGramBloomFilter; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.regex.Pattern; + +/** Variant strategy that splits tokens at the boundary between a number and a word. 
+ */ +public class SplitWordNum implements VariantStrategy { + + + final Pattern numWordBoundary = Pattern.compile("[0-9][a-zA-Z]|[a-zA-Z][0-9]"); + private final NGramBloomFilter nGramBloomFilter; + + public SplitWordNum(NGramBloomFilter nGramBloomFilter) { + this.nGramBloomFilter = nGramBloomFilter; + } + + @Override + public Collection> constructVariants(List ls) { + List asTokens2 = new ArrayList<>(); + + boolean num = false; + + for (var span : ls) { + var wordMatcher = numWordBoundary.matcher(span.word); + var stemmedMatcher = numWordBoundary.matcher(span.stemmed); + + int ws = 0; + int ss = 0; + boolean didSplit = false; + while (wordMatcher.find(ws) && stemmedMatcher.find(ss)) { + ws = wordMatcher.start()+1; + ss = stemmedMatcher.start()+1; + if (nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "_")) + || nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "-"))) + { + String combined = splitAtNumBoundary(span.word, wordMatcher.start(), "_"); + asTokens2.add(combined); + didSplit = true; + num = true; + } + } + + if (!didSplit) { + asTokens2.add(span.word); + } + } + + if (num) { + return List.of(asTokens2); + } + return Collections.emptyList(); + } + + private String splitAtNumBoundary(String in, int splitPoint, String joiner) { + return in.substring(0, splitPoint+1) + joiner + in.substring(splitPoint+1); + } +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java index ac7ce2b2..9ac7c795 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java @@ -8,7 +8,6 @@ import nu.marginalia.api.searchquery.model.query.SearchSubquery; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import 
nu.marginalia.util.language.EnglishDictionary; import nu.marginalia.language.WordPatterns; -import nu.marginalia.util.ngrams.NGramBloomFilter; import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.query.ProcessedQuery; import nu.marginalia.functions.searchquery.query_parser.QueryParser; @@ -37,9 +36,8 @@ public class QueryFactory { @Inject public QueryFactory(LanguageModels lm, TermFrequencyDict dict, - EnglishDictionary englishDictionary, - NGramBloomFilter nGramBloomFilter) { - this.queryVariants = ThreadLocal.withInitial(() -> new QueryVariants(lm ,dict, nGramBloomFilter, englishDictionary)); + EnglishDictionary englishDictionary) { + this.queryVariants = ThreadLocal.withInitial(() -> new QueryVariants(lm ,dict, englishDictionary)); } @@ -79,7 +77,7 @@ public class QueryFactory { String domain = null; - var basicQuery = queryParser.parse(query); + List basicQuery = queryParser.parse(query); if (basicQuery.size() >= 12) { problems.add("Your search query is too long"); @@ -108,10 +106,9 @@ public class QueryFactory { for (var parts : queryPermutations) { QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(parts); - SearchSubquery subquery = termsAccumulator.createSubquery(); - domain = termsAccumulator.domain; + SearchSubquery subquery = termsAccumulator.createSubquery(); subqueries.add(subquery); } diff --git a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraphTest.java b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraphTest.java new file mode 100644 index 00000000..a88e4d63 --- /dev/null +++ b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraphTest.java @@ -0,0 +1,33 @@ +package nu.marginalia.functions.searchquery.query_parser.variant.model; + +import org.junit.jupiter.api.Test; + +class QWordGraphTest { + + @Test + 
public void testAddConstructor() { + QWordGraph graph = new QWordGraph("hello", "world"); + + System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); + System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); + graph.links().forEach(System.out::println); + System.out.println("--"); + graph.nodes().forEach(System.out::println); + System.out.println("--"); + graph.addVariant(graph.nodes().get(1), "sup"); + System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); + System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); + System.out.println("--"); + graph.links().forEach(System.out::println); + System.out.println("--"); + graph.nodes().forEach(System.out::println); + + graph.addVariantForSpan(graph.nodes().get(1), graph.nodes().get(2), "heyall"); + System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); + System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); + System.out.println("--"); + graph.links().forEach(System.out::println); + System.out.println("--"); + graph.nodes().forEach(System.out::println); + } +} \ No newline at end of file diff --git a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java index fe93a1f6..4020d6e0 100644 --- a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java +++ b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java @@ -32,8 +32,7 @@ public class QueryFactoryTest { queryFactory = new QueryFactory(lm, tfd, - new EnglishDictionary(tfd), - new NGramBloomFilter(lm) + new EnglishDictionary(tfd) ); } From 212d10172795b1ce17a8fd5a09f607160cd20afb Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 18 Mar 2024 13:45:23 +0100 Subject: [PATCH 07/90] (control) GUI for exporting segmentation data from a 
wikipedia zim --- .../executor/client/ExecutorExportClient.java | 14 +++-- .../api/src/main/protobuf/executor-api.proto | 4 ++ code/execution/build.gradle | 1 + .../nu/marginalia/actor/ExecutorActor.java | 1 + .../actor/ExecutorActorControlService.java | 2 + .../task/ExportSegmentationModelActor.java | 55 +++++++++++++++++++ .../execution/ExecutorExportGrpcService.java | 23 ++++++-- .../node/svc/ControlNodeActionsService.java | 11 ++++ .../actions/partial-export-segmentation.hdb | 45 +++++++++++++++ .../templates/control/node/node-actions.hdb | 1 + .../control/node/partial-node-nav.hdb | 1 + 11 files changed, 150 insertions(+), 8 deletions(-) create mode 100644 code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java create mode 100644 code/services-core/control-service/resources/templates/control/node/actions/partial-export-segmentation.hdb diff --git a/code/execution/api/java/nu/marginalia/executor/client/ExecutorExportClient.java b/code/execution/api/java/nu/marginalia/executor/client/ExecutorExportClient.java index a3286a1b..e12fa0d3 100644 --- a/code/execution/api/java/nu/marginalia/executor/client/ExecutorExportClient.java +++ b/code/execution/api/java/nu/marginalia/executor/client/ExecutorExportClient.java @@ -2,10 +2,7 @@ package nu.marginalia.executor.client; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.functions.execution.api.Empty; -import nu.marginalia.functions.execution.api.ExecutorExportApiGrpc; -import nu.marginalia.functions.execution.api.RpcExportSampleData; -import nu.marginalia.functions.execution.api.RpcFileStorageId; +import nu.marginalia.functions.execution.api.*; import nu.marginalia.service.client.GrpcChannelPoolFactory; import nu.marginalia.service.client.GrpcMultiNodeChannelPool; import nu.marginalia.service.discovery.property.ServiceKey; @@ -55,6 +52,7 @@ public class ExecutorExportClient { .setFileStorageId(fid.id()) .build()); } + public void exportTermFrequencies(int 
node, FileStorageId fid) { channelPool.call(ExecutorExportApiBlockingStub::exportTermFrequencies) .forNode(node) @@ -69,6 +67,14 @@ public class ExecutorExportClient { .run(Empty.getDefaultInstance()); } + public void exportSegmentationModel(int node, String path) { + channelPool.call(ExecutorExportApiBlockingStub::exportSegmentationModel) + .forNode(node) + .run(RpcExportSegmentationModel + .newBuilder() + .setSourcePath(path) + .build()); + } } diff --git a/code/execution/api/src/main/protobuf/executor-api.proto b/code/execution/api/src/main/protobuf/executor-api.proto index 31cffe9b..565770ac 100644 --- a/code/execution/api/src/main/protobuf/executor-api.proto +++ b/code/execution/api/src/main/protobuf/executor-api.proto @@ -38,6 +38,7 @@ service ExecutorSideloadApi { service ExecutorExportApi { rpc exportAtags(RpcFileStorageId) returns (Empty) {} + rpc exportSegmentationModel(RpcExportSegmentationModel) returns (Empty) {} rpc exportSampleData(RpcExportSampleData) returns (Empty) {} rpc exportRssFeeds(RpcFileStorageId) returns (Empty) {} rpc exportTermFrequencies(RpcFileStorageId) returns (Empty) {} @@ -61,6 +62,9 @@ message RpcSideloadEncyclopedia { string sourcePath = 1; string baseUrl = 2; } +message RpcExportSegmentationModel { + string sourcePath = 1; +} message RpcSideloadDirtree { string sourcePath = 1; } diff --git a/code/execution/build.gradle b/code/execution/build.gradle index 74449214..040a428b 100644 --- a/code/execution/build.gradle +++ b/code/execution/build.gradle @@ -34,6 +34,7 @@ dependencies { implementation project(':code:libraries:message-queue') implementation project(':code:functions:link-graph:api') + implementation project(':code:functions:search-query') implementation project(':code:execution:api') implementation project(':code:process-models:crawl-spec') diff --git a/code/execution/java/nu/marginalia/actor/ExecutorActor.java b/code/execution/java/nu/marginalia/actor/ExecutorActor.java index ee7fb1d3..d04b3eaa 100644 --- 
a/code/execution/java/nu/marginalia/actor/ExecutorActor.java +++ b/code/execution/java/nu/marginalia/actor/ExecutorActor.java @@ -12,6 +12,7 @@ public enum ExecutorActor { ADJACENCY_CALCULATION, CRAWL_JOB_EXTRACTOR, EXPORT_DATA, + EXPORT_SEGMENTATION_MODEL, EXPORT_ATAGS, EXPORT_TERM_FREQUENCIES, EXPORT_FEEDS, diff --git a/code/execution/java/nu/marginalia/actor/ExecutorActorControlService.java b/code/execution/java/nu/marginalia/actor/ExecutorActorControlService.java index 53abdfe3..6f37d7ab 100644 --- a/code/execution/java/nu/marginalia/actor/ExecutorActorControlService.java +++ b/code/execution/java/nu/marginalia/actor/ExecutorActorControlService.java @@ -47,6 +47,7 @@ public class ExecutorActorControlService { ExportFeedsActor exportFeedsActor, ExportSampleDataActor exportSampleDataActor, ExportTermFreqActor exportTermFrequenciesActor, + ExportSegmentationModelActor exportSegmentationModelActor, DownloadSampleActor downloadSampleActor, ExecutorActorStateMachines stateMachines) { this.messageQueueFactory = messageQueueFactory; @@ -76,6 +77,7 @@ public class ExecutorActorControlService { register(ExecutorActor.EXPORT_FEEDS, exportFeedsActor); register(ExecutorActor.EXPORT_SAMPLE_DATA, exportSampleDataActor); register(ExecutorActor.EXPORT_TERM_FREQUENCIES, exportTermFrequenciesActor); + register(ExecutorActor.EXPORT_SEGMENTATION_MODEL, exportSegmentationModelActor); register(ExecutorActor.DOWNLOAD_SAMPLE, downloadSampleActor); } diff --git a/code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java b/code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java new file mode 100644 index 00000000..4cc4ca76 --- /dev/null +++ b/code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java @@ -0,0 +1,55 @@ +package nu.marginalia.actor.task; + +import com.google.gson.Gson; +import com.google.inject.Inject; +import com.google.inject.Singleton; +import nu.marginalia.actor.prototype.RecordActorPrototype; +import 
nu.marginalia.actor.state.ActorStep; +import nu.marginalia.functions.searchquery.segmentation.NgramExtractorMain; +import nu.marginalia.storage.FileStorageService; +import nu.marginalia.storage.model.FileStorageType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.nio.file.Path; +import java.time.LocalDateTime; + +@Singleton +public class ExportSegmentationModelActor extends RecordActorPrototype { + + private final FileStorageService storageService; + private final Logger logger = LoggerFactory.getLogger(getClass()); + + public record Export(String zimFile) implements ActorStep {} + @Override + public ActorStep transition(ActorStep self) throws Exception { + return switch(self) { + case Export(String zimFile) -> { + + var storage = storageService.allocateStorage(FileStorageType.EXPORT, "segmentation-model", "Segmentation Model Export " + LocalDateTime.now()); + + Path countsFile = storage.asPath().resolve("ngram-counts.bin"); + Path permutationsFile = storage.asPath().resolve("ngram-permutations.bin"); + + NgramExtractorMain.dumpCounts(Path.of(zimFile), countsFile, permutationsFile); + + yield new End(); + } + default -> new Error(); + }; + } + + @Override + public String describe() { + return "Generate a query segmentation model from a ZIM file."; + } + + @Inject + public ExportSegmentationModelActor(Gson gson, + FileStorageService storageService) + { + super(gson); + this.storageService = storageService; + } + +} diff --git a/code/execution/java/nu/marginalia/execution/ExecutorExportGrpcService.java b/code/execution/java/nu/marginalia/execution/ExecutorExportGrpcService.java index 41c8bb8b..68ad426a 100644 --- a/code/execution/java/nu/marginalia/execution/ExecutorExportGrpcService.java +++ b/code/execution/java/nu/marginalia/execution/ExecutorExportGrpcService.java @@ -6,12 +6,11 @@ import io.grpc.stub.StreamObserver; import nu.marginalia.actor.ExecutorActor; import nu.marginalia.actor.ExecutorActorControlService; import 
nu.marginalia.actor.task.*; -import nu.marginalia.functions.execution.api.Empty; -import nu.marginalia.functions.execution.api.ExecutorExportApiGrpc; -import nu.marginalia.functions.execution.api.RpcExportSampleData; -import nu.marginalia.functions.execution.api.RpcFileStorageId; +import nu.marginalia.functions.execution.api.*; import nu.marginalia.storage.model.FileStorageId; +import java.nio.file.Path; + @Singleton public class ExecutorExportGrpcService extends ExecutorExportApiGrpc.ExecutorExportApiImplBase { private final ExecutorActorControlService actorControlService; @@ -92,4 +91,20 @@ public class ExecutorExportGrpcService extends ExecutorExportApiGrpc.ExecutorExp responseObserver.onError(e); } } + + @Override + public void exportSegmentationModel(RpcExportSegmentationModel request, StreamObserver responseObserver) { + try { + actorControlService.startFrom(ExecutorActor.EXPORT_SEGMENTATION_MODEL, + new ExportSegmentationModelActor.Export(request.getSourcePath()) + ); + + responseObserver.onNext(Empty.getDefaultInstance()); + responseObserver.onCompleted(); + } + catch (Exception e) { + responseObserver.onError(e); + } + } + } diff --git a/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java b/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java index 2ae09234..b711be14 100644 --- a/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java +++ b/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeActionsService.java @@ -76,6 +76,9 @@ public class ControlNodeActionsService { Spark.post("/public/nodes/:node/actions/sideload-stackexchange", this::sideloadStackexchange, redirectControl.renderRedirectAcknowledgement("Sideloading", "..") ); + Spark.post("/public/nodes/:node/actions/export-segmentation", this::exportSegmentationModel, + redirectControl.renderRedirectAcknowledgement("Exporting", 
"..") + ); Spark.post("/public/nodes/:node/actions/download-sample-data", this::downloadSampleData, redirectControl.renderRedirectAcknowledgement("Downloading", "..") ); @@ -307,6 +310,14 @@ public class ControlNodeActionsService { return ""; } + private Object exportSegmentationModel(Request req, Response rsp) { + exportClient.exportSegmentationModel( + Integer.parseInt(req.params("node")), + req.queryParams("source")); + + return ""; + } + private Object exportFromCrawlData(Request req, Response rsp) { String exportType = req.queryParams("exportType"); FileStorageId source = parseSourceFileStorageId(req.queryParams("source")); diff --git a/code/services-core/control-service/resources/templates/control/node/actions/partial-export-segmentation.hdb b/code/services-core/control-service/resources/templates/control/node/actions/partial-export-segmentation.hdb new file mode 100644 index 00000000..2ef9b180 --- /dev/null +++ b/code/services-core/control-service/resources/templates/control/node/actions/partial-export-segmentation.hdb @@ -0,0 +1,45 @@ +

Export segmentation model

+ +
+

This will generate a query segmentation model from a wikipedia ZIM file. A query segmentation model +is used to break a search query into segments corresponding to different concepts. For example, the query +"slackware linux package manager" would be segmented into "slackware linux", and "package manager"; and the +search would be performed putting higher emphasis on "package" and "manager" appearing in the same part of the document +than "linux" and "manager". +

+
+
+
+ + + {{#each uploadDirContents.items}} + + + + + + + {{/each}} + {{#unless uploadDirContents.items}} + + + + {{/unless}} +
FilenameSizeLast Modified
+ + {{#unless directory}}{{size}}{{/unless}}{{shortTimestamp lastModifiedTime}}
Nothing found in upload directory
+ +

+ + The upload directory is typically mounted to /uploads on the server. The external + directory is typically something like index-{{node.id}}/uploads. + +

+ +
+
+ +
+
+
+
\ No newline at end of file diff --git a/code/services-core/control-service/resources/templates/control/node/node-actions.hdb b/code/services-core/control-service/resources/templates/control/node/node-actions.hdb index df8ed77f..7de90949 100644 --- a/code/services-core/control-service/resources/templates/control/node/node-actions.hdb +++ b/code/services-core/control-service/resources/templates/control/node/node-actions.hdb @@ -20,6 +20,7 @@ {{#if view.sideload-warc}} {{> control/node/actions/partial-sideload-warc }} {{/if}} {{#if view.sideload-dirtree}} {{> control/node/actions/partial-sideload-dirtree }} {{/if}} {{#if view.sideload-reddit}} {{> control/node/actions/partial-sideload-reddit }} {{/if}} + {{#if view.export-segmentation}} {{> control/node/actions/partial-export-segmentation }} {{/if}} {{#if view.export-db-data}} {{> control/node/actions/partial-export-db-data }} {{/if}} {{#if view.export-from-crawl-data}} {{> control/node/actions/partial-export-from-crawl-data }} {{/if}} {{#if view.export-sample-data}} {{> control/node/actions/partial-export-sample-data }} {{/if}} diff --git a/code/services-core/control-service/resources/templates/control/node/partial-node-nav.hdb b/code/services-core/control-service/resources/templates/control/node/partial-node-nav.hdb index 23627155..ff16507d 100644 --- a/code/services-core/control-service/resources/templates/control/node/partial-node-nav.hdb +++ b/code/services-core/control-service/resources/templates/control/node/partial-node-nav.hdb @@ -30,6 +30,7 @@
  • Export Database Data
  • Export Sample Crawl Data
  • Export From Crawl Data
  • +
  • Export Segmentation Model
  • Restore Index Backup
  • From 3c75057dcd589da46ea194d782bd9b6a6a463f70 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 19 Mar 2024 10:33:29 +0100 Subject: [PATCH 08/90] (qs) Retire NGramBloomFilter, integrate new segmentation model instead --- .../java/nu/marginalia/LanguageModels.java | 10 +-- .../config/java/nu/marginalia/WmsaHome.java | 5 +- .../execution/ExecutorExportGrpcService.java | 2 - .../marginalia/util/TestLanguageModels.java | 4 +- .../test/util/TestLanguageModels.java | 4 +- .../query_parser/variant/QueryExpansion.java | 6 +- .../variant/strategy/CombineDashes.java | 40 ----------- .../variant/strategy/JoinTerms.java | 58 ---------------- .../variant/strategy/SplitWordNum.java | 65 ----------------- .../segmentation/NgramLexicon.java | 15 ++++ .../marginalia/util/ngrams/DenseBitMap.java | 69 ------------------- .../util/ngrams/NGramBloomFilter.java | 64 ----------------- .../query/svc/QueryFactoryTest.java | 1 - .../language/filter/TestLanguageModels.java | 4 +- .../converting/util/TestLanguageModels.java | 4 +- .../marginalia/util/TestLanguageModels.java | 4 +- run/setup.sh | 2 +- 17 files changed, 39 insertions(+), 318 deletions(-) delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/CombineDashes.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/JoinTerms.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/SplitWordNum.java delete mode 100644 code/functions/search-query/java/nu/marginalia/util/ngrams/DenseBitMap.java delete mode 100644 code/functions/search-query/java/nu/marginalia/util/ngrams/NGramBloomFilter.java diff --git a/code/common/config/java/nu/marginalia/LanguageModels.java b/code/common/config/java/nu/marginalia/LanguageModels.java index 04ab0aa0..ca7fde45 100644 --- a/code/common/config/java/nu/marginalia/LanguageModels.java +++ 
b/code/common/config/java/nu/marginalia/LanguageModels.java @@ -3,7 +3,6 @@ package nu.marginalia; import java.nio.file.Path; public class LanguageModels { - public final Path ngramBloomFilter; public final Path termFrequencies; public final Path openNLPSentenceDetectionData; @@ -11,20 +10,21 @@ public class LanguageModels { public final Path posDict; public final Path openNLPTokenData; public final Path fasttextLanguageModel; + public final Path segments; - public LanguageModels(Path ngramBloomFilter, - Path termFrequencies, + public LanguageModels(Path termFrequencies, Path openNLPSentenceDetectionData, Path posRules, Path posDict, Path openNLPTokenData, - Path fasttextLanguageModel) { - this.ngramBloomFilter = ngramBloomFilter; + Path fasttextLanguageModel, + Path segments) { this.termFrequencies = termFrequencies; this.openNLPSentenceDetectionData = openNLPSentenceDetectionData; this.posRules = posRules; this.posDict = posDict; this.openNLPTokenData = openNLPTokenData; this.fasttextLanguageModel = fasttextLanguageModel; + this.segments = segments; } } diff --git a/code/common/config/java/nu/marginalia/WmsaHome.java b/code/common/config/java/nu/marginalia/WmsaHome.java index eff2e1c4..91fe49d4 100644 --- a/code/common/config/java/nu/marginalia/WmsaHome.java +++ b/code/common/config/java/nu/marginalia/WmsaHome.java @@ -96,13 +96,14 @@ public class WmsaHome { final Path home = getHomePath(); return new LanguageModels( - home.resolve("model/ngrams.bin"), home.resolve("model/tfreq-new-algo3.bin"), home.resolve("model/opennlp-sentence.bin"), home.resolve("model/English.RDR"), home.resolve("model/English.DICT"), home.resolve("model/opennlp-tok.bin"), - home.resolve("model/lid.176.ftz")); + home.resolve("model/lid.176.ftz"), + home.resolve("model/segments.bin") + ); } public static Path getAtagsPath() { diff --git a/code/execution/java/nu/marginalia/execution/ExecutorExportGrpcService.java b/code/execution/java/nu/marginalia/execution/ExecutorExportGrpcService.java 
index 68ad426a..3c5a8d5b 100644 --- a/code/execution/java/nu/marginalia/execution/ExecutorExportGrpcService.java +++ b/code/execution/java/nu/marginalia/execution/ExecutorExportGrpcService.java @@ -9,8 +9,6 @@ import nu.marginalia.actor.task.*; import nu.marginalia.functions.execution.api.*; import nu.marginalia.storage.model.FileStorageId; -import java.nio.file.Path; - @Singleton public class ExecutorExportGrpcService extends ExecutorExportApiGrpc.ExecutorExportApiImplBase { private final ExecutorActorControlService actorControlService; diff --git a/code/features-convert/anchor-keywords/test/nu/marginalia/util/TestLanguageModels.java b/code/features-convert/anchor-keywords/test/nu/marginalia/util/TestLanguageModels.java index 5efd2025..a4cc012b 100644 --- a/code/features-convert/anchor-keywords/test/nu/marginalia/util/TestLanguageModels.java +++ b/code/features-convert/anchor-keywords/test/nu/marginalia/util/TestLanguageModels.java @@ -26,13 +26,13 @@ public class TestLanguageModels { var languageModelsHome = getLanguageModelsPath(); return new LanguageModels( - languageModelsHome.resolve("ngrams.bin"), languageModelsHome.resolve("tfreq-new-algo3.bin"), languageModelsHome.resolve("opennlp-sentence.bin"), languageModelsHome.resolve("English.RDR"), languageModelsHome.resolve("English.DICT"), languageModelsHome.resolve("opennlp-tokens.bin"), - languageModelsHome.resolve("lid.176.ftz") + languageModelsHome.resolve("lid.176.ftz"), + languageModelsHome.resolve("segments.bin") ); } } diff --git a/code/features-convert/keyword-extraction/test/nu/marginalia/test/util/TestLanguageModels.java b/code/features-convert/keyword-extraction/test/nu/marginalia/test/util/TestLanguageModels.java index 0675559a..d857c048 100644 --- a/code/features-convert/keyword-extraction/test/nu/marginalia/test/util/TestLanguageModels.java +++ b/code/features-convert/keyword-extraction/test/nu/marginalia/test/util/TestLanguageModels.java @@ -26,13 +26,13 @@ public class TestLanguageModels { var 
languageModelsHome = getLanguageModelsPath(); return new LanguageModels( - languageModelsHome.resolve("ngrams.bin"), languageModelsHome.resolve("tfreq-new-algo3.bin"), languageModelsHome.resolve("opennlp-sentence.bin"), languageModelsHome.resolve("English.RDR"), languageModelsHome.resolve("English.DICT"), languageModelsHome.resolve("opennlp-tokens.bin"), - languageModelsHome.resolve("lid.176.ftz") + languageModelsHome.resolve("lid.176.ftz"), + languageModelsHome.resolve("segments.bin") ); } } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java index faac81d4..eac2988d 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java @@ -1,6 +1,7 @@ package nu.marginalia.functions.searchquery.query_parser.variant; import ca.rmen.porterstemmer.PorterStemmer; +import com.google.inject.Inject; import nu.marginalia.functions.searchquery.query_parser.variant.model.QWord; import nu.marginalia.functions.searchquery.query_parser.variant.model.QWordGraph; import nu.marginalia.functions.searchquery.segmentation.NgramLexicon; @@ -15,13 +16,15 @@ public class QueryExpansion { private static final PorterStemmer ps = new PorterStemmer(); private final TermFrequencyDict dict; private final NgramLexicon lexicon; - List expansionStrategies = List.of( + + private final List expansionStrategies = List.of( this::joinDashes, this::splitWordNum, this::joinTerms, this::createSegments ); + @Inject public QueryExpansion(TermFrequencyDict dict, NgramLexicon lexicon ) { @@ -97,6 +100,7 @@ public class QueryExpansion { String[] words = nodes.stream().map(QWord::word).toArray(String[]::new); + // Look for known segments within the query for (int 
length = 2; length < Math.min(10, words.length); length++) { for (var segment : lexicon.findSegments(length, words)) { int start = segment.start(); diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/CombineDashes.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/CombineDashes.java deleted file mode 100644 index c24defbe..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/CombineDashes.java +++ /dev/null @@ -1,40 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.variant.strategy; - -import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord; -import nu.marginalia.functions.searchquery.query_parser.variant.VariantStrategy; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.regex.Pattern; - -/** Variant strategy that combines word that have dashes, as sometimes lawn-chair - * gets spelled lawnchair */ -public class CombineDashes implements VariantStrategy { - final Pattern dashBoundary = Pattern.compile("-"); - - public CombineDashes() { - } - - @Override - public Collection> constructVariants(List words) { - List asTokens2 = new ArrayList<>(); - boolean dash = false; - - for (var span : words) { - var matcher = dashBoundary.matcher(span.word); - if (matcher.find()) { - String combined = dashBoundary.matcher(span.word).replaceAll(""); - asTokens2.add(combined); - } - - asTokens2.add(span.word); - } - - if (dash) { - return List.of(asTokens2); - } - return Collections.emptyList(); - } -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/JoinTerms.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/JoinTerms.java deleted file mode 100644 index d03a64d1..00000000 --- 
a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/JoinTerms.java +++ /dev/null @@ -1,58 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.variant.strategy; - -import ca.rmen.porterstemmer.PorterStemmer; -import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord; -import nu.marginalia.functions.searchquery.query_parser.variant.VariantStrategy; -import nu.marginalia.term_frequency_dict.TermFrequencyDict; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; - -/** Variant strategy that merges tokens that are adjacent, where the combined token - * has a high term frequency. That way we match 'lawnchair' with 'lawn chair' */ -public class JoinTerms implements VariantStrategy { - private final TermFrequencyDict dict; - private final PorterStemmer ps; - - public JoinTerms(TermFrequencyDict dict, PorterStemmer ps) { - this.dict = dict; - this.ps = ps; - } - - @Override - public Collection> constructVariants(List span) { - List> ret = new ArrayList<>(); - - for (int i = 0; i < span.size()-1; i++) { - var a = span.get(i); - var b = span.get(i+1); - - var stemmed = ps.stemWord(a.word + b.word); - - double scoreCombo = dict.getTermFreqStemmed(stemmed); - - if (scoreCombo > 10000) { - List asTokens = new ArrayList<>(); - - for (int j = 0; j < i; j++) { - var word = span.get(j).word; - asTokens.add(word); - } - { - var word = a.word + b.word; - asTokens.add(word); - } - for (int j = i+2; j < span.size(); j++) { - var word = span.get(j).word; - asTokens.add(word); - } - - ret.add(asTokens); - } - - } - - return ret; - } -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/SplitWordNum.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/SplitWordNum.java deleted file mode 100644 index ac79476b..00000000 --- 
a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/strategy/SplitWordNum.java +++ /dev/null @@ -1,65 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.variant.strategy; - -import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord; -import nu.marginalia.functions.searchquery.query_parser.variant.VariantStrategy; -import nu.marginalia.util.ngrams.NGramBloomFilter; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.List; -import java.util.regex.Pattern; - -/** Variant strategy that splits tokens at the boundary between a number and a word. - */ -public class SplitWordNum implements VariantStrategy { - - - final Pattern numWordBoundary = Pattern.compile("[0-9][a-zA-Z]|[a-zA-Z][0-9]"); - private final NGramBloomFilter nGramBloomFilter; - - public SplitWordNum(NGramBloomFilter nGramBloomFilter) { - this.nGramBloomFilter = nGramBloomFilter; - } - - @Override - public Collection> constructVariants(List ls) { - List asTokens2 = new ArrayList<>(); - - boolean num = false; - - for (var span : ls) { - var wordMatcher = numWordBoundary.matcher(span.word); - var stemmedMatcher = numWordBoundary.matcher(span.stemmed); - - int ws = 0; - int ss = 0; - boolean didSplit = false; - while (wordMatcher.find(ws) && stemmedMatcher.find(ss)) { - ws = wordMatcher.start()+1; - ss = stemmedMatcher.start()+1; - if (nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "_")) - || nGramBloomFilter.isKnownNGram(splitAtNumBoundary(span.word, stemmedMatcher.start(), "-"))) - { - String combined = splitAtNumBoundary(span.word, wordMatcher.start(), "_"); - asTokens2.add(combined); - didSplit = true; - num = true; - } - } - - if (!didSplit) { - asTokens2.add(span.word); - } - } - - if (num) { - return List.of(asTokens2); - } - return Collections.emptyList(); - } - - private String splitAtNumBoundary(String in, int splitPoint, String joiner) { 
- return in.substring(0, splitPoint+1) + joiner + in.substring(splitPoint+1); - } -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java index f8044e12..c4fe69e2 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java @@ -1,8 +1,10 @@ package nu.marginalia.functions.searchquery.segmentation; +import com.google.inject.Inject; import it.unimi.dsi.fastutil.longs.Long2IntOpenCustomHashMap; import it.unimi.dsi.fastutil.longs.LongHash; import it.unimi.dsi.fastutil.longs.LongOpenHashSet; +import nu.marginalia.LanguageModels; import java.io.DataInputStream; import java.io.DataOutputStream; @@ -24,6 +26,19 @@ public class NgramLexicon { private static final HasherGroup orderedHasher = HasherGroup.ordered(); private static final HasherGroup unorderedHasher = HasherGroup.unordered(); + @Inject + public NgramLexicon(LanguageModels models) { + try { + loadCounts(models.segments); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public NgramLexicon() { + + } + public List findSegments(int length, String... 
parts) { // Don't look for ngrams longer than the sentence if (parts.length < length) return List.of(); diff --git a/code/functions/search-query/java/nu/marginalia/util/ngrams/DenseBitMap.java b/code/functions/search-query/java/nu/marginalia/util/ngrams/DenseBitMap.java deleted file mode 100644 index 008b17b3..00000000 --- a/code/functions/search-query/java/nu/marginalia/util/ngrams/DenseBitMap.java +++ /dev/null @@ -1,69 +0,0 @@ -package nu.marginalia.util.ngrams; - -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.StandardOpenOption; -import java.util.BitSet; - -// It's unclear why this exists, we should probably use a BitSet instead? -// Chesterton's fence? -public class DenseBitMap { - public static final long MAX_CAPACITY_2GB_16BN_ITEMS=(1L<<34)-8; - - public final long cardinality; - private final ByteBuffer buffer; - - public DenseBitMap(long cardinality) { - this.cardinality = cardinality; - - boolean misaligned = (cardinality & 7) > 0; - this.buffer = ByteBuffer.allocateDirect((int)((cardinality / 8) + (misaligned ? 1 : 0))); - } - - public static DenseBitMap loadFromFile(Path file) throws IOException { - long size = Files.size(file); - var dbm = new DenseBitMap(size/8); - - try (var bc = Files.newByteChannel(file)) { - while (dbm.buffer.position() < dbm.buffer.capacity()) { - bc.read(dbm.buffer); - } - } - dbm.buffer.clear(); - - return dbm; - } - - public void writeToFile(Path file) throws IOException { - - try (var bc = Files.newByteChannel(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE)) { - while (buffer.position() < buffer.capacity()) { - bc.write(buffer); - } - } - - buffer.clear(); - } - - public boolean get(long pos) { - return (buffer.get((int)(pos >>> 3)) & ((byte)1 << (int)(pos & 7))) != 0; - } - - /** Set the bit indexed by pos, returns - * its previous value. 
- */ - public boolean set(long pos) { - int offset = (int) (pos >>> 3); - int oldVal = buffer.get(offset); - int mask = (byte) 1 << (int) (pos & 7); - buffer.put(offset, (byte) (oldVal | mask)); - return (oldVal & mask) != 0; - } - - public void clear(long pos) { - int offset = (int)(pos >>> 3); - buffer.put(offset, (byte)(buffer.get(offset) & ~(byte)(1 << (int)(pos & 7)))); - } -} diff --git a/code/functions/search-query/java/nu/marginalia/util/ngrams/NGramBloomFilter.java b/code/functions/search-query/java/nu/marginalia/util/ngrams/NGramBloomFilter.java deleted file mode 100644 index 3326956d..00000000 --- a/code/functions/search-query/java/nu/marginalia/util/ngrams/NGramBloomFilter.java +++ /dev/null @@ -1,64 +0,0 @@ -package nu.marginalia.util.ngrams; - -import ca.rmen.porterstemmer.PorterStemmer; -import com.google.common.hash.HashFunction; -import com.google.common.hash.Hashing; -import com.google.inject.Inject; -import nu.marginalia.LanguageModels; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.regex.Pattern; - -public class NGramBloomFilter { - private final DenseBitMap bitMap; - private static final PorterStemmer ps = new PorterStemmer(); - private static final HashFunction hasher = Hashing.murmur3_128(0); - - private static final Logger logger = LoggerFactory.getLogger(NGramBloomFilter.class); - - @Inject - public NGramBloomFilter(LanguageModels lm) throws IOException { - this(loadSafely(lm.ngramBloomFilter)); - } - - private static DenseBitMap loadSafely(Path path) throws IOException { - if (Files.isRegularFile(path)) { - return DenseBitMap.loadFromFile(path); - } - else { - logger.warn("NGrams file missing " + path); - return new DenseBitMap(1); - } - } - - public NGramBloomFilter(DenseBitMap bitMap) { - this.bitMap = bitMap; - } - - public boolean isKnownNGram(String word) { - long bit = 
bitForWord(word, bitMap.cardinality); - - return bitMap.get(bit); - } - - public static NGramBloomFilter load(Path file) throws IOException { - return new NGramBloomFilter(DenseBitMap.loadFromFile(file)); - } - - private static final Pattern underscore = Pattern.compile("_"); - - private static long bitForWord(String s, long n) { - String[] parts = underscore.split(s); - long hc = 0; - for (String part : parts) { - hc = hc * 31 + hasher.hashString(ps.stemWord(part), StandardCharsets.UTF_8).padToLong(); - } - return (hc & 0x7FFF_FFFF_FFFF_FFFFL) % n; - } - -} diff --git a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java index 4020d6e0..24131143 100644 --- a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java +++ b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java @@ -9,7 +9,6 @@ import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.SpecificationLimit; import nu.marginalia.index.query.limit.SpecificationLimitType; import nu.marginalia.util.language.EnglishDictionary; -import nu.marginalia.util.ngrams.NGramBloomFilter; import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.junit.jupiter.api.BeforeAll; diff --git a/code/libraries/language-processing/test/nu/marginalia/language/filter/TestLanguageModels.java b/code/libraries/language-processing/test/nu/marginalia/language/filter/TestLanguageModels.java index 2b7bf0e2..cb31942a 100644 --- a/code/libraries/language-processing/test/nu/marginalia/language/filter/TestLanguageModels.java +++ b/code/libraries/language-processing/test/nu/marginalia/language/filter/TestLanguageModels.java @@ -26,13 +26,13 @@ public class TestLanguageModels { var languageModelsHome = getLanguageModelsPath(); return new LanguageModels( - 
languageModelsHome.resolve("ngrams.bin"), languageModelsHome.resolve("tfreq-new-algo3.bin"), languageModelsHome.resolve("opennlp-sentence.bin"), languageModelsHome.resolve("English.RDR"), languageModelsHome.resolve("English.DICT"), languageModelsHome.resolve("opennlp-tokens.bin"), - languageModelsHome.resolve("lid.176.ftz") + languageModelsHome.resolve("lid.176.ftz"), + languageModelsHome.resolve("segments.bin") ); } } diff --git a/code/processes/converting-process/test/nu/marginalia/converting/util/TestLanguageModels.java b/code/processes/converting-process/test/nu/marginalia/converting/util/TestLanguageModels.java index 4ad1e430..f28e1348 100644 --- a/code/processes/converting-process/test/nu/marginalia/converting/util/TestLanguageModels.java +++ b/code/processes/converting-process/test/nu/marginalia/converting/util/TestLanguageModels.java @@ -26,13 +26,13 @@ public class TestLanguageModels { var languageModelsHome = getLanguageModelsPath(); return new LanguageModels( - languageModelsHome.resolve("ngrams.bin"), languageModelsHome.resolve("tfreq-new-algo3.bin"), languageModelsHome.resolve("opennlp-sentence.bin"), languageModelsHome.resolve("English.RDR"), languageModelsHome.resolve("English.DICT"), languageModelsHome.resolve("opennlp-tokens.bin"), - languageModelsHome.resolve("lid.176.ftz") + languageModelsHome.resolve("lid.176.ftz"), + languageModelsHome.resolve("segments.bin") ); } } diff --git a/code/services-application/search-service/test/nu/marginalia/util/TestLanguageModels.java b/code/services-application/search-service/test/nu/marginalia/util/TestLanguageModels.java index 5efd2025..a4cc012b 100644 --- a/code/services-application/search-service/test/nu/marginalia/util/TestLanguageModels.java +++ b/code/services-application/search-service/test/nu/marginalia/util/TestLanguageModels.java @@ -26,13 +26,13 @@ public class TestLanguageModels { var languageModelsHome = getLanguageModelsPath(); return new LanguageModels( - languageModelsHome.resolve("ngrams.bin"), 
languageModelsHome.resolve("tfreq-new-algo3.bin"), languageModelsHome.resolve("opennlp-sentence.bin"), languageModelsHome.resolve("English.RDR"), languageModelsHome.resolve("English.DICT"), languageModelsHome.resolve("opennlp-tokens.bin"), - languageModelsHome.resolve("lid.176.ftz") + languageModelsHome.resolve("lid.176.ftz"), + languageModelsHome.resolve("segments.bin") ); } } diff --git a/run/setup.sh b/run/setup.sh index 3d9c5f54..3cacca75 100755 --- a/run/setup.sh +++ b/run/setup.sh @@ -26,7 +26,7 @@ download_model model/English.DICT https://raw.githubusercontent.com/datquocnguye download_model model/English.RDR https://raw.githubusercontent.com/datquocnguyen/RDRPOSTagger/master/Models/POS/English.RDR download_model model/opennlp-sentence.bin https://mirrors.estointernet.in/apache/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-sentence-1.0-1.9.3.bin download_model model/opennlp-tokens.bin https://mirrors.estointernet.in/apache/opennlp/models/ud-models-1.0/opennlp-en-ud-ewt-tokens-1.0-1.9.3.bin -download_model model/ngrams.bin https://downloads.marginalia.nu/model/ngrams.bin +download_model model/segments.bin https://downloads.marginalia.nu/model/segments.bin download_model model/tfreq-new-algo3.bin https://downloads.marginalia.nu/model/tfreq-new-algo3.bin download_model model/lid.176.ftz https://downloads.marginalia.nu/model/lid.176.ftz From 6a7a7009c7ec74f70d40846b6674d822937a0e6d Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 19 Mar 2024 14:28:42 +0100 Subject: [PATCH 09/90] (convert) Initial integration of segmentation data into the converter's keyword extraction logic --- .../java/nu/marginalia/LanguageModels.java | 3 + code/execution/build.gradle | 1 + .../task/ExportSegmentationModelActor.java | 2 +- .../anchor-keywords/build.gradle | 1 + .../atags/DomainAnchorTagsImplTest.java | 1 + .../keyword/DocumentKeywordExtractor.java | 16 ++++- .../extractors/KeywordPositionBitmask.java | 12 +++- .../keyword/DocumentKeywordExtractorTest.java | 25 ++++++- 
.../keyword/SentenceExtractorTest.java | 10 +-- .../summary/SummaryExtractorTest.java | 5 +- .../query_parser/variant/QueryExpansion.java | 2 +- .../language/model/DocumentSentence.java | 17 ++++- .../language/sentence/SentenceExtractor.java | 68 +++++++++++++++++-- .../term-frequency-dict/build.gradle | 2 + .../segmentation/BasicSentenceExtractor.java | 2 +- .../marginalia}/segmentation/HasherGroup.java | 4 +- .../segmentation/NgramExporterMain.java | 14 ++-- .../segmentation/NgramExtractorMain.java | 2 +- .../segmentation/NgramLexicon.java | 43 +++++++----- .../segmentation/HasherGroupTest.java | 3 +- .../segmentation/NgramLexiconTest.java | 2 +- .../SentenceStatisticsExperiment.java | 5 +- 22 files changed, 192 insertions(+), 48 deletions(-) rename code/{functions/search-query/java/nu/marginalia/functions/searchquery => libraries/term-frequency-dict/java/nu/marginalia}/segmentation/BasicSentenceExtractor.java (88%) rename code/{functions/search-query/java/nu/marginalia/functions/searchquery => libraries/term-frequency-dict/java/nu/marginalia}/segmentation/HasherGroup.java (95%) rename code/{functions/search-query/java/nu/marginalia/functions/searchquery => libraries/term-frequency-dict/java/nu/marginalia}/segmentation/NgramExporterMain.java (72%) rename code/{functions/search-query/java/nu/marginalia/functions/searchquery => libraries/term-frequency-dict/java/nu/marginalia}/segmentation/NgramExtractorMain.java (98%) rename code/{functions/search-query/java/nu/marginalia/functions/searchquery => libraries/term-frequency-dict/java/nu/marginalia}/segmentation/NgramLexicon.java (85%) rename code/{functions/search-query/test/nu/marginalia/functions/searchquery => libraries/term-frequency-dict/test/nu/marginalia}/segmentation/HasherGroupTest.java (89%) rename code/{functions/search-query/test/nu/marginalia/functions/searchquery => libraries/term-frequency-dict/test/nu/marginalia}/segmentation/NgramLexiconTest.java (96%) diff --git 
a/code/common/config/java/nu/marginalia/LanguageModels.java b/code/common/config/java/nu/marginalia/LanguageModels.java index ca7fde45..d1854963 100644 --- a/code/common/config/java/nu/marginalia/LanguageModels.java +++ b/code/common/config/java/nu/marginalia/LanguageModels.java @@ -1,7 +1,10 @@ package nu.marginalia; +import lombok.Builder; + import java.nio.file.Path; +@Builder public class LanguageModels { public final Path termFrequencies; diff --git a/code/execution/build.gradle b/code/execution/build.gradle index 040a428b..3824a8c1 100644 --- a/code/execution/build.gradle +++ b/code/execution/build.gradle @@ -32,6 +32,7 @@ dependencies { implementation project(':third-party:commons-codec') implementation project(':code:libraries:message-queue') + implementation project(':code:libraries:term-frequency-dict') implementation project(':code:functions:link-graph:api') implementation project(':code:functions:search-query') diff --git a/code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java b/code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java index 4cc4ca76..90baf009 100644 --- a/code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java +++ b/code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java @@ -5,7 +5,7 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.actor.prototype.RecordActorPrototype; import nu.marginalia.actor.state.ActorStep; -import nu.marginalia.functions.searchquery.segmentation.NgramExtractorMain; +import nu.marginalia.segmentation.NgramExtractorMain; import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.model.FileStorageType; import org.slf4j.Logger; diff --git a/code/features-convert/anchor-keywords/build.gradle b/code/features-convert/anchor-keywords/build.gradle index 880ce467..ae92b066 100644 --- a/code/features-convert/anchor-keywords/build.gradle +++ 
b/code/features-convert/anchor-keywords/build.gradle @@ -19,6 +19,7 @@ dependencies { implementation project(':code:common:process') implementation project(':code:features-convert:keyword-extraction') implementation project(':code:libraries:language-processing') + implementation project(':code:libraries:term-frequency-dict') implementation libs.bundles.slf4j diff --git a/code/features-convert/anchor-keywords/test/nu/marginalia/atags/DomainAnchorTagsImplTest.java b/code/features-convert/anchor-keywords/test/nu/marginalia/atags/DomainAnchorTagsImplTest.java index ee555ca5..17443c51 100644 --- a/code/features-convert/anchor-keywords/test/nu/marginalia/atags/DomainAnchorTagsImplTest.java +++ b/code/features-convert/anchor-keywords/test/nu/marginalia/atags/DomainAnchorTagsImplTest.java @@ -5,6 +5,7 @@ import nu.marginalia.keyword.KeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.util.TestLanguageModels; import org.junit.jupiter.api.Test; diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java index 8feb5fd8..aaad9800 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java +++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java @@ -1,5 +1,6 @@ package nu.marginalia.keyword; +import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.keyword.extractors.*; import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.language.model.DocumentLanguageData; @@ -15,11 +16,13 @@ public class DocumentKeywordExtractor { private final KeywordExtractor keywordExtractor; private final TermFrequencyDict dict; + private final 
NgramLexicon ngramLexicon; @Inject - public DocumentKeywordExtractor(TermFrequencyDict dict) { + public DocumentKeywordExtractor(TermFrequencyDict dict, NgramLexicon ngramLexicon) { this.dict = dict; + this.ngramLexicon = ngramLexicon; this.keywordExtractor = new KeywordExtractor(); } @@ -131,6 +134,17 @@ public class DocumentKeywordExtractor { wordsBuilder.add(rep.word, meta); } + + for (int i = 0; i < sent.ngrams.length; i++) { + var ngram = sent.ngrams[i]; + var ngramStemmed = sent.ngramStemmed[i]; + + long meta = metadata.getMetadataForWord(ngramStemmed); + assert meta != 0L : "Missing meta for " + ngram; + + wordsBuilder.add(ngram, meta); + } + } } diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java index b402c9f6..230c895f 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java +++ b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java @@ -14,7 +14,9 @@ public class KeywordPositionBitmask { private static final int unmodulatedPortion = 16; @Inject - public KeywordPositionBitmask(KeywordExtractor keywordExtractor, DocumentLanguageData dld) { + public KeywordPositionBitmask(KeywordExtractor keywordExtractor, + DocumentLanguageData dld) + { // Mark the title words as position 0 for (var sent : dld.titleSentences) { @@ -24,6 +26,10 @@ public class KeywordPositionBitmask { positionMask.merge(word.stemmed(), posBit, this::bitwiseOr); } + for (var ngram : sent.ngramStemmed) { + positionMask.merge(ngram, posBit, this::bitwiseOr); + } + for (var span : keywordExtractor.getKeywordsFromSentence(sent)) { positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr); } @@ -43,6 +49,10 @@ public class KeywordPositionBitmask { positionMask.merge(word.stemmed(), 
posBit, this::bitwiseOr); } + for (var ngram : sent.ngramStemmed) { + positionMask.merge(ngram, posBit, this::bitwiseOr); + } + for (var span : keywordExtractor.getKeywordsFromSentence(sent)) { positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr); } diff --git a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java index 8a4f3b6b..54577f80 100644 --- a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java +++ b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java @@ -5,6 +5,7 @@ import nu.marginalia.converting.processor.logic.dom.DomPruningFilter; import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.idx.WordMetadata; +import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.jsoup.Jsoup; import org.junit.jupiter.api.Assertions; @@ -20,7 +21,9 @@ import java.util.Set; class DocumentKeywordExtractorTest { - DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels())); + DocumentKeywordExtractor extractor = new DocumentKeywordExtractor( + new TermFrequencyDict(WmsaHome.getLanguageModels()), + new NgramLexicon(WmsaHome.getLanguageModels())); SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); @Test @@ -56,6 +59,22 @@ class DocumentKeywordExtractorTest { } + @Test + public void testKeyboards2() throws IOException, URISyntaxException { + var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"), + "Could not load word frequency table"); + String html = new String(resource.readAllBytes(), Charset.defaultCharset()); + var doc = 
Jsoup.parse(html); + doc.filter(new DomPruningFilter(0.5)); + + var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/")); + + keywords.getWords().forEach((k, v) -> { + if (k.contains("_")) { + System.out.println(k + " " + new WordMetadata(v)); + } + }); + } @Test public void testKeyboards() throws IOException, URISyntaxException { var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"), @@ -119,7 +138,9 @@ class DocumentKeywordExtractorTest { var doc = Jsoup.parse(html); doc.filter(new DomPruningFilter(0.5)); - DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels())); + DocumentKeywordExtractor extractor = new DocumentKeywordExtractor( + new TermFrequencyDict(WmsaHome.getLanguageModels()), + new NgramLexicon(WmsaHome.getLanguageModels())); SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://math.byu.edu/wiki/index.php/All_You_Need_To_Know_About_Earning_Money_Online")); diff --git a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java index dabad6d1..bfc78a9c 100644 --- a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java +++ b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java @@ -3,6 +3,7 @@ package nu.marginalia.keyword; import lombok.SneakyThrows; import nu.marginalia.LanguageModels; import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.WmsaHome; import 
nu.marginalia.model.EdgeUrl; @@ -20,9 +21,10 @@ import static org.junit.jupiter.api.Assertions.assertEquals; @Tag("slow") class SentenceExtractorTest { - final LanguageModels lm = TestLanguageModels.getLanguageModels(); + static final LanguageModels lm = TestLanguageModels.getLanguageModels(); - SentenceExtractor se = new SentenceExtractor(lm); + static NgramLexicon ngramLexicon = new NgramLexicon(lm); + static SentenceExtractor se = new SentenceExtractor(lm); @SneakyThrows public static void main(String... args) throws IOException { @@ -32,11 +34,9 @@ class SentenceExtractorTest { System.out.println("Running"); - SentenceExtractor se = new SentenceExtractor(lm); - var dict = new TermFrequencyDict(lm); var url = new EdgeUrl("https://memex.marginalia.nu/"); - DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict); + DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict, ngramLexicon); for (;;) { long total = 0; diff --git a/code/features-convert/summary-extraction/test/nu/marginalia/summary/SummaryExtractorTest.java b/code/features-convert/summary-extraction/test/nu/marginalia/summary/SummaryExtractorTest.java index c1a326da..cabe558f 100644 --- a/code/features-convert/summary-extraction/test/nu/marginalia/summary/SummaryExtractorTest.java +++ b/code/features-convert/summary-extraction/test/nu/marginalia/summary/SummaryExtractorTest.java @@ -5,6 +5,7 @@ import nu.marginalia.WmsaHome; import nu.marginalia.keyword.DocumentKeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.summary.heuristic.*; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.jsoup.Jsoup; @@ -25,7 +26,9 @@ class SummaryExtractorTest { @BeforeEach public void setUp() { - keywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels())); + keywordExtractor = 
new DocumentKeywordExtractor( + new TermFrequencyDict(WmsaHome.getLanguageModels()), + new NgramLexicon(WmsaHome.getLanguageModels())); setenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels()); summaryExtractor = new SummaryExtractor(255, diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java index eac2988d..820a9022 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java @@ -4,7 +4,7 @@ import ca.rmen.porterstemmer.PorterStemmer; import com.google.inject.Inject; import nu.marginalia.functions.searchquery.query_parser.variant.model.QWord; import nu.marginalia.functions.searchquery.query_parser.variant.model.QWordGraph; -import nu.marginalia.functions.searchquery.segmentation.NgramLexicon; +import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.apache.commons.lang3.StringUtils; diff --git a/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentSentence.java b/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentSentence.java index ef5bc0a9..b9b4abce 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentSentence.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentSentence.java @@ -16,12 +16,24 @@ public class DocumentSentence implements Iterable{ public final String[] wordsLowerCase; public final String[] posTags; public final String[] stemmedWords; + public final String[] ngrams; + public final String[] ngramStemmed; private final BitSet isStopWord; + public SoftReference keywords; - public DocumentSentence(String 
originalSentence, String[] words, int[] separators, String[] wordsLowerCase, String[] posTags, String[] stemmedWords) { + public DocumentSentence(String originalSentence, + String[] words, + int[] separators, + String[] wordsLowerCase, + String[] posTags, + String[] stemmedWords, + String[] ngrams, + String[] ngramsStemmed + ) + { this.originalSentence = originalSentence; this.words = words; this.separators = separators; @@ -31,6 +43,9 @@ public class DocumentSentence implements Iterable{ isStopWord = new BitSet(words.length); + this.ngrams = ngrams; + this.ngramStemmed = ngramsStemmed; + for (int i = 0; i < words.length; i++) { if (WordPatterns.isStopWord(words[i])) isStopWord.set(i); diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java index 13ba2e76..fd15660f 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java @@ -4,6 +4,7 @@ import com.github.datquocnguyen.RDRPOSTagger; import gnu.trove.map.hash.TObjectIntHashMap; import lombok.SneakyThrows; import nu.marginalia.LanguageModels; +import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentSentence; import opennlp.tools.sentdetect.SentenceDetectorME; @@ -32,6 +33,8 @@ public class SentenceExtractor { private SentenceDetectorME sentenceDetector; private static RDRPOSTagger rdrposTagger; + private static NgramLexicon ngramLexicon = null; + private final PorterStemmer porterStemmer = new PorterStemmer(); private static final Logger logger = LoggerFactory.getLogger(SentenceExtractor.class); @@ -45,7 +48,8 @@ public class SentenceExtractor { private static final int MAX_TEXT_LENGTH = 65536; @SneakyThrows @Inject - public 
SentenceExtractor(LanguageModels models) { + public SentenceExtractor(LanguageModels models) + { try (InputStream modelIn = new FileInputStream(models.openNLPSentenceDetectionData.toFile())) { var sentenceModel = new SentenceModel(modelIn); sentenceDetector = new SentenceDetectorME(sentenceModel); @@ -55,7 +59,9 @@ public class SentenceExtractor { logger.error("Could not initialize sentence detector", ex); } - synchronized (RDRPOSTagger.class) { + synchronized (this) { + ngramLexicon = new NgramLexicon(models); + try { rdrposTagger = new RDRPOSTagger(models.posDict, models.posRules); } @@ -128,8 +134,34 @@ public class SentenceExtractor { var seps = wordsAndSeps.separators; var lc = SentenceExtractorStringUtils.toLowerCaseStripPossessive(wordsAndSeps.words); + List ngrams = ngramLexicon.findSegmentsStrings(2, 12, words); + + String[] ngramsWords = new String[ngrams.size()]; + String[] ngramsStemmedWords = new String[ngrams.size()]; + for (int i = 0; i < ngrams.size(); i++) { + String[] ngram = ngrams.get(i); + + StringJoiner ngramJoiner = new StringJoiner("_"); + StringJoiner stemmedJoiner = new StringJoiner("_"); + for (String s : ngram) { + ngramJoiner.add(s); + stemmedJoiner.add(porterStemmer.stem(s)); + } + + ngramsWords[i] = ngramJoiner.toString(); + ngramsStemmedWords[i] = stemmedJoiner.toString(); + } + + return new DocumentSentence( - SentenceExtractorStringUtils.sanitizeString(text), words, seps, lc, rdrposTagger.tagsForEnSentence(words), stemSentence(lc) + SentenceExtractorStringUtils.sanitizeString(text), + words, + seps, + lc, + rdrposTagger.tagsForEnSentence(words), + stemSentence(lc), + ngramsWords, + ngramsStemmedWords ); } @@ -195,7 +227,35 @@ public class SentenceExtractor { fullString = ""; } - ret[i] = new DocumentSentence(fullString, tokens[i], separators[i], tokensLc[i], posTags[i], stemmedWords[i]); + List ngrams = ngramLexicon.findSegmentsStrings(2, 12, tokens[i]); + + String[] ngramsWords = new String[ngrams.size()]; + String[] 
ngramsStemmedWords = new String[ngrams.size()]; + + for (int j = 0; j < ngrams.size(); j++) { + String[] ngram = ngrams.get(j); + + StringJoiner ngramJoiner = new StringJoiner("_"); + StringJoiner stemmedJoiner = new StringJoiner("_"); + for (String s : ngram) { + ngramJoiner.add(s); + stemmedJoiner.add(porterStemmer.stem(s)); + } + + ngramsWords[j] = ngramJoiner.toString(); + ngramsStemmedWords[j] = stemmedJoiner.toString(); + } + + + ret[i] = new DocumentSentence(fullString, + tokens[i], + separators[i], + tokensLc[i], + posTags[i], + stemmedWords[i], + ngramsWords, + ngramsStemmedWords + ); } return ret; } diff --git a/code/libraries/term-frequency-dict/build.gradle b/code/libraries/term-frequency-dict/build.gradle index 901fd2e0..67fb44ae 100644 --- a/code/libraries/term-frequency-dict/build.gradle +++ b/code/libraries/term-frequency-dict/build.gradle @@ -16,6 +16,8 @@ apply from: "$rootProject.projectDir/srcsets.gradle" dependencies { implementation project(':third-party:rdrpostagger') implementation project(':third-party:porterstemmer') + implementation project(':third-party:commons-codec') + implementation project(':third-party:openzim') implementation project(':third-party:monkey-patch-opennlp') implementation project(':code:common:model') implementation project(':code:common:config') diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/BasicSentenceExtractor.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/BasicSentenceExtractor.java similarity index 88% rename from code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/BasicSentenceExtractor.java rename to code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/BasicSentenceExtractor.java index e65c243d..cee48910 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/BasicSentenceExtractor.java +++ 
b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/BasicSentenceExtractor.java @@ -1,4 +1,4 @@ -package nu.marginalia.functions.searchquery.segmentation; +package nu.marginalia.segmentation; import ca.rmen.porterstemmer.PorterStemmer; import org.apache.commons.lang3.StringUtils; diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/HasherGroup.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/HasherGroup.java similarity index 95% rename from code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/HasherGroup.java rename to code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/HasherGroup.java index 60bbb4dd..2a452f75 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/HasherGroup.java +++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/HasherGroup.java @@ -1,11 +1,11 @@ -package nu.marginalia.functions.searchquery.segmentation; +package nu.marginalia.segmentation; import nu.marginalia.hash.MurmurHash3_128; /** A group of hash functions that can be used to hash a sequence of strings, * that also has an inverse operation that can be used to remove a previously applied * string from the sequence. 
*/ -sealed interface HasherGroup { +public sealed interface HasherGroup { /** Apply a hash to the accumulator */ long apply(long acc, long add); diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExporterMain.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExporterMain.java similarity index 72% rename from code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExporterMain.java rename to code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExporterMain.java index 087345f6..ee6d2cd5 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExporterMain.java +++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExporterMain.java @@ -1,7 +1,6 @@ -package nu.marginalia.functions.searchquery.segmentation; +package nu.marginalia.segmentation; -import nu.marginalia.WmsaHome; -import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.LanguageModels; import java.io.IOException; import java.nio.file.Path; @@ -15,10 +14,11 @@ public class NgramExporterMain { } static void trial() throws IOException { - SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); - - NgramLexicon lexicon = new NgramLexicon(); - lexicon.loadCounts(Path.of("/home/vlofgren/ngram-counts.bin")); + NgramLexicon lexicon = new NgramLexicon( + LanguageModels.builder() + .segments(Path.of("/home/vlofgren/ngram-counts.bin")) + .build() + ); System.out.println("Loaded!"); diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java similarity index 98% rename from code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java rename to 
code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java index 4cd4b296..577aee6e 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramExtractorMain.java +++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java @@ -1,4 +1,4 @@ -package nu.marginalia.functions.searchquery.segmentation; +package nu.marginalia.segmentation; import it.unimi.dsi.fastutil.longs.*; import nu.marginalia.hash.MurmurHash3_128; diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java similarity index 85% rename from code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java rename to code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java index c4fe69e2..91cee314 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/segmentation/NgramLexicon.java +++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java @@ -1,11 +1,13 @@ -package nu.marginalia.functions.searchquery.segmentation; +package nu.marginalia.segmentation; import com.google.inject.Inject; +import com.google.inject.Singleton; import it.unimi.dsi.fastutil.longs.Long2IntOpenCustomHashMap; import it.unimi.dsi.fastutil.longs.LongHash; import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import nu.marginalia.LanguageModels; +import java.io.BufferedInputStream; import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.IOException; @@ -16,11 +18,9 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; +@Singleton public class NgramLexicon { - private final Long2IntOpenCustomHashMap counts = new Long2IntOpenCustomHashMap( - 100_000_000, - new KeyIsAlreadyHashStrategy() - ); + private final 
Long2IntOpenCustomHashMap counts; private final LongOpenHashSet permutations = new LongOpenHashSet(); private static final HasherGroup orderedHasher = HasherGroup.ordered(); @@ -28,17 +28,35 @@ public class NgramLexicon { @Inject public NgramLexicon(LanguageModels models) { - try { - loadCounts(models.segments); + try (var dis = new DataInputStream(new BufferedInputStream(Files.newInputStream(models.segments)))) { + long size = dis.readInt(); + counts = new Long2IntOpenCustomHashMap( + (int) size, + new KeyIsAlreadyHashStrategy() + ); + + for (int i = 0; i < size; i++) { + counts.put(dis.readLong(), dis.readInt()); + } } catch (IOException e) { throw new RuntimeException(e); } } public NgramLexicon() { - + counts = new Long2IntOpenCustomHashMap(100_000_000, new KeyIsAlreadyHashStrategy()); } + public List findSegmentsStrings(int minLength, int maxLength, String... parts) { + List segments = new ArrayList<>(); + + for (int i = minLength; i <= maxLength; i++) { + segments.addAll(findSegments(i, parts)); + } + + return segments.stream().map(seg -> seg.project(parts)).toList(); + } + public List findSegments(int length, String... 
parts) { // Don't look for ngrams longer than the sentence if (parts.length < length) return List.of(); @@ -96,15 +114,6 @@ public class NgramLexicon { permutations.add(hashUnordered); } - public void loadCounts(Path path) throws IOException { - try (var dis = new DataInputStream(Files.newInputStream(path))) { - long size = dis.readInt(); - - for (int i = 0; i < size; i++) { - counts.put(dis.readLong(), dis.readInt()); - } - } - } public void loadPermutations(Path path) throws IOException { try (var dis = new DataInputStream(Files.newInputStream(path))) { diff --git a/code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/HasherGroupTest.java b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/HasherGroupTest.java similarity index 89% rename from code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/HasherGroupTest.java rename to code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/HasherGroupTest.java index 174bd553..110b1b9b 100644 --- a/code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/HasherGroupTest.java +++ b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/HasherGroupTest.java @@ -1,5 +1,6 @@ -package nu.marginalia.functions.searchquery.segmentation; +package nu.marginalia.segmentation; +import nu.marginalia.segmentation.HasherGroup; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.*; diff --git a/code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/NgramLexiconTest.java b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java similarity index 96% rename from code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/NgramLexiconTest.java rename to code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java index 28b9ef2f..d5065959 100644 --- 
a/code/functions/search-query/test/nu/marginalia/functions/searchquery/segmentation/NgramLexiconTest.java +++ b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java @@ -1,4 +1,4 @@ -package nu.marginalia.functions.searchquery.segmentation; +package nu.marginalia.segmentation; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java index 8614d1e6..dde7a106 100644 --- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java +++ b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java @@ -8,6 +8,7 @@ import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.keyword.DocumentKeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.tools.LegacyExperiment; import org.jsoup.Jsoup; @@ -21,8 +22,10 @@ import java.nio.file.Path; public class SentenceStatisticsExperiment extends LegacyExperiment { + NgramLexicon lexicon = new NgramLexicon(WmsaHome.getLanguageModels()); SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); - DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels())); + DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor( + new TermFrequencyDict(WmsaHome.getLanguageModels()), lexicon); Path filename; PrintWriter writer; From 2253b556b26bd8abad660e00dde4a4fae6d12739 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 21 Mar 2024 12:00:52 +0100 Subject: [PATCH 10/90] WIP --- 
.../query_parser/ExpansionStrategy.java | 7 + .../{variant => }/QueryExpansion.java | 8 +- .../query_parser/QueryPermutation.java | 229 ------------------ .../query_parser/QueryVariants.java | 207 ---------------- .../{variant => }/model/QWord.java | 2 +- .../{variant => }/model/QWordGraph.java | 46 +++- .../variant/ExpansionStrategy.java | 7 - .../query_parser/variant/QueryVariant.java | 17 -- .../query_parser/variant/QueryVariantSet.java | 21 -- .../query_parser/variant/QueryWord.java | 10 - .../query_parser/variant/VariantStrategy.java | 8 - .../searchquery/svc/QueryFactory.java | 50 +--- .../{variant => }/model/QWordGraphTest.java | 6 +- 13 files changed, 68 insertions(+), 550 deletions(-) create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/ExpansionStrategy.java rename code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/{variant => }/QueryExpansion.java (93%) delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryPermutation.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryVariants.java rename code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/{variant => }/model/QWord.java (94%) rename code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/{variant => }/model/QWordGraph.java (82%) delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/ExpansionStrategy.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariant.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariantSet.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryWord.java delete mode 100644 
code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/VariantStrategy.java rename code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/{variant => }/model/QWordGraphTest.java (83%) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/ExpansionStrategy.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/ExpansionStrategy.java new file mode 100644 index 00000000..20ebffd1 --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/ExpansionStrategy.java @@ -0,0 +1,7 @@ +package nu.marginalia.functions.searchquery.query_parser; + +import nu.marginalia.functions.searchquery.query_parser.model.QWordGraph; + +public interface ExpansionStrategy { + void expand(QWordGraph graph); +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java similarity index 93% rename from code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java rename to code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java index 820a9022..c216918e 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryExpansion.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java @@ -1,9 +1,9 @@ -package nu.marginalia.functions.searchquery.query_parser.variant; +package nu.marginalia.functions.searchquery.query_parser; import ca.rmen.porterstemmer.PorterStemmer; import com.google.inject.Inject; -import nu.marginalia.functions.searchquery.query_parser.variant.model.QWord; -import nu.marginalia.functions.searchquery.query_parser.variant.model.QWordGraph; +import 
nu.marginalia.functions.searchquery.query_parser.model.QWord; +import nu.marginalia.functions.searchquery.query_parser.model.QWordGraph; import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.apache.commons.lang3.StringUtils; @@ -40,7 +40,7 @@ public class QueryExpansion { strategy.expand(graph); } - return null; + return graph; } private static final Pattern dashPattern = Pattern.compile("-"); diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryPermutation.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryPermutation.java deleted file mode 100644 index 417ceda3..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryPermutation.java +++ /dev/null @@ -1,229 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser; - -import nu.marginalia.functions.searchquery.query_parser.token.Token; -import nu.marginalia.functions.searchquery.query_parser.token.TokenType; -import nu.marginalia.language.WordPatterns; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; -import java.util.function.Predicate; -import java.util.regex.Pattern; -import java.util.stream.Collectors; - -import static java.util.stream.Stream.concat; - -public class QueryPermutation { - private final Logger logger = LoggerFactory.getLogger(getClass()); - private final QueryVariants queryVariants; - - public static final Pattern wordPattern = Pattern.compile("[#]?[_@.a-zA-Z0-9'+\\-\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]+[#]?"); - public static final Pattern wordAppendixPattern = Pattern.compile("[.]?[0-9a-zA-Z\\u00C0-\\u00D6\\u00D8-\\u00f6\\u00f8-\\u00ff]{1,3}[0-9]?"); - - public static final Predicate wordQualitiesPredicate = wordPattern.asMatchPredicate(); - - public 
static final Predicate wordAppendixPredicate = wordAppendixPattern.asMatchPredicate(); - public static final Predicate wordPredicateEither = wordQualitiesPredicate.or(wordAppendixPredicate); - - public QueryPermutation(QueryVariants queryVariants) { - this.queryVariants = queryVariants; - } - - public List> permuteQueries(List items) { - int start = -1; - int end = items.size(); - - for (int i = 0; i < items.size(); i++) { - var token = items.get(i); - - if (start < 0) { - if (token.type == TokenType.LITERAL_TERM && wordQualitiesPredicate.test(token.str)) { - start = i; - } - } - else { - if (token.type != TokenType.LITERAL_TERM || !wordPredicateEither.test(token.str)) { - end = i; - break; - } - } - } - - if (start >= 0 && end - start > 1) { - List> permuteParts = combineSearchTerms(items.subList(start, end)); - int s = start; - int e = end; - return permuteParts.stream().map(part -> - concat(items.subList(0, s).stream(), concat(part.stream(), items.subList(e, items.size()).stream())) - .collect(Collectors.toList())) - .peek(lst -> lst.removeIf(this::isJunkWord)) - .limit(24) - .collect(Collectors.toList()); - } - else { - return List.of(items); - } - } - - - public List> permuteQueriesNew(List items) { - int start = -1; - int end = items.size(); - - for (int i = 0; i < items.size(); i++) { - var token = items.get(i); - - if (start < 0) { - if (token.type == TokenType.LITERAL_TERM && wordQualitiesPredicate.test(token.str)) { - start = i; - } - } - else { - if (token.type != TokenType.LITERAL_TERM || !wordPredicateEither.test(token.str)) { - end = i; - break; - } - } - } - - if (start >= 0 && end - start >= 1) { - var result = queryVariants.getQueryVariants(items.subList(start, end)); - - logger.debug("{}", result); - - if (result.isEmpty()) { - logger.warn("Empty variants result, falling back on old code"); - return permuteQueries(items); - } - - List> queryVariants = new ArrayList<>(); - for (var query : result.faithful) { - var tokens = 
query.terms.stream().map(term -> new Token(TokenType.LITERAL_TERM, term)).collect(Collectors.toList()); - tokens.addAll(result.nonLiterals); - - queryVariants.add(tokens); - } - for (var query : result.alternative) { - if (queryVariants.size() >= 6) - break; - - var tokens = query.terms.stream().map(term -> new Token(TokenType.LITERAL_TERM, term)).collect(Collectors.toList()); - tokens.addAll(result.nonLiterals); - - queryVariants.add(tokens); - } - - List> returnValue = new ArrayList<>(queryVariants.size()); - for (var variant: queryVariants) { - List r = new ArrayList<>(start + variant.size() + (items.size() - end)); - r.addAll(items.subList(0, start)); - r.addAll(variant); - r.addAll(items.subList(end, items.size())); - returnValue.add(r); - } - - return returnValue; - - } - else { - return List.of(items); - } - } - - private boolean isJunkWord(Token token) { - if (WordPatterns.isStopWord(token.str) && - !token.str.matches("^(\\d+|([a-z]+:.*))$")) { - return true; - } - return switch (token.str) { - case "vs", "versus", "or", "and" -> true; - default -> false; - }; - } - - private List> combineSearchTerms(List subList) { - int size = subList.size(); - if (size < 1) { - return Collections.emptyList(); - } - else if (size == 1) { - if (WordPatterns.isStopWord(subList.get(0).str)) { - return Collections.emptyList(); - } - return List.of(subList); - } - - List> results = new ArrayList<>(size*(size+1)/2); - - if (subList.size() <= 4 && subList.get(0).str.length() >= 2 && !isPrefixWord(subList.get(subList.size()-1).str)) { - results.add(List.of(joinTokens(subList))); - } - outer: for (int i = size - 1; i >= 1; i--) { - - var left = combineSearchTerms(subList.subList(0, i)); - var right = combineSearchTerms(subList.subList(i, size)); - - for (var l : left) { - if (results.size() > 48) { - break outer; - } - - for (var r : right) { - if (results.size() > 48) { - break outer; - } - - List combined = new ArrayList<>(l.size() + r.size()); - combined.addAll(l); - 
combined.addAll(r); - if (!results.contains(combined)) { - results.add(combined); - } - } - } - } - if (!results.contains(subList)) { - results.add(subList); - } - Comparator> tc = (o1, o2) -> { - int dJoininess = o2.stream().mapToInt(s->(int)Math.pow(joininess(s.str), 2)).sum() - - o1.stream().mapToInt(s->(int)Math.pow(joininess(s.str), 2)).sum(); - if (dJoininess == 0) { - return (o2.stream().mapToInt(s->(int)Math.pow(rightiness(s.str), 2)).sum() - - o1.stream().mapToInt(s->(int)Math.pow(rightiness(s.str), 2)).sum()); - } - return (int) Math.signum(dJoininess); - }; - results.sort(tc); - return results; - } - - private boolean isPrefixWord(String str) { - return switch (str) { - case "the", "of", "when" -> true; - default -> false; - }; - } - - int joininess(String s) { - return (int) s.chars().filter(c -> c == '_').count(); - } - int rightiness(String s) { - int rightiness = 0; - for (int i = 0; i < s.length(); i++) { - if (s.charAt(i) == '_') { - rightiness+=i; - } - } - return rightiness; - } - - private Token joinTokens(List subList) { - return new Token(TokenType.LITERAL_TERM, - subList.stream().map(t -> t.str).collect(Collectors.joining("_")), - subList.stream().map(t -> t.str).collect(Collectors.joining(" "))); - } -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryVariants.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryVariants.java deleted file mode 100644 index 10648486..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryVariants.java +++ /dev/null @@ -1,207 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser; - -import nu.marginalia.functions.searchquery.query_parser.token.Token; -import nu.marginalia.functions.searchquery.query_parser.token.TokenType; -import nu.marginalia.functions.searchquery.query_parser.variant.QueryVariant; -import 
nu.marginalia.functions.searchquery.query_parser.variant.QueryVariantSet; -import nu.marginalia.functions.searchquery.query_parser.variant.QueryWord; -import nu.marginalia.util.language.EnglishDictionary; -import nu.marginalia.LanguageModels; -import nu.marginalia.keyword.KeywordExtractor; -import nu.marginalia.language.sentence.SentenceExtractor; -import nu.marginalia.term_frequency_dict.TermFrequencyDict; -import nu.marginalia.language.model.DocumentSentence; -import nu.marginalia.language.model.WordSpan; - -import java.util.*; -import java.util.regex.Pattern; - -public class QueryVariants { - private final KeywordExtractor keywordExtractor; - private final TermFrequencyDict dict; - - private final EnglishDictionary englishDictionary; - private final ThreadLocal sentenceExtractor; - - public QueryVariants(LanguageModels lm, - TermFrequencyDict dict, - EnglishDictionary englishDictionary) { - this.englishDictionary = englishDictionary; - this.keywordExtractor = new KeywordExtractor(); - this.sentenceExtractor = ThreadLocal.withInitial(() -> new SentenceExtractor(lm)); - this.dict = dict; - } - - - - public QueryVariantSet getQueryVariants(List query) { - final JoinedQueryAndNonLiteralTokens joinedQuery = joinQuery(query); - - final TreeMap> byStart = new TreeMap<>(); - - var se = sentenceExtractor.get(); - var sentence = se.extractSentence(joinedQuery.joinedQuery); - - for (int i = 0; i < sentence.posTags.length; i++) { - if (sentence.posTags[i].startsWith("N") || sentence.posTags[i].startsWith("V")) { - sentence.posTags[i] = "NNP"; - } - else if ("JJ".equals(sentence.posTags[i]) || "CD".equals(sentence.posTags[i]) || sentence.posTags[i].startsWith("P")) { - sentence.posTags[i] = "NNP"; - sentence.setIsStopWord(i, false); - } - } - - for (var kw : keywordExtractor.getKeywordsFromSentence(sentence)) { - byStart.computeIfAbsent(kw.start, k -> new ArrayList<>()).add(kw); - } - - final List> livingSpans = new ArrayList<>(); - - var first = byStart.firstEntry(); - if 
(first == null) { - var span = new WordSpan(0, sentence.length()); - byStart.put(0, List.of(span)); - } - else if (first.getKey() > 0) { - List elongatedFirstWords = new ArrayList<>(first.getValue().size()); - - first.getValue().forEach(span -> { - elongatedFirstWords.add(new WordSpan(0, span.start)); - elongatedFirstWords.add(new WordSpan(0, span.end)); - }); - - byStart.put(0, elongatedFirstWords); - } - - final List> goodSpans = getWordSpans(byStart, sentence, livingSpans); - - List> faithfulQueries = new ArrayList<>(); - List> alternativeQueries = new ArrayList<>(); - - for (var ls : goodSpans) { - var last = ls.get(ls.size() - 1); - - if (!last.wordOriginal.isBlank() && !Character.isUpperCase(last.wordOriginal.charAt(0))) { - var altLast = englishDictionary.getWordVariants(last.word); - for (String s : altLast) { - List newList = new ArrayList<>(ls.size()); - for (int i = 0; i < ls.size() - 1; i++) { - newList.add(ls.get(i).word); - } - newList.add(s); - alternativeQueries.add(newList); - } - } - - } - - QueryVariantSet returnValue = new QueryVariantSet(); - - returnValue.faithful.addAll(evaluateQueries(faithfulQueries)); - returnValue.alternative.addAll(evaluateQueries(alternativeQueries)); - - returnValue.faithful.sort(Comparator.comparing(QueryVariant::getValue)); - returnValue.alternative.sort(Comparator.comparing(QueryVariant::getValue)); - - returnValue.nonLiterals.addAll(joinedQuery.nonLiterals); - - return returnValue; - } - - final Pattern underscore = Pattern.compile("_"); - - private List evaluateQueries(List> queryStrings) { - Set variantsSet = new HashSet<>(); - List ret = new ArrayList<>(); - for (var lst : queryStrings) { - double q = 0; - for (var word : lst) { - String[] parts = underscore.split(word); - double qp = 0; - for (String part : parts) { - qp += 1./(1+ dict.getTermFreq(part)); - } - q += 1.0 / qp; - } - var qv = new QueryVariant(lst, q); - if (variantsSet.add(qv)) { - ret.add(qv); - } - } - return ret; - } - - private List> 
getWordSpans(TreeMap> byStart, DocumentSentence sentence, List> livingSpans) { - List> goodSpans = new ArrayList<>(); - for (int i = 0; i < 1; i++) { - var spans = byStart.get(i); - - - if (spans == null ) - continue; - - for (var span : spans) { - ArrayList fragment = new ArrayList<>(); - fragment.add(span); - livingSpans.add(fragment); - } - - if (sentence.posTags[i].startsWith("N") || sentence.posTags[i].startsWith("V")) break; - } - - - while (!livingSpans.isEmpty()) { - - final List> newLivingSpans = new ArrayList<>(livingSpans.size()); - - for (var span : livingSpans) { - int end = span.get(span.size()-1).end; - - if (end == sentence.length()) { - var gs = new ArrayList(span.size()); - for (var s : span) { - gs.add(new QueryWord(sentence.constructStemmedWordFromSpan(s), sentence.constructWordFromSpan(s), - s.size() == 1 ? sentence.words[s.start] : "")); - } - goodSpans.add(gs); - } - var nextWordsKey = byStart.ceilingKey(end); - - if (null == nextWordsKey) - continue; - - for (var next : byStart.get(nextWordsKey)) { - var newSpan = new ArrayList(span.size() + 1); - newSpan.addAll(span); - newSpan.add(next); - newLivingSpans.add(newSpan); - } - } - - livingSpans.clear(); - livingSpans.addAll(newLivingSpans); - } - - return goodSpans; - } - - - private JoinedQueryAndNonLiteralTokens joinQuery(List query) { - StringJoiner s = new StringJoiner(" "); - List leftovers = new ArrayList<>(5); - - for (var t : query) { - if (t.type == TokenType.LITERAL_TERM) { - s.add(t.displayStr); - } - else { - leftovers.add(t); - } - } - - return new JoinedQueryAndNonLiteralTokens(s.toString(), leftovers); - } - - record JoinedQueryAndNonLiteralTokens(String joinedQuery, List nonLiterals) {} -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWord.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWord.java similarity index 94% rename from 
code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWord.java rename to code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWord.java index 07f65c95..b7c4e594 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWord.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWord.java @@ -1,4 +1,4 @@ -package nu.marginalia.functions.searchquery.query_parser.variant.model; +package nu.marginalia.functions.searchquery.query_parser.model; import ca.rmen.porterstemmer.PorterStemmer; diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraph.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java similarity index 82% rename from code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraph.java rename to code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java index f9902733..474c4788 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraph.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java @@ -1,4 +1,4 @@ -package nu.marginalia.functions.searchquery.query_parser.variant.model; +package nu.marginalia.functions.searchquery.query_parser.model; import org.jetbrains.annotations.NotNull; @@ -15,8 +15,7 @@ import java.util.stream.Stream; public class QWordGraph implements Iterable { - public record QWordGraphLink(QWord from, QWord to) { - } + public record QWordGraphLink(QWord from, QWord to) {} private final List links = new ArrayList<>(); private final Map> fromTo = new HashMap<>(); @@ -121,8 +120,6 @@ public class QWordGraph implements Iterable { // understanding 
which vertexes can be re-ordered without changing // the semantics of the encoded query. public boolean isBypassed(QWord word, QWord begin, QWord end) { - assert word.isOriginal() : "Can only bypass original words"; - Set edge = new HashSet<>(); Set visited = new HashSet<>(); @@ -163,6 +160,7 @@ public class QWordGraph implements Iterable { List edge = new ArrayList<>(); List visited = new ArrayList<>(); + visited.add(begin); edge.add(begin); while (!edge.isEmpty()) { @@ -172,7 +170,9 @@ public class QWordGraph implements Iterable { if (Objects.equals(w, end)) continue; - assert (!w.isEnd() && end.isEnd()) : "Graph has a path beyond the specified end vertex"; + if (w.isEnd()) { + assert end.isEnd() : "Graph has a path beyond the specified end vertex " + end; + } next.addAll(getNext(w)); } @@ -182,7 +182,7 @@ public class QWordGraph implements Iterable { edge = next; } - return visited; + return visited.stream().distinct().toList(); } /** Returns a list of subgraphs that are connected on the path from @@ -201,7 +201,7 @@ public class QWordGraph implements Iterable { List points = nodesBetween(begin, end) .stream() - .filter(w -> isBypassed(w, begin, end)) + .filter(w -> !isBypassed(w, begin, end)) .toList(); for (int i = 0; i < points.size() - 1; i++) { @@ -214,6 +214,36 @@ public class QWordGraph implements Iterable { return subgraphs; } + public String compileToQuery() { + return compileToQuery(QWord.beg(), QWord.end()); + } + + public String compileToQuery(QWord begin, QWord end) { + StringJoiner sj = new StringJoiner(" "); + + for (var subgraph : getSubgraphs(begin, end)) { + if (getNext(subgraph.from).equals(List.of(subgraph.to))) { + if (subgraph.from.isBeg()) + continue; + + sj.add(subgraph.from.word()); + } + else { + StringJoiner branchJoiner = new StringJoiner(" | ", "( ", " )"); + if (Objects.equals(subgraph.from, begin)) { + for (QWord path : getNext(subgraph.from)) { + branchJoiner.add(compileToQuery(path, subgraph.to)); + } + } + else { + 
branchJoiner.add(compileToQuery(subgraph.from, subgraph.to)); + } + sj.add(branchJoiner.toString()); + } + } + + return sj.toString(); + } @NotNull @Override diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/ExpansionStrategy.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/ExpansionStrategy.java deleted file mode 100644 index 18987aea..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/ExpansionStrategy.java +++ /dev/null @@ -1,7 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.variant; - -import nu.marginalia.functions.searchquery.query_parser.variant.model.QWordGraph; - -public interface ExpansionStrategy { - void expand(QWordGraph graph); -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariant.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariant.java deleted file mode 100644 index 8d24387b..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariant.java +++ /dev/null @@ -1,17 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.variant; - -import lombok.AllArgsConstructor; -import lombok.EqualsAndHashCode; -import lombok.Getter; -import lombok.ToString; - -import java.util.List; - -@AllArgsConstructor -@Getter -@ToString -@EqualsAndHashCode -public class QueryVariant { - public final List terms; - public final double value; -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariantSet.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariantSet.java deleted file mode 100644 index b01fbd5e..00000000 --- 
a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryVariantSet.java +++ /dev/null @@ -1,21 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.variant; - -import lombok.Getter; -import lombok.ToString; -import nu.marginalia.functions.searchquery.query_parser.token.Token; - -import java.util.ArrayList; -import java.util.List; - -@Getter -@ToString -public class QueryVariantSet { - public final List faithful = new ArrayList<>(); - public final List alternative = new ArrayList<>(); - - public final List nonLiterals = new ArrayList<>(); - - public boolean isEmpty() { - return faithful.isEmpty() && alternative.isEmpty() && nonLiterals.isEmpty(); - } -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryWord.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryWord.java deleted file mode 100644 index 9c158a43..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/QueryWord.java +++ /dev/null @@ -1,10 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.variant; - -import lombok.AllArgsConstructor; - -@AllArgsConstructor -public class QueryWord { - public final String stemmed; - public final String word; - public final String wordOriginal; -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/VariantStrategy.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/VariantStrategy.java deleted file mode 100644 index 2c1a5bfb..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/variant/VariantStrategy.java +++ /dev/null @@ -1,8 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.variant; - -import java.util.Collection; -import java.util.List; - -public interface VariantStrategy { - Collection> 
constructVariants(List ls); -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java index 9ac7c795..3c0e5219 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java @@ -11,8 +11,6 @@ import nu.marginalia.language.WordPatterns; import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.query.ProcessedQuery; import nu.marginalia.functions.searchquery.query_parser.QueryParser; -import nu.marginalia.functions.searchquery.query_parser.QueryPermutation; -import nu.marginalia.functions.searchquery.query_parser.QueryVariants; import nu.marginalia.functions.searchquery.query_parser.token.Token; import nu.marginalia.functions.searchquery.query_parser.token.TokenType; import nu.marginalia.term_frequency_dict.TermFrequencyDict; @@ -29,43 +27,19 @@ public class QueryFactory { private final Logger logger = LoggerFactory.getLogger(getClass()); private static final int RETAIN_QUERY_VARIANT_COUNT = 5; - private final ThreadLocal queryVariants; private final QueryParser queryParser = new QueryParser(); @Inject public QueryFactory(LanguageModels lm, TermFrequencyDict dict, - EnglishDictionary englishDictionary) { - this.queryVariants = ThreadLocal.withInitial(() -> new QueryVariants(lm ,dict, englishDictionary)); + EnglishDictionary englishDictionary) + { } - public QueryPermutation getQueryPermutation() { - return new QueryPermutation(queryVariants.get()); - } public ProcessedQuery createQuery(QueryParams params) { - final var processedQuery = createQuery(getQueryPermutation(), params); - final List subqueries = processedQuery.specs.subqueries; - - // There used to be a piece of logic here that would try to figure out which one of these subqueries were 
the "best", - // it's gone for the moment, but it would be neat if it resurrected somehow - - trimArray(subqueries, RETAIN_QUERY_VARIANT_COUNT); - - return processedQuery; - } - - private void trimArray(List arr, int maxSize) { - if (arr.size() > maxSize) { - arr.subList(0, arr.size() - maxSize).clear(); - } - } - - public ProcessedQuery createQuery(QueryPermutation queryPermutation, - QueryParams params) - { final var query = params.humanQuery(); if (query.length() > 1000) { @@ -100,17 +74,19 @@ public class QueryFactory { t.visit(qualityLimits); } - var queryPermutations = queryPermutation.permuteQueriesNew(basicQuery); +// var queryPermutations = queryPermutation.permuteQueriesNew(basicQuery); List subqueries = new ArrayList<>(); + QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(basicQuery); + domain = termsAccumulator.domain; - for (var parts : queryPermutations) { - QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(parts); - - domain = termsAccumulator.domain; - - SearchSubquery subquery = termsAccumulator.createSubquery(); - subqueries.add(subquery); - } +// for (var parts : queryPermutations) { +// QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(basicQuery); +// +// domain = termsAccumulator.domain; +// +// SearchSubquery subquery = termsAccumulator.createSubquery(); +// subqueries.add(subquery); +// } List domainIds = params.domainIds(); diff --git a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraphTest.java b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java similarity index 83% rename from code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraphTest.java rename to code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java index a88e4d63..bd16b3cb 100644 
--- a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/variant/model/QWordGraphTest.java +++ b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java @@ -1,4 +1,4 @@ -package nu.marginalia.functions.searchquery.query_parser.variant.model; +package nu.marginalia.functions.searchquery.query_parser.model; import org.junit.jupiter.api.Test; @@ -10,11 +10,13 @@ class QWordGraphTest { System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); + System.out.println(graph.compileToQuery()); graph.links().forEach(System.out::println); System.out.println("--"); graph.nodes().forEach(System.out::println); System.out.println("--"); graph.addVariant(graph.nodes().get(1), "sup"); + System.out.println(graph.compileToQuery()); System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); System.out.println("--"); @@ -23,6 +25,8 @@ class QWordGraphTest { graph.nodes().forEach(System.out::println); graph.addVariantForSpan(graph.nodes().get(1), graph.nodes().get(2), "heyall"); + graph.addVariant(graph.nodes().get(2), "globe"); + System.out.println(graph.compileToQuery()); System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); System.out.println("--"); From 0ebadd03a51557d6554a83808996be84cf720e28 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 28 Mar 2024 13:01:21 +0100 Subject: [PATCH 11/90] (WIP) Query rendering finally beginning to look like it works --- .../query_parser/model/QWordGraph.java | 365 ++++++++++++++---- .../query_parser/model/QWordGraphTest.java | 111 ++++++ 2 files changed, 408 insertions(+), 68 deletions(-) diff --git 
a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java index 474c4788..1d8fcd70 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java @@ -49,9 +49,9 @@ public class QWordGraph implements Iterable { var newWord = new QWord(wordId++, original, word); - for (var prev : getPrev(original)) + for (var prev : getPrevOriginal(original)) addLink(prev, newWord); - for (var next : getNext(original)) + for (var next : getNextOriginal(original)) addLink(newWord, next); } @@ -93,6 +93,12 @@ public class QWordGraph implements Iterable { .collect(Collectors.toList()); } + public QWord node(String word) { + return nodes().stream() + .filter(n -> n.word().equals(word)) + .findFirst() + .orElseThrow(); + } public List getNext(QWord word) { return fromTo.getOrDefault(word, List.of()); @@ -147,34 +153,25 @@ public class QWordGraph implements Iterable { return false; } - /** Returns a set of all nodes that are between 'begin' and 'end' in the graph, - * including the terminal nodes. This is useful for breaking up the graph into - * smaller components that can be evaluated in any order. - *

    - * It is assumed that there is a path from 'begin' to 'end' in the graph, and no - * other paths that bypass 'end'. - *

    - * The nodes are returned in the order they are encountered in a breadth-first search. - */ - public List nodesBetween(QWord begin, QWord end) { - List edge = new ArrayList<>(); - List visited = new ArrayList<>(); + public Map> forwardReachability() { + Map> ret = new HashMap<>(); - visited.add(begin); - edge.add(begin); + Set edge = Set.of(QWord.beg()); + Set visited = new HashSet<>(); while (!edge.isEmpty()) { - List next = new ArrayList<>(); + Set next = new LinkedHashSet<>(); for (var w : edge) { - if (Objects.equals(w, end)) - continue; - if (w.isEnd()) { - assert end.isEnd() : "Graph has a path beyond the specified end vertex " + end; + for (var n : getNext(w)) { + var set = ret.computeIfAbsent(n, k -> new HashSet<>()); + + set.add(w); + set.addAll(ret.getOrDefault(w, Set.of())); + + next.add(n); } - - next.addAll(getNext(w)); } next.removeAll(visited); @@ -182,67 +179,299 @@ public class QWordGraph implements Iterable { edge = next; } - return visited.stream().distinct().toList(); + return ret; } - /** Returns a list of subgraphs that are connected on the path from - * 'begin' to 'end'. This is useful for breaking up the graph into - * smaller components that can be evaluated in any order. - *

    - * The subgraphs are specified by their predecessor and successor nodes, - * - */ - public List getSubgraphs(QWord begin, QWord end) { - // Short-circuit for the common and simple case - if (getNext(begin).equals(List.of(end))) - return List.of(new QWordGraphLink(begin, end)); + public Map> reverseReachability() { + Map> ret = new HashMap<>(); - List subgraphs = new ArrayList<>(); + Set edge = Set.of(QWord.end()); + Set visited = new HashSet<>(); - List points = nodesBetween(begin, end) - .stream() - .filter(w -> !isBypassed(w, begin, end)) - .toList(); + while (!edge.isEmpty()) { + Set prev = new LinkedHashSet<>(); - for (int i = 0; i < points.size() - 1; i++) { - var a = points.get(i); - var b = points.get(i+1); + for (var w : edge) { - subgraphs.add(new QWordGraphLink(a, b)); + for (var p : getPrev(w)) { + var set = ret.computeIfAbsent(p, k -> new HashSet<>()); + + set.add(w); + set.addAll(ret.getOrDefault(w, Set.of())); + + prev.add(p); + } + } + + prev.removeAll(visited); + visited.addAll(prev); + edge = prev; } - return subgraphs; + return ret; + } + + public record ReachabilityData(List sortedNodes, + Map sortOrder, + + Map> forward, + Map> reverse) + { + public Set forward(QWord node) { + return forward.getOrDefault(node, Set.of()); + } + public Set reverse(QWord node) { + return reverse.getOrDefault(node, Set.of()); + } + + public Comparator topologicalComparator() { + return Comparator.comparing(sortOrder::get); + } + + } + + /** Gather data about graph reachability, including the topological order of nodes */ + public ReachabilityData reachability() { + var forwardReachability = forwardReachability(); + var reverseReachability = reverseReachability(); + + List nodes = new ArrayList<>(nodes()); + nodes.sort(new SetMembershipComparator<>(forwardReachability)); + + Map topologicalOrder = new HashMap<>(); + for (int i = 0; i < nodes.size(); i++) { + topologicalOrder.put(nodes.get(i), i); + } + + return new ReachabilityData(nodes, topologicalOrder, 
forwardReachability, reverseReachability); + } + + static class SetMembershipComparator implements Comparator { + private final Map> membership; + + SetMembershipComparator(Map> membership) { + this.membership = membership; + } + + @Override + public int compare(T o1, T o2) { + return Boolean.compare(isIn(o1, o2), isIn(o2, o1)); + } + + private boolean isIn(T a, T b) { + return membership.getOrDefault(a, Set.of()).contains(b); + } } public String compileToQuery() { - return compileToQuery(QWord.beg(), QWord.end()); + var wp = new WordPaths(QWord.beg(), QWord.end()); + return wp.render(reachability()); } - public String compileToQuery(QWord begin, QWord end) { - StringJoiner sj = new StringJoiner(" "); - for (var subgraph : getSubgraphs(begin, end)) { - if (getNext(subgraph.from).equals(List.of(subgraph.to))) { - if (subgraph.from.isBeg()) - continue; + class WordPaths { + private final Set paths; - sj.add(subgraph.from.word()); - } - else { - StringJoiner branchJoiner = new StringJoiner(" | ", "( ", " )"); - if (Objects.equals(subgraph.from, begin)) { - for (QWord path : getNext(subgraph.from)) { - branchJoiner.add(compileToQuery(path, subgraph.to)); - } - } - else { - branchJoiner.add(compileToQuery(subgraph.from, subgraph.to)); - } - sj.add(branchJoiner.toString()); - } + public final QWord begin; + public final QWord end; + + public WordPaths(Collection paths) { + this.paths = Collections.unmodifiableSet(new HashSet<>(paths)); + + begin = null; + end = null; } - return sj.toString(); + public WordPaths(QWord begin, QWord end) { + this.begin = begin; + this.end = end; + + this.paths = Collections.unmodifiableSet(listPaths()); + } + + public String render(ReachabilityData reachability) { + if (paths.size() == 1) { + return paths.iterator().next().stream().map(QWord::word).collect(Collectors.joining(" ")); + } + + Map commonality = paths.stream().flatMap(WordPath::stream) + .collect(Collectors.groupingBy(w -> w, Collectors.summingInt(w -> 1))); + + Set commonToAll 
= new HashSet<>(); + Set notCommonToAll = new HashSet<>(); + + commonality.forEach((k, v) -> { + if (v == paths.size()) { + commonToAll.add(k); + } + else { + notCommonToAll.add(k); + } + }); + + StringJoiner concat = new StringJoiner(" "); + if (!commonToAll.isEmpty()) { // Case where one or more words are common to all paths + + commonToAll.stream() + .sorted(reachability.topologicalComparator()) + .map(QWord::word) + .forEach(concat::add); + + // Deal portion of the paths that do not all share a common word + if (!notCommonToAll.isEmpty()) { + + List nonOverlappingPortions = new ArrayList<>(); + + for (var path : paths) { + // Project the path onto the divergent nodes (i.e. remove common nodes) + var np = path.project(notCommonToAll); + if (np.isEmpty()) + continue; + nonOverlappingPortions.add(np); + } + + if (nonOverlappingPortions.size() > 1) { + var wp = new WordPaths(nonOverlappingPortions); + concat.add(wp.render(reachability)); + } + else if (!nonOverlappingPortions.isEmpty()) { + var wp = new WordPaths(nonOverlappingPortions); + concat.add(wp.render(reachability)); + } + } + } + else if (commonality.size() > 1) { // The case where no words are common to all paths + + // Sort the words by commonality, so that we can consider the most common words first + List byCommonality = commonality.entrySet().stream().sorted(Map.Entry.comparingByValue()).map(Map.Entry::getKey).collect(Collectors.toList()).reversed(); + + Map> pathsByCommonWord = new HashMap<>(); + + // Mutable copy of the paths + List allDivergentPaths = new ArrayList<>(paths); + + for (var qw : byCommonality) { + if (allDivergentPaths.isEmpty()) + break; + + var iter = allDivergentPaths.iterator(); + while (iter.hasNext()) { + var path = iter.next(); + + if (!path.contains(qw)) { + continue; + } + + pathsByCommonWord + .computeIfAbsent(qw, k -> new ArrayList<>()) + .add(path.without(qw)); // Remove the common word from the path + + iter.remove(); + } + } + + var branches = 
pathsByCommonWord.entrySet().stream().map(e -> { + String commonWord = e.getKey().word(); + String branchPart = new WordPaths(e.getValue()).render(reachability); + return STR."\{commonWord} \{branchPart}"; + }) + .collect(Collectors.joining(" | ", " ( ", " ) ")); + + concat.add(branches); + + } + + // Remove any double spaces that may have been introduced + return concat.toString().replaceAll("\\s+", " "); + } + + + public Set listPaths() { + assert begin != null; + assert end != null; + + Set paths = new HashSet<>(); + listPaths(paths, new LinkedList<>(), begin, end); + return paths; + } + + private void listPaths(Set acc, + LinkedList stack, + QWord start, + QWord end) + { + stack.addLast(start); + + if (Objects.equals(start, end)) { + var nodes = new HashSet<>(stack); + + nodes.remove(this.begin); + nodes.remove(this.end); + + acc.add(new WordPath(nodes)); + } + else { + for (var next : getNext(start)) { + listPaths(acc, stack, next, end); + } + } + + stack.removeLast(); + } + } + + public static class WordPath { + private final Set nodes; + + WordPath(Collection nodes) { + this.nodes = new HashSet<>(nodes); + } + + public boolean contains(QWord node) { + return nodes.contains(node); + } + + public WordPath without(QWord word) { + Set newNodes = new HashSet<>(nodes); + newNodes.remove(word); + return new WordPath(newNodes); + } + + public Stream stream() { + return nodes.stream(); + } + + public WordPath project(Set nodes) { + return new WordPath(this.nodes.stream().filter(nodes::contains).collect(Collectors.toSet())); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + WordPath wordPath = (WordPath) o; + + return nodes.equals(wordPath.nodes); + } + + public boolean isEmpty() { + return nodes.isEmpty(); + } + + public int size() { + return nodes.size(); + } + + @Override + public int hashCode() { + return nodes.hashCode(); + } + + @Override + public String toString() 
{ + return STR."WordPath{nodes=\{nodes}\{'}'}"; + } } @NotNull @@ -258,7 +487,7 @@ public class QWordGraph implements Iterable { @Override public QWord next() { - pos = getNextOriginal(pos).get(0); + pos = getNextOriginal(pos).getFirst(); return pos; } }; diff --git a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java index bd16b3cb..276d8697 100644 --- a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java +++ b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java @@ -2,6 +2,11 @@ package nu.marginalia.functions.searchquery.query_parser.model; import org.junit.jupiter.api.Test; +import java.util.Comparator; +import java.util.Set; + +import static org.junit.jupiter.api.Assertions.assertEquals; + class QWordGraphTest { @Test @@ -11,12 +16,14 @@ class QWordGraphTest { System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); System.out.println(graph.compileToQuery()); + graph.forwardReachability().entrySet().stream().sorted(Comparator.comparing(e -> e.getKey().ord())).forEach(System.out::println); graph.links().forEach(System.out::println); System.out.println("--"); graph.nodes().forEach(System.out::println); System.out.println("--"); graph.addVariant(graph.nodes().get(1), "sup"); System.out.println(graph.compileToQuery()); + graph.forwardReachability().entrySet().stream().sorted(Comparator.comparing(e -> e.getKey().ord())).forEach(System.out::println); System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); System.out.println("--"); @@ -33,5 +40,109 @@ class 
QWordGraphTest { graph.links().forEach(System.out::println); System.out.println("--"); graph.nodes().forEach(System.out::println); + graph.forwardReachability().entrySet().stream().sorted(Comparator.comparing(e -> e.getKey().ord())).forEach(System.out::println); + } + + @Test + void forwardReachability() { + // Construct a graph like + + // ^ - a - b - c - $ + // \- d -/ + QWordGraph graph = new QWordGraph("a", "b", "c"); + graph.addVariant(graph.node("b"), "d"); + + var reachability = graph.forwardReachability(); + + System.out.println(reachability.get(graph.node("a"))); + System.out.println(reachability.get(graph.node("b"))); + System.out.println(reachability.get(graph.node("c"))); + System.out.println(reachability.get(graph.node("d"))); + + assertEquals(Set.of(graph.node(" ^ ")), reachability.get(graph.node("a"))); + assertEquals(Set.of(graph.node(" ^ "), graph.node("a")), reachability.get(graph.node("b"))); + assertEquals(Set.of(graph.node(" ^ "), graph.node("a")), reachability.get(graph.node("d"))); + assertEquals(Set.of(graph.node(" ^ "), graph.node("a"), graph.node("b"), graph.node("d")), reachability.get(graph.node("c"))); + assertEquals(Set.of(graph.node(" ^ "), graph.node("a"), graph.node("b"), graph.node("d"), graph.node("c")), reachability.get(graph.node(" $ "))); + } + + + @Test + void reverseReachability() { + // Construct a graph like + + // ^ - a - b - c - $ + // \- d -/ + QWordGraph graph = new QWordGraph("a", "b", "c"); + graph.addVariant(graph.node("b"), "d"); + + var reachability = graph.reverseReachability(); + + System.out.println(reachability.get(graph.node("a"))); + System.out.println(reachability.get(graph.node("b"))); + System.out.println(reachability.get(graph.node("c"))); + System.out.println(reachability.get(graph.node("d"))); + + assertEquals(Set.of(graph.node(" $ ")), reachability.get(graph.node("c"))); + assertEquals(Set.of(graph.node(" $ "), graph.node("c")), reachability.get(graph.node("b"))); + assertEquals(Set.of(graph.node(" $ 
"), graph.node("c")), reachability.get(graph.node("d"))); + assertEquals(Set.of(graph.node(" $ "), graph.node("c"), graph.node("b"), graph.node("d")), reachability.get(graph.node("a"))); + assertEquals(Set.of(graph.node(" $ "), graph.node("c"), graph.node("b"), graph.node("d"), graph.node("a")), reachability.get(graph.node(" ^ "))); + } + + @Test + void testCompile1() { + // Construct a graph like + + // ^ - a - b - c - $ + // \- d -/ + QWordGraph graph = new QWordGraph("a", "b", "c"); + graph.addVariant(graph.node("b"), "d"); + + assertEquals(" ^ a(b|d)c $ ", graph.compileToQuery()); + } + @Test + void testCompile2() { + // Construct a graph like + + // ^ - a - b - c - $ + QWordGraph graph = new QWordGraph("a", "b", "c"); + + assertEquals(" ^ abc $ ", graph.compileToQuery()); + } + + @Test + void testCompile3() { + // Construct a graph like + + // ^ - a - b - c - $ + // \- d -/ + QWordGraph graph = new QWordGraph("a", "b", "c"); + graph.addVariant(graph.node("a"), "d"); + assertEquals(" ^ (a|d)bc $ ", graph.compileToQuery()); + } + + @Test + void testCompile4() { + // Construct a graph like + + // ^ - a - b - c - $ + // \- d -/ + QWordGraph graph = new QWordGraph("a", "b", "c"); + graph.addVariant(graph.node("c"), "d"); + assertEquals(" ^ ab(c|d) $ ", graph.compileToQuery()); + } + + @Test + void testCompile5() { + // Construct a graph like + + // /- e -\ + // ^ - a - b - c - $ + // \- d -/ + QWordGraph graph = new QWordGraph("a", "b", "c"); + graph.addVariant(graph.node("c"), "d"); + graph.addVariant(graph.node("b"), "e"); + assertEquals(" ^ a(b|e)(c|d) $ ", graph.compileToQuery()); } } \ No newline at end of file From 6f567fbea89a568b620554530df5de81b89d7142 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 28 Mar 2024 13:11:26 +0100 Subject: [PATCH 12/90] (qs, WIP) Fix output determinism, fix tests --- .../searchquery/query_parser/model/QWordGraph.java | 6 ++++-- .../query_parser/model/QWordGraphTest.java | 11 ++++++----- 2 files changed, 10 
insertions(+), 7 deletions(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java index 1d8fcd70..10aae867 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java @@ -370,7 +370,9 @@ public class QWordGraph implements Iterable { } } - var branches = pathsByCommonWord.entrySet().stream().map(e -> { + var branches = pathsByCommonWord.entrySet().stream() + .sorted(Map.Entry.comparingByKey(reachability.topologicalComparator())) + .map(e -> { String commonWord = e.getKey().word(); String branchPart = new WordPaths(e.getValue()).render(reachability); return STR."\{commonWord} \{branchPart}"; @@ -382,7 +384,7 @@ public class QWordGraph implements Iterable { } // Remove any double spaces that may have been introduced - return concat.toString().replaceAll("\\s+", " "); + return concat.toString().replaceAll("\\s+", " ").trim(); } diff --git a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java index 276d8697..f3201b9d 100644 --- a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java +++ b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java @@ -99,8 +99,9 @@ class QWordGraphTest { QWordGraph graph = new QWordGraph("a", "b", "c"); graph.addVariant(graph.node("b"), "d"); - assertEquals(" ^ a(b|d)c $ ", graph.compileToQuery()); + assertEquals("a c ( b | d )", graph.compileToQuery()); } + @Test void testCompile2() { // Construct a graph like @@ -108,7 +109,7 @@ class 
QWordGraphTest { // ^ - a - b - c - $ QWordGraph graph = new QWordGraph("a", "b", "c"); - assertEquals(" ^ abc $ ", graph.compileToQuery()); + assertEquals("a b c", graph.compileToQuery()); } @Test @@ -119,7 +120,7 @@ class QWordGraphTest { // \- d -/ QWordGraph graph = new QWordGraph("a", "b", "c"); graph.addVariant(graph.node("a"), "d"); - assertEquals(" ^ (a|d)bc $ ", graph.compileToQuery()); + assertEquals("b c ( a | d )", graph.compileToQuery()); } @Test @@ -130,7 +131,7 @@ class QWordGraphTest { // \- d -/ QWordGraph graph = new QWordGraph("a", "b", "c"); graph.addVariant(graph.node("c"), "d"); - assertEquals(" ^ ab(c|d) $ ", graph.compileToQuery()); + assertEquals("a b ( c | d )", graph.compileToQuery()); } @Test @@ -143,6 +144,6 @@ class QWordGraphTest { QWordGraph graph = new QWordGraph("a", "b", "c"); graph.addVariant(graph.node("c"), "d"); graph.addVariant(graph.node("b"), "e"); - assertEquals(" ^ a(b|e)(c|d) $ ", graph.compileToQuery()); + assertEquals("a ( b ( c | d ) | c e )", graph.compileToQuery()); } } \ No newline at end of file From cd1a18c045dc5b23a2e8d32e30872f9cecd295f2 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 28 Mar 2024 13:26:54 +0100 Subject: [PATCH 13/90] (qs, WIP) Break up code and tidy it up a bit --- .../query_parser/model/QWordGraph.java | 210 +----------------- .../model/QWordGraphPathLister.java | 57 +++++ .../query_parser/model/QWordPath.java | 66 ++++++ .../model/QWordPathsRenderer.java | 119 ++++++++++ 4 files changed, 243 insertions(+), 209 deletions(-) create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphPathLister.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPath.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java diff --git 
a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java index 10aae867..20e4320d 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java @@ -264,218 +264,10 @@ public class QWordGraph implements Iterable { } public String compileToQuery() { - var wp = new WordPaths(QWord.beg(), QWord.end()); - return wp.render(reachability()); + return QWordPathsRenderer.render(this); } - class WordPaths { - private final Set paths; - - public final QWord begin; - public final QWord end; - - public WordPaths(Collection paths) { - this.paths = Collections.unmodifiableSet(new HashSet<>(paths)); - - begin = null; - end = null; - } - - public WordPaths(QWord begin, QWord end) { - this.begin = begin; - this.end = end; - - this.paths = Collections.unmodifiableSet(listPaths()); - } - - public String render(ReachabilityData reachability) { - if (paths.size() == 1) { - return paths.iterator().next().stream().map(QWord::word).collect(Collectors.joining(" ")); - } - - Map commonality = paths.stream().flatMap(WordPath::stream) - .collect(Collectors.groupingBy(w -> w, Collectors.summingInt(w -> 1))); - - Set commonToAll = new HashSet<>(); - Set notCommonToAll = new HashSet<>(); - - commonality.forEach((k, v) -> { - if (v == paths.size()) { - commonToAll.add(k); - } - else { - notCommonToAll.add(k); - } - }); - - StringJoiner concat = new StringJoiner(" "); - if (!commonToAll.isEmpty()) { // Case where one or more words are common to all paths - - commonToAll.stream() - .sorted(reachability.topologicalComparator()) - .map(QWord::word) - .forEach(concat::add); - - // Deal portion of the paths that do not all share a common word - if (!notCommonToAll.isEmpty()) { - - List 
nonOverlappingPortions = new ArrayList<>(); - - for (var path : paths) { - // Project the path onto the divergent nodes (i.e. remove common nodes) - var np = path.project(notCommonToAll); - if (np.isEmpty()) - continue; - nonOverlappingPortions.add(np); - } - - if (nonOverlappingPortions.size() > 1) { - var wp = new WordPaths(nonOverlappingPortions); - concat.add(wp.render(reachability)); - } - else if (!nonOverlappingPortions.isEmpty()) { - var wp = new WordPaths(nonOverlappingPortions); - concat.add(wp.render(reachability)); - } - } - } - else if (commonality.size() > 1) { // The case where no words are common to all paths - - // Sort the words by commonality, so that we can consider the most common words first - List byCommonality = commonality.entrySet().stream().sorted(Map.Entry.comparingByValue()).map(Map.Entry::getKey).collect(Collectors.toList()).reversed(); - - Map> pathsByCommonWord = new HashMap<>(); - - // Mutable copy of the paths - List allDivergentPaths = new ArrayList<>(paths); - - for (var qw : byCommonality) { - if (allDivergentPaths.isEmpty()) - break; - - var iter = allDivergentPaths.iterator(); - while (iter.hasNext()) { - var path = iter.next(); - - if (!path.contains(qw)) { - continue; - } - - pathsByCommonWord - .computeIfAbsent(qw, k -> new ArrayList<>()) - .add(path.without(qw)); // Remove the common word from the path - - iter.remove(); - } - } - - var branches = pathsByCommonWord.entrySet().stream() - .sorted(Map.Entry.comparingByKey(reachability.topologicalComparator())) - .map(e -> { - String commonWord = e.getKey().word(); - String branchPart = new WordPaths(e.getValue()).render(reachability); - return STR."\{commonWord} \{branchPart}"; - }) - .collect(Collectors.joining(" | ", " ( ", " ) ")); - - concat.add(branches); - - } - - // Remove any double spaces that may have been introduced - return concat.toString().replaceAll("\\s+", " ").trim(); - } - - - public Set listPaths() { - assert begin != null; - assert end != null; - - Set 
paths = new HashSet<>(); - listPaths(paths, new LinkedList<>(), begin, end); - return paths; - } - - private void listPaths(Set acc, - LinkedList stack, - QWord start, - QWord end) - { - stack.addLast(start); - - if (Objects.equals(start, end)) { - var nodes = new HashSet<>(stack); - - nodes.remove(this.begin); - nodes.remove(this.end); - - acc.add(new WordPath(nodes)); - } - else { - for (var next : getNext(start)) { - listPaths(acc, stack, next, end); - } - } - - stack.removeLast(); - } - } - - public static class WordPath { - private final Set nodes; - - WordPath(Collection nodes) { - this.nodes = new HashSet<>(nodes); - } - - public boolean contains(QWord node) { - return nodes.contains(node); - } - - public WordPath without(QWord word) { - Set newNodes = new HashSet<>(nodes); - newNodes.remove(word); - return new WordPath(newNodes); - } - - public Stream stream() { - return nodes.stream(); - } - - public WordPath project(Set nodes) { - return new WordPath(this.nodes.stream().filter(nodes::contains).collect(Collectors.toSet())); - } - - @Override - public boolean equals(Object o) { - if (this == o) return true; - if (o == null || getClass() != o.getClass()) return false; - - WordPath wordPath = (WordPath) o; - - return nodes.equals(wordPath.nodes); - } - - public boolean isEmpty() { - return nodes.isEmpty(); - } - - public int size() { - return nodes.size(); - } - - @Override - public int hashCode() { - return nodes.hashCode(); - } - - @Override - public String toString() { - return STR."WordPath{nodes=\{nodes}\{'}'}"; - } - } - @NotNull @Override public Iterator iterator() { diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphPathLister.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphPathLister.java new file mode 100644 index 00000000..979a419b --- /dev/null +++ 
b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphPathLister.java @@ -0,0 +1,57 @@ +package nu.marginalia.functions.searchquery.query_parser.model; + +import java.util.HashSet; +import java.util.LinkedList; +import java.util.Objects; +import java.util.Set; + +/** Utility class for listing each path in a {@link QWordGraph}, from the beginning node to the end. + * Normally this would be a risk for combinatorial explosion, but in practice the graph will be constructed + * in a way that avoids this risk. + * */ +public class QWordGraphPathLister { + private final QWordGraph graph; + + public QWordGraphPathLister(QWordGraph graph) { + this.graph = graph; + } + + static Set listPaths(QWordGraph graph) { + return new QWordGraphPathLister(graph).listPaths(); + } + + Set listPaths() { + + Set paths = new HashSet<>(); + listPaths(paths, new LinkedList<>(), QWord.beg(), QWord.end()); + return paths; + } + + void listPaths(Set acc, + LinkedList stack, + QWord start, + QWord end) + { + stack.addLast(start); + + if (Objects.equals(start, end)) { + var nodes = new HashSet<>(stack); + + // Remove the start and end nodes from the path, as these are + // not part of the query but merely used to simplify the construction + // of the graph + + nodes.remove(QWord.beg()); + nodes.remove(QWord.end()); + + acc.add(new QWordPath(nodes)); + } + else { + for (var next : graph.getNext(start)) { + listPaths(acc, stack, next, end); + } + } + + stack.removeLast(); + } +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPath.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPath.java new file mode 100644 index 00000000..f8e859e3 --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPath.java @@ -0,0 +1,66 @@ +package nu.marginalia.functions.searchquery.query_parser.model; + +import 
java.util.Collection; +import java.util.HashSet; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +/** Represents a path of QWords in a QWordGraph. Since the order of operations when + * evaluating a query does not affect its semantics, only performance, the order of the + * nodes in the path is not significant; thus the path is represented with a set. + */ +public class QWordPath { + private final Set nodes; + + QWordPath(Collection nodes) { + this.nodes = new HashSet<>(nodes); + } + + public boolean contains(QWord node) { + return nodes.contains(node); + } + + /** Construct a new path by removing a word from the path. */ + public QWordPath without(QWord word) { + Set newNodes = new HashSet<>(nodes); + newNodes.remove(word); + return new QWordPath(newNodes); + } + + public Stream stream() { + return nodes.stream(); + } + + public QWordPath project(Set nodes) { + return new QWordPath(this.nodes.stream().filter(nodes::contains).collect(Collectors.toSet())); + } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + QWordPath wordPath = (QWordPath) o; + + return nodes.equals(wordPath.nodes); + } + + public boolean isEmpty() { + return nodes.isEmpty(); + } + + public int size() { + return nodes.size(); + } + + @Override + public int hashCode() { + return nodes.hashCode(); + } + + @Override + public String toString() { + return STR."WordPath{nodes=\{nodes}\{'}'}"; + } +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java new file mode 100644 index 00000000..bc55d03b --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java @@ -0,0 +1,119 @@ +package 
nu.marginalia.functions.searchquery.query_parser.model; + +import java.util.*; +import java.util.stream.Collectors; + +/** Renders a set of QWordPaths into a human-readable infix-style expression. It's not guaranteed to find + * the globally optimal expression, but rather uses a greedy algorithm as a tradeoff in effort to outcome. + */ +class QWordPathsRenderer { + private final Set paths; + + private QWordPathsRenderer(Collection paths) { + this.paths = Collections.unmodifiableSet(new HashSet<>(paths)); + } + + private QWordPathsRenderer(QWordGraph graph) { + this.paths = Collections.unmodifiableSet(QWordGraphPathLister.listPaths(graph)); + } + + public static String render(QWordGraph graph) { + return new QWordPathsRenderer(graph).render(graph.reachability()); + } + + String render(QWordGraph.ReachabilityData reachability) { + if (paths.size() == 1) { + return paths.iterator().next().stream().map(QWord::word).collect(Collectors.joining(" ")); + } + + Map commonality = paths.stream().flatMap(QWordPath::stream) + .collect(Collectors.groupingBy(w -> w, Collectors.summingInt(w -> 1))); + + Set commonToAll = new HashSet<>(); + Set notCommonToAll = new HashSet<>(); + + commonality.forEach((k, v) -> { + if (v == paths.size()) { + commonToAll.add(k); + } else { + notCommonToAll.add(k); + } + }); + + StringJoiner concat = new StringJoiner(" "); + if (!commonToAll.isEmpty()) { // Case where one or more words are common to all paths + + commonToAll.stream() + .sorted(reachability.topologicalComparator()) + .map(QWord::word) + .forEach(concat::add); + + // Deal portion of the paths that do not all share a common word + if (!notCommonToAll.isEmpty()) { + + List nonOverlappingPortions = new ArrayList<>(); + + for (var path : paths) { + // Project the path onto the divergent nodes (i.e. 
remove common nodes) + var np = path.project(notCommonToAll); + if (np.isEmpty()) + continue; + nonOverlappingPortions.add(np); + } + + if (nonOverlappingPortions.size() > 1) { + var wp = new QWordPathsRenderer(nonOverlappingPortions); + concat.add(wp.render(reachability)); + } else if (!nonOverlappingPortions.isEmpty()) { + var wp = new QWordPathsRenderer(nonOverlappingPortions); + concat.add(wp.render(reachability)); + } + } + } else if (commonality.size() > 1) { // The case where no words are common to all paths + + // Sort the words by commonality, so that we can consider the most common words first + List byCommonality = commonality.entrySet().stream().sorted(Map.Entry.comparingByValue()).map(Map.Entry::getKey).collect(Collectors.toList()).reversed(); + + Map> pathsByCommonWord = new HashMap<>(); + + // Mutable copy of the paths + List allDivergentPaths = new ArrayList<>(paths); + + for (var commonWord : byCommonality) { + if (allDivergentPaths.isEmpty()) + break; + + var iter = allDivergentPaths.iterator(); + while (iter.hasNext()) { + var path = iter.next(); + + if (!path.contains(commonWord)) { + continue; + } + + pathsByCommonWord + .computeIfAbsent(commonWord, k -> new ArrayList<>()) + .add(path.without(commonWord)); // Remove the common word from the path + + iter.remove(); + } + } + + var branches = pathsByCommonWord.entrySet().stream() + .sorted(Map.Entry.comparingByKey(reachability.topologicalComparator())) // Sort by topological order to ensure consistent output + .map(e -> { + String commonWord = e.getKey().word(); + String branchPart = new QWordPathsRenderer(e.getValue()).render(reachability); + return STR."\{commonWord} \{branchPart}"; + }) + .collect(Collectors.joining(" | ", " ( ", " ) ")); + + concat.add(branches); + + } + + // Remove any double spaces that may have been introduced + return concat.toString().replaceAll("\\s+", " ").trim(); + } + +} From eda926767e1602eca5fe5334dcfb40620b48f827 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: 
Thu, 28 Mar 2024 13:54:30 +0100 Subject: [PATCH 14/90] (qs, WIP) Tidy it up a bit --- .../searchquery/query_parser/model/QWordPathsRenderer.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java index bc55d03b..ff4dd60c 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java @@ -10,11 +10,11 @@ class QWordPathsRenderer { private final Set paths; private QWordPathsRenderer(Collection paths) { - this.paths = Collections.unmodifiableSet(new HashSet<>(paths)); + this.paths = Set.copyOf(paths); } private QWordPathsRenderer(QWordGraph graph) { - this.paths = Collections.unmodifiableSet(QWordGraphPathLister.listPaths(graph)); + this.paths = Set.copyOf(QWordGraphPathLister.listPaths(graph)); } public static String render(QWordGraph graph) { From 0bd1e15cce3956bab72153b1775fe8db5d7d0aa7 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 28 Mar 2024 14:09:17 +0100 Subject: [PATCH 15/90] (qs, WIP) Tidy it up a bit --- .../query_parser/model/QWordPath.java | 2 + .../model/QWordPathsRenderer.java | 50 ++++++++++++------- 2 files changed, 34 insertions(+), 18 deletions(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPath.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPath.java index f8e859e3..daa2a1f1 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPath.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPath.java @@ -32,6 +32,8 
@@ public class QWordPath { return nodes.stream(); } + /** Construct a new path by projecting the path onto a set of nodes, such that + * the nodes in the new set are a subset of the provided nodes */ public QWordPath project(Set nodes) { return new QWordPath(this.nodes.stream().filter(nodes::contains).collect(Collectors.toSet())); } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java index ff4dd60c..a8e96837 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java @@ -21,17 +21,26 @@ class QWordPathsRenderer { return new QWordPathsRenderer(graph).render(graph.reachability()); } + /** Render the paths into a human-readable infix-style expression. + *

    + * This method is recursive, but the recursion depth is limited by the + * maximum length of the paths, which is hard limited to a value typically around 10, + * so we don't need to worry about stack overflows here... + */ String render(QWordGraph.ReachabilityData reachability) { if (paths.size() == 1) { return paths.iterator().next().stream().map(QWord::word).collect(Collectors.joining(" ")); } + // Find the commonality of words in the paths + Map commonality = paths.stream().flatMap(QWordPath::stream) .collect(Collectors.groupingBy(w -> w, Collectors.summingInt(w -> 1))); - Set commonToAll = new HashSet<>(); - Set notCommonToAll = new HashSet<>(); + // Break the words into two categories: those that are common to all paths, and those that are not + List commonToAll = new ArrayList<>(); + Set notCommonToAll = new HashSet<>(); commonality.forEach((k, v) -> { if (v == paths.size()) { commonToAll.add(k); @@ -40,33 +49,32 @@ class QWordPathsRenderer { } }); - StringJoiner concat = new StringJoiner(" "); - if (!commonToAll.isEmpty()) { // Case where one or more words are common to all paths + StringJoiner resultJoiner = new StringJoiner(" "); - commonToAll.stream() - .sorted(reachability.topologicalComparator()) - .map(QWord::word) - .forEach(concat::add); + if (!commonToAll.isEmpty()) { // Case where one or more words are common to all paths + commonToAll.sort(reachability.topologicalComparator()); + + for (var word : commonToAll) { + resultJoiner.add(word.word()); + } // Deal portion of the paths that do not all share a common word if (!notCommonToAll.isEmpty()) { List nonOverlappingPortions = new ArrayList<>(); + // Create a new path for each path that does not contain the common words we just printed for (var path : paths) { - // Project the path onto the divergent nodes (i.e. 
remove common nodes) var np = path.project(notCommonToAll); if (np.isEmpty()) continue; nonOverlappingPortions.add(np); } - if (nonOverlappingPortions.size() > 1) { + // Recurse into the non-overlapping portions + if (!nonOverlappingPortions.isEmpty()) { var wp = new QWordPathsRenderer(nonOverlappingPortions); - concat.add(wp.render(reachability)); - } else if (!nonOverlappingPortions.isEmpty()) { - var wp = new QWordPathsRenderer(nonOverlappingPortions); - concat.add(wp.render(reachability)); + resultJoiner.add(wp.render(reachability)); } } } else if (commonality.size() > 1) { // The case where no words are common to all paths @@ -79,6 +87,7 @@ class QWordPathsRenderer { // Mutable copy of the paths List allDivergentPaths = new ArrayList<>(paths); + // Break the paths into branches by the first common word they contain, in order of decreasing commonality for (var commonWord : byCommonality) { if (allDivergentPaths.isEmpty()) break; @@ -91,10 +100,15 @@ class QWordPathsRenderer { continue; } + // Remove the common word from the path + var newPath = path.without(commonWord); + pathsByCommonWord .computeIfAbsent(commonWord, k -> new ArrayList<>()) - .add(path.without(commonWord)); // Remove the common word from the path + .add(newPath); + // Remove the path from the list of divergent paths since we've now accounted for it and + // we don't want redundant branches: iter.remove(); } } @@ -103,17 +117,17 @@ class QWordPathsRenderer { .sorted(Map.Entry.comparingByKey(reachability.topologicalComparator())) // Sort by topological order to ensure consistent output .map(e -> { String commonWord = e.getKey().word(); + // Recurse into the branches: String branchPart = new QWordPathsRenderer(e.getValue()).render(reachability); return STR."\{commonWord} \{branchPart}"; }) .collect(Collectors.joining(" | ", " ( ", " ) ")); - concat.add(branches); - + resultJoiner.add(branches); } // Remove any double spaces that may have been introduced - return 
concat.toString().replaceAll("\\s+", " ").trim(); + return resultJoiner.toString().replaceAll("\\s+", " ").trim(); } } From 98a1adbf8174d8e12b5d449cff19dfa8a40a7a02 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 28 Mar 2024 14:18:26 +0100 Subject: [PATCH 16/90] (qs, WIP) Tidy it up a bit --- .../query_parser/model/QWordGraph.java | 3 ++- .../model/QWordPathsRenderer.java | 25 +++++++++++++------ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java index 20e4320d..272b7b35 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java @@ -225,7 +225,8 @@ public class QWordGraph implements Iterable { } public Comparator topologicalComparator() { - return Comparator.comparing(sortOrder::get); + Comparator comp = Comparator.comparing(sortOrder::get); + return comp.thenComparing(QWord::ord); } } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java index a8e96837..762a7d1b 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java @@ -21,6 +21,13 @@ class QWordPathsRenderer { return new QWordPathsRenderer(graph).render(graph.reachability()); } + + private static String render(Collection paths, + QWordGraph.ReachabilityData reachability) + { + return new QWordPathsRenderer(paths).render(reachability); + } + /** Render the 
paths into a human-readable infix-style expression. *

    * This method is recursive, but the recursion depth is limited by the @@ -34,8 +41,7 @@ class QWordPathsRenderer { // Find the commonality of words in the paths - Map commonality = paths.stream().flatMap(QWordPath::stream) - .collect(Collectors.groupingBy(w -> w, Collectors.summingInt(w -> 1))); + Map commonality = nodeCommonality(); // Break the words into two categories: those that are common to all paths, and those that are not @@ -72,10 +78,7 @@ class QWordPathsRenderer { } // Recurse into the non-overlapping portions - if (!nonOverlappingPortions.isEmpty()) { - var wp = new QWordPathsRenderer(nonOverlappingPortions); - resultJoiner.add(wp.render(reachability)); - } + resultJoiner.add(render(nonOverlappingPortions, reachability)); } } else if (commonality.size() > 1) { // The case where no words are common to all paths @@ -117,8 +120,10 @@ class QWordPathsRenderer { .sorted(Map.Entry.comparingByKey(reachability.topologicalComparator())) // Sort by topological order to ensure consistent output .map(e -> { String commonWord = e.getKey().word(); + // Recurse into the branches: - String branchPart = new QWordPathsRenderer(e.getValue()).render(reachability); + String branchPart = render(e.getValue(), reachability); + return STR."\{commonWord} \{branchPart}"; }) .collect(Collectors.joining(" | ", " ( ", " ) ")); @@ -130,4 +135,10 @@ class QWordPathsRenderer { return resultJoiner.toString().replaceAll("\\s+", " ").trim(); } + /** Compute how many paths each word is part of */ + private Map nodeCommonality() { + return paths.stream().flatMap(QWordPath::stream) + .collect(Collectors.groupingBy(w -> w, Collectors.summingInt(w -> 1))); + } + } From dc65b2ee01a25a51ed1dad00c50cfbdd797e8252 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 28 Mar 2024 16:37:23 +0100 Subject: [PATCH 17/90] (qs, WIP) Clean up dead code --- .../query_parser/model/QWordGraph.java | 37 ++----------------- .../query_parser/model/QWordGraphTest.java | 34 ----------------- 2 files 
changed, 3 insertions(+), 68 deletions(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java index 272b7b35..4da9a6d1 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java @@ -10,7 +10,8 @@ import java.util.stream.Stream; * with a single start node and a single end node, denoted by QWord.beg() and QWord.end() respectively. *

    * Naively, every path from the start to the end node should represent a valid query variant, although in - * practice it is desirable to be clever about how to evaluate the paths, to avoid combinatorial explosion. + * practice it is desirable to be clever about how to evaluate the paths, to avoid a large number of queries + * being generated. */ public class QWordGraph implements Iterable { @@ -85,6 +86,7 @@ public class QWordGraph implements Iterable { public List links() { return Collections.unmodifiableList(links); } + public List nodes() { return links.stream() .flatMap(l -> Stream.of(l.from(), l.to())) @@ -120,39 +122,6 @@ public class QWordGraph implements Iterable { .toList(); } - // Returns true if removing the word would disconnect the graph - // so that there is no path from 'begin' to 'end'. This is useful - // in breaking up the graph into smaller component subgraphs, and - // understanding which vertexes can be re-ordered without changing - // the semantics of the encoded query. 
- public boolean isBypassed(QWord word, QWord begin, QWord end) { - Set edge = new HashSet<>(); - Set visited = new HashSet<>(); - - edge.add(begin); - - while (!edge.isEmpty()) { - Set next = new HashSet<>(); - - for (var w : edge) { - // Skip the word we're trying find a bypassing route for - if (w.ord() == word.ord()) - continue; - - if (Objects.equals(w, end)) - return true; - - next.addAll(getNext(w)); - } - - next.removeAll(visited); - visited.addAll(next); - edge = next; - } - - return false; - } - public Map> forwardReachability() { Map> ret = new HashMap<>(); diff --git a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java index f3201b9d..9c47e980 100644 --- a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java +++ b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java @@ -9,40 +9,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals; class QWordGraphTest { - @Test - public void testAddConstructor() { - QWordGraph graph = new QWordGraph("hello", "world"); - - System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); - System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); - System.out.println(graph.compileToQuery()); - graph.forwardReachability().entrySet().stream().sorted(Comparator.comparing(e -> e.getKey().ord())).forEach(System.out::println); - graph.links().forEach(System.out::println); - System.out.println("--"); - graph.nodes().forEach(System.out::println); - System.out.println("--"); - graph.addVariant(graph.nodes().get(1), "sup"); - System.out.println(graph.compileToQuery()); - graph.forwardReachability().entrySet().stream().sorted(Comparator.comparing(e -> e.getKey().ord())).forEach(System.out::println); - 
System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); - System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); - System.out.println("--"); - graph.links().forEach(System.out::println); - System.out.println("--"); - graph.nodes().forEach(System.out::println); - - graph.addVariantForSpan(graph.nodes().get(1), graph.nodes().get(2), "heyall"); - graph.addVariant(graph.nodes().get(2), "globe"); - System.out.println(graph.compileToQuery()); - System.out.println(graph.isBypassed(graph.nodes().get(1), QWord.beg(), QWord.end())); - System.out.println(graph.isBypassed(graph.nodes().get(2), QWord.beg(), QWord.end())); - System.out.println("--"); - graph.links().forEach(System.out::println); - System.out.println("--"); - graph.nodes().forEach(System.out::println); - graph.forwardReachability().entrySet().stream().sorted(Comparator.comparing(e -> e.getKey().ord())).forEach(System.out::println); - } - @Test void forwardReachability() { // Construct a graph like From 8cb9455c324d21ff64459c46eab97f847ce777c2 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 29 Mar 2024 12:40:27 +0100 Subject: [PATCH 18/90] (qs, WIP) Fix edge cases in query compilation This addresses the relatively common case where the graph consists of two segments, such as x y, z w; in this case we want an output like (x_y) (z w | z_w) | x y (z_w). The generated output does somewhat pessimize a few other cases, but this one is arguably more important. 
--- .../query_parser/QueryExpansion.java | 13 +++++---- .../searchquery/query_parser/model/QWord.java | 4 +++ .../query_parser/model/QWordGraph.java | 14 +++++++--- .../model/QWordGraphPathLister.java | 2 +- .../model/QWordPathsRenderer.java | 27 +++++++++++-------- .../query_parser/model/QWordGraphTest.java | 2 +- 6 files changed, 41 insertions(+), 21 deletions(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java index c216918e..6415751b 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java @@ -4,6 +4,7 @@ import ca.rmen.porterstemmer.PorterStemmer; import com.google.inject.Inject; import nu.marginalia.functions.searchquery.query_parser.model.QWord; import nu.marginalia.functions.searchquery.query_parser.model.QWordGraph; +import nu.marginalia.functions.searchquery.query_parser.model.QWordPathsRenderer; import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.apache.commons.lang3.StringUtils; @@ -11,6 +12,8 @@ import org.apache.commons.lang3.StringUtils; import java.util.ArrayList; import java.util.List; import java.util.regex.Pattern; +import java.util.stream.Collectors; +import java.util.stream.IntStream; public class QueryExpansion { private static final PorterStemmer ps = new PorterStemmer(); @@ -32,7 +35,7 @@ public class QueryExpansion { this.lexicon = lexicon; } - public QWordGraph expandQuery(List words) { + public String expandQuery(List words) { QWordGraph graph = new QWordGraph(words); @@ -40,7 +43,7 @@ public class QueryExpansion { strategy.expand(graph); } - return graph; + return QWordPathsRenderer.render(graph); } private static final Pattern dashPattern = 
Pattern.compile("-"); @@ -98,16 +101,16 @@ public class QueryExpansion { nodes.add(qw); } - String[] words = nodes.stream().map(QWord::word).toArray(String[]::new); + String[] words = nodes.stream().map(QWord::stemmed).toArray(String[]::new); // Look for known segments within the query for (int length = 2; length < Math.min(10, words.length); length++) { for (var segment : lexicon.findSegments(length, words)) { int start = segment.start(); int end = segment.start() + segment.length(); - var word = StringUtils.join(words, "_", start, end); + var word = IntStream.range(start, end).mapToObj(nodes::get).map(QWord::word).collect(Collectors.joining("_")); - graph.addVariantForSpan(nodes.get(start), nodes.get(end), word); + graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word); } } } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWord.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWord.java index b7c4e594..eac2e68b 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWord.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWord.java @@ -44,4 +44,8 @@ public record QWord( public QWord(int ord, QWord original, String word) { this(ord, true, ps.stemWord(word), word, original.original); } + + public String toString() { + return STR."q{\{word}}"; + } } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java index 4da9a6d1..a8b1a768 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java @@ -50,9 +50,9 @@ public class QWordGraph 
implements Iterable { var newWord = new QWord(wordId++, original, word); - for (var prev : getPrevOriginal(original)) + for (var prev : getPrev(original)) addLink(prev, newWord); - for (var next : getNextOriginal(original)) + for (var next : getNext(original)) addLink(newWord, next); } @@ -236,7 +236,15 @@ public class QWordGraph implements Iterable { public String compileToQuery() { return QWordPathsRenderer.render(this); } - + public String compileToDot() { + StringBuilder sb = new StringBuilder(); + sb.append("digraph {\n"); + for (var link : links) { + sb.append(STR."\"\{link.from().word()}\" -> \"\{link.to.word()}\";\n"); + } + sb.append("}\n"); + return sb.toString(); + } @NotNull @Override diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphPathLister.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphPathLister.java index 979a419b..f26c01f7 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphPathLister.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphPathLister.java @@ -16,7 +16,7 @@ public class QWordGraphPathLister { this.graph = graph; } - static Set listPaths(QWordGraph graph) { + public static Set listPaths(QWordGraph graph) { return new QWordGraphPathLister(graph).listPaths(); } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java index 762a7d1b..b1ee7956 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordPathsRenderer.java @@ -6,7 +6,7 @@ import 
java.util.stream.Collectors; /** Renders a set of QWordPaths into a human-readable infix-style expression. It's not guaranteed to find * the globally optimal expression, but rather uses a greedy algorithm as a tradeoff in effort to outcome. */ -class QWordPathsRenderer { +public class QWordPathsRenderer { private final Set paths; private QWordPathsRenderer(Collection paths) { @@ -41,7 +41,7 @@ class QWordPathsRenderer { // Find the commonality of words in the paths - Map commonality = nodeCommonality(); + Map commonality = nodeCommonality(paths); // Break the words into two categories: those that are common to all paths, and those that are not @@ -82,32 +82,30 @@ class QWordPathsRenderer { } } else if (commonality.size() > 1) { // The case where no words are common to all paths - // Sort the words by commonality, so that we can consider the most common words first - List byCommonality = commonality.entrySet().stream().sorted(Map.Entry.comparingByValue()).map(Map.Entry::getKey).collect(Collectors.toList()).reversed(); + // Sort the words by commonality, so that we can consider the most common words first Map> pathsByCommonWord = new HashMap<>(); // Mutable copy of the paths List allDivergentPaths = new ArrayList<>(paths); // Break the paths into branches by the first common word they contain, in order of decreasing commonality - for (var commonWord : byCommonality) { - if (allDivergentPaths.isEmpty()) - break; + while (!allDivergentPaths.isEmpty()) { + QWord mostCommon = mostCommonQWord(allDivergentPaths); var iter = allDivergentPaths.iterator(); while (iter.hasNext()) { var path = iter.next(); - if (!path.contains(commonWord)) { + if (!path.contains(mostCommon)) { continue; } // Remove the common word from the path - var newPath = path.without(commonWord); + var newPath = path.without(mostCommon); pathsByCommonWord - .computeIfAbsent(commonWord, k -> new ArrayList<>()) + .computeIfAbsent(mostCommon, k -> new ArrayList<>()) .add(newPath); // Remove the path from the 
list of divergent paths since we've now accounted for it and @@ -136,9 +134,16 @@ class QWordPathsRenderer { } /** Compute how many paths each word is part of */ - private Map nodeCommonality() { + private static Map nodeCommonality(Collection paths) { return paths.stream().flatMap(QWordPath::stream) .collect(Collectors.groupingBy(w -> w, Collectors.summingInt(w -> 1))); } + private static QWord mostCommonQWord(Collection paths) { + assert !paths.isEmpty(); + return nodeCommonality(paths).entrySet().stream() + .max(Map.Entry.comparingByValue()) + .map(Map.Entry::getKey) + .orElseThrow(); + } } diff --git a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java index 9c47e980..f985cd13 100644 --- a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java +++ b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java @@ -110,6 +110,6 @@ class QWordGraphTest { QWordGraph graph = new QWordGraph("a", "b", "c"); graph.addVariant(graph.node("c"), "d"); graph.addVariant(graph.node("b"), "e"); - assertEquals("a ( b ( c | d ) | c e )", graph.compileToQuery()); + assertEquals("a ( c ( b | e ) | d ( b | e ) )", graph.compileToQuery()); } } \ No newline at end of file From a3a6d6292b5df79867b0c5d5c2d19dc4b434481a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 4 Apr 2024 20:17:58 +0200 Subject: [PATCH 19/90] (qs, index) New query model integrated with index service. Seems to work, tests are green and initial testing finds no errors. Still a bit untested, committing WIP as-is because it would suck to lose weeks of work due to a drive failure or something. 
--- code/functions/search-query/api/build.gradle | 1 + .../api/searchquery/IndexProtobufCodec.java | 37 ++-- .../api/searchquery/QueryProtobufCodec.java | 21 +-- .../model/compiled/CompiledQuery.java | 76 ++++++++ .../model/compiled/CompiledQueryLong.java | 42 +++++ .../model/compiled/CompiledQueryParser.java | 113 ++++++++++++ .../searchquery/model/compiled/CqData.java | 51 ++++++ .../model/compiled/CqDataLong.java | 27 +++ .../model/compiled/CqExpression.java | 170 ++++++++++++++++++ .../aggregate/CompiledQueryAggregates.java | 46 +++++ .../aggregate/CqBooleanAggregate.java | 40 +++++ .../aggregate/CqDoubleSumOperator.java | 40 +++++ .../aggregate/CqIntMaxMinOperator.java | 41 +++++ .../aggregate/CqLongBitmaskOperator.java | 40 +++++ .../aggregate/CqQueryPathsOperator.java | 75 ++++++++ .../model/query/QueryResponse.java | 6 +- .../{SearchSubquery.java => SearchQuery.java} | 26 +-- .../model/query/SearchSpecification.java | 2 +- .../model/results/SearchResultItem.java | 4 +- .../results/SearchResultKeywordScore.java | 9 +- .../api/src/main/protobuf/query-api.proto | 16 +- .../compiled/CompiledQueryParserTest.java | 79 ++++++++ .../CompiledQueryAggregatesTest.java | 35 ++++ .../index/client/IndexProtobufCodecTest.java | 7 +- .../searchquery/svc/QueryFactory.java | 50 +++--- .../svc/QuerySearchTermsAccumulator.java | 8 +- .../query/svc/QueryFactoryTest.java | 33 ++-- .../index/ReverseIndexEntrySource.java | 2 +- .../nu/marginalia/index/IndexGrpcService.java | 32 ++-- .../index/index/CombinedIndexReader.java | 7 + .../index/index/IndexQueryBuilderImpl.java | 16 ++ .../index/index/QueryBranchWalker.java | 78 ++++++++ .../marginalia/index/index/StatefulIndex.java | 105 +++++++---- .../index/model/SearchParameters.java | 27 +-- .../marginalia/index/model/SearchTerms.java | 26 ++- .../index/model/SearchTermsUtil.java | 20 --- .../index/results/IndexMetadataService.java | 43 ++--- .../results/IndexResultValuationContext.java | 109 +++++------ 
.../results/IndexResultValuatorService.java | 28 +-- .../ranking/results/ResultValuator.java | 36 ++-- .../ranking/results/factors/Bm25Factor.java | 29 ++- .../results/factors/TermCoherenceFactor.java | 17 +- .../index/query/IndexQueryBuilder.java | 3 + .../index/query/filter/QueryFilterAllOf.java | 57 ++++++ .../index/query/filter/QueryFilterAnyOf.java | 35 ++-- .../query/filter/QueryFilterLetThrough.java | 2 +- .../index/query/filter/QueryFilterNoPass.java | 2 +- .../QueryFilterStepExcludeFromPredicate.java | 2 +- .../filter/QueryFilterStepFromPredicate.java | 2 +- .../query/filter/QueryFilterStepIfTest.java | 26 +++ ...IndexQueryServiceIntegrationSmokeTest.java | 22 ++- .../IndexQueryServiceIntegrationTest.java | 106 ++++++----- .../index/index/QueryBranchWalkerTest.java | 59 ++++++ .../IndexResultDomainDeduplicatorTest.java | 5 +- .../ranking/results/ResultValuatorTest.java | 18 +- .../factors/TermCoherenceFactorTest.java | 19 +- .../marginalia/array/algo/LongArrayBase.java | 8 + .../array/buffer/LongQueryBuffer.java | 43 +++-- .../array/algo/LongArraySearchTest.java | 4 +- .../java/nu/marginalia/btree/BTreeReader.java | 4 +- .../BTreeReaderRejectRetainWithIndexTest.java | 6 +- ...reeReaderRejectRetainWithoutIndexTest.java | 7 +- .../search/SearchQueryParamFactory.java | 4 +- .../search/command/SearchAdtechParameter.java | 4 +- .../search/command/SearchJsParameter.java | 4 +- .../search/model/SearchProfile.java | 4 +- 66 files changed, 1613 insertions(+), 503 deletions(-) create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParser.java create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java create mode 
100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataLong.java create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqExpression.java create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqBooleanAggregate.java create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqDoubleSumOperator.java create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqLongBitmaskOperator.java create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqQueryPathsOperator.java rename code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/{SearchSubquery.java => SearchQuery.java} (76%) create mode 100644 code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParserTest.java create mode 100644 code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregatesTest.java create mode 100644 code/index/java/nu/marginalia/index/index/QueryBranchWalker.java create mode 100644 code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAllOf.java create mode 100644 code/index/test/nu/marginalia/index/index/QueryBranchWalkerTest.java diff --git a/code/functions/search-query/api/build.gradle b/code/functions/search-query/api/build.gradle index 727b5b86..1a8d55d2 100644 --- a/code/functions/search-query/api/build.gradle +++ b/code/functions/search-query/api/build.gradle @@ -30,6 +30,7 @@ dependencies { 
implementation libs.notnull implementation libs.guice implementation libs.gson + implementation libs.commons.lang3 implementation libs.bundles.protobuf implementation libs.bundles.grpc implementation libs.fastutil diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java index 4b2f0032..4d2cf7a6 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java @@ -1,7 +1,6 @@ package nu.marginalia.api.searchquery; -import nu.marginalia.api.searchquery.*; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.Bm25Parameters; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.index.query.limit.QueryLimits; @@ -45,33 +44,37 @@ public class IndexProtobufCodec { .build(); } - public static SearchSubquery convertSearchSubquery(RpcSubquery subquery) { + public static SearchQuery convertRpcQuery(RpcQuery query) { List> coherences = new ArrayList<>(); - for (int j = 0; j < subquery.getCoherencesCount(); j++) { - var coh = subquery.getCoherences(j); + for (int j = 0; j < query.getCoherencesCount(); j++) { + var coh = query.getCoherences(j); coherences.add(new ArrayList<>(coh.getCoherencesList())); } - return new SearchSubquery( - subquery.getIncludeList(), - subquery.getExcludeList(), - subquery.getAdviceList(), - subquery.getPriorityList(), + return new SearchQuery( + query.getCompiledQuery(), + query.getIncludeList(), + query.getExcludeList(), + query.getAdviceList(), + query.getPriorityList(), coherences ); } - public static RpcSubquery convertSearchSubquery(SearchSubquery searchSubquery) { + public static RpcQuery 
convertRpcQuery(SearchQuery searchQuery) { var subqueryBuilder = - RpcSubquery.newBuilder() - .addAllAdvice(searchSubquery.getSearchTermsAdvice()) - .addAllExclude(searchSubquery.getSearchTermsExclude()) - .addAllInclude(searchSubquery.getSearchTermsInclude()) - .addAllPriority(searchSubquery.getSearchTermsPriority()); - for (var coherences : searchSubquery.searchTermCoherences) { + RpcQuery.newBuilder() + .setCompiledQuery(searchQuery.compiledQuery) + .addAllInclude(searchQuery.getSearchTermsInclude()) + .addAllAdvice(searchQuery.getSearchTermsAdvice()) + .addAllExclude(searchQuery.getSearchTermsExclude()) + .addAllPriority(searchQuery.getSearchTermsPriority()); + + for (var coherences : searchQuery.searchTermCoherences) { subqueryBuilder.addCoherencesBuilder().addAllCoherences(coherences); } + return subqueryBuilder.build(); } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java index 28d14c82..f0113870 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java @@ -2,7 +2,6 @@ package nu.marginalia.api.searchquery; import lombok.SneakyThrows; import nu.marginalia.api.searchquery.model.query.SearchSpecification; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.SearchResultItem; @@ -14,7 +13,6 @@ import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.query.QueryResponse; import java.util.ArrayList; -import java.util.List; public class QueryProtobufCodec { @@ -23,9 +21,7 @@ public class QueryProtobufCodec { 
builder.addAllDomains(request.getDomainIdsList()); - for (var subquery : query.specs.subqueries) { - builder.addSubqueries(IndexProtobufCodec.convertSearchSubquery(subquery)); - } + builder.setQuery(IndexProtobufCodec.convertRpcQuery(query.specs.query)); builder.setSearchSetIdentifier(query.specs.searchSetIdentifier); builder.setHumanQuery(request.getHumanQuery()); @@ -51,9 +47,7 @@ public class QueryProtobufCodec { public static RpcIndexQuery convertQuery(String humanQuery, ProcessedQuery query) { var builder = RpcIndexQuery.newBuilder(); - for (var subquery : query.specs.subqueries) { - builder.addSubqueries(IndexProtobufCodec.convertSearchSubquery(subquery)); - } + builder.setQuery(IndexProtobufCodec.convertRpcQuery(query.specs.query)); builder.setSearchSetIdentifier(query.specs.searchSetIdentifier); builder.setHumanQuery(humanQuery); @@ -147,8 +141,8 @@ public class QueryProtobufCodec { private static SearchResultKeywordScore convertKeywordScore(RpcResultKeywordScore keywordScores) { return new SearchResultKeywordScore( - keywordScores.getSubquery(), keywordScores.getKeyword(), + -1, // termId is internal to index service keywordScores.getEncodedWordMetadata(), keywordScores.getEncodedDocMetadata(), keywordScores.getHtmlFeatures() @@ -156,14 +150,8 @@ public class QueryProtobufCodec { } private static SearchSpecification convertSearchSpecification(RpcIndexQuery specs) { - List subqueries = new ArrayList<>(specs.getSubqueriesCount()); - - for (int i = 0; i < specs.getSubqueriesCount(); i++) { - subqueries.add(IndexProtobufCodec.convertSearchSubquery(specs.getSubqueries(i))); - } - return new SearchSpecification( - subqueries, + IndexProtobufCodec.convertRpcQuery(specs.getQuery()), specs.getDomainsList(), specs.getSearchSetIdentifier(), specs.getHumanQuery(), @@ -182,7 +170,6 @@ public class QueryProtobufCodec { .addAllDomainIds(params.domainIds()) .addAllTacitAdvice(params.tacitAdvice()) .addAllTacitExcludes(params.tacitExcludes()) - 
.addAllTacitIncludes(params.tacitIncludes()) .addAllTacitPriority(params.tacitPriority()) .setHumanQuery(params.humanQuery()) .setQueryLimits(IndexProtobufCodec.convertQueryLimits(params.limits())) diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java new file mode 100644 index 00000000..3ae850a3 --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java @@ -0,0 +1,76 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import org.jetbrains.annotations.NotNull; + +import java.util.Iterator; +import java.util.function.*; +import java.util.stream.IntStream; +import java.util.stream.Stream; + + +/** A compiled index service query. The class separates the topology of the query from the data, + * and it's possible to create new queries supplanting the data */ +public class CompiledQuery implements Iterable { + + /** The root expression, conveys the topology of the query */ + public final CqExpression root; + + private final CqData data; + + public CompiledQuery(CqExpression root, CqData data) { + this.root = root; + this.data = data; + } + + public CompiledQuery(CqExpression root, T[] data) { + this.root = root; + this.data = new CqData<>(data); + } + + /** Exists for testing, creates a simple query that ANDs all the provided items */ + public static CompiledQuery just(T... 
item) { + return new CompiledQuery<>(new CqExpression.And( + IntStream.range(0, item.length).mapToObj(CqExpression.Word::new).toList() + ), item); + } + + /** Create a new CompiledQuery mapping the leaf nodes using the provided mapper */ + public CompiledQuery map(Class clazz, Function mapper) { + return new CompiledQuery<>( + root, + data.map(clazz, mapper) + ); + } + + public CompiledQueryLong mapToLong(ToLongFunction mapper) { + return new CompiledQueryLong(root, data.mapToLong(mapper)); + } + + public CqExpression root() { + return root; + } + + public Stream stream() { + return data.stream(); + } + + public IntStream indices() { + return IntStream.range(0, data.size()); + } + + public T at(int index) { + return data.get(index); + } + + @NotNull + @Override + public Iterator iterator() { + return stream().iterator(); + } + + public int size() { + return data.size(); + } + + +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java new file mode 100644 index 00000000..639778dc --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java @@ -0,0 +1,42 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import org.jetbrains.annotations.NotNull; + +import java.util.Iterator; +import java.util.stream.IntStream; +import java.util.stream.LongStream; + + +/** A compiled index service query */ +public class CompiledQueryLong implements Iterable { + private final CqExpression root; + private final CqDataLong data; + + public CompiledQueryLong(CqExpression root, CqDataLong data) { + this.root = root; + this.data = data; + } + + + public CqExpression root() { + return root; + } + + public LongStream stream() { + return data.stream(); + } + + public IntStream indices() { + return IntStream.range(0, data.size()); + } + + public long 
at(int index) { + return data.get(index); + } + + @NotNull + @Override + public Iterator iterator() { + return stream().iterator(); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParser.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParser.java new file mode 100644 index 00000000..ae197fb9 --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParser.java @@ -0,0 +1,113 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import org.apache.commons.lang3.StringUtils; + +import java.util.*; + +/** Parser for a compiled index query */ +public class CompiledQueryParser { + + public static CompiledQuery parse(String query) { + List parts = tokenize(query); + + if (parts.isEmpty()) { + return new CompiledQuery<>( + CqExpression.empty(), + new CqData<>(new String[0]) + ); + } + + // We aren't interested in a binary tree representation, but an n-ary tree one, + // so a somewhat unusual parsing technique is used to avoid having an additional + // flattening step at the end. 
+ + // This is only possible due to the trivial and unambiguous grammar of the compiled queries + + List parenState = new ArrayList<>(); + parenState.add(new AndOrState()); + + Map wordIds = new HashMap<>(); + + for (var part : parts) { + var head = parenState.getLast(); + + if (part.equals("|")) { + head.or(); + } + else if (part.equals("(")) { + parenState.addLast(new AndOrState()); + } + else if (part.equals(")")) { + if (parenState.size() < 2) { + throw new IllegalStateException("Mismatched parentheses in expression: " + query); + } + parenState.removeLast(); + parenState.getLast().and(head.closeOr()); + } + else { + head.and( + new CqExpression.Word( + wordIds.computeIfAbsent(part, p -> wordIds.size()) + ) + ); + } + } + + if (parenState.size() != 1) + throw new IllegalStateException("Mismatched parentheses in expression: " + query); + + // Construct the CompiledQuery object with String:s as leaves + var root = parenState.getLast().closeOr(); + + String[] cqData = new String[wordIds.size()]; + wordIds.forEach((w, i) -> cqData[i] = w); + return new CompiledQuery<>(root, new CqData<>(cqData)); + + } + + private static class AndOrState { + private List andState = new ArrayList<>(); + private List orState = new ArrayList<>(); + + /** Add a new item to the and-list */ + public void and(CqExpression e) { + andState.add(e); + } + + /** Turn the and-list into an expression on the or-list, and then start a new and-list */ + public void or() { + closeAnd(); + + andState = new ArrayList<>(); + } + + /** Turn the and-list into an And-expression in the or-list */ + private void closeAnd() { + if (andState.size() == 1) + orState.add(andState.getFirst()); + else if (!andState.isEmpty()) + orState.add(new CqExpression.And(andState)); + } + + /** Finalize the current and-list, then turn the or-list into an Or-expression */ + public CqExpression closeOr() { + closeAnd(); + + if (orState.isEmpty()) + return CqExpression.empty(); + if (orState.size() == 1) + return 
orState.getFirst(); + + return new CqExpression.Or(orState); + } + } + + private static List tokenize(String query) { + // Each token is guaranteed to be separated by one or more space characters + + return Arrays.stream(StringUtils.split(query, ' ')) + .filter(StringUtils::isNotBlank) + .toList(); + } + +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java new file mode 100644 index 00000000..b1565dc0 --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java @@ -0,0 +1,51 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import java.lang.reflect.Array; +import java.util.Arrays; +import java.util.function.Function; +import java.util.function.ToDoubleFunction; +import java.util.function.ToLongFunction; +import java.util.stream.Stream; + +public class CqData { + private final T[] data; + + public CqData(T[] data) { + this.data = data; + } + + @SuppressWarnings("unchecked") + public CqData map(Class clazz, Function mapper) { + T2[] newData = (T2[]) Array.newInstance(clazz, data.length); + for (int i = 0; i < data.length; i++) { + newData[i] = mapper.apply((T) data[i]); + } + + return new CqData<>(newData); + } + + public CqDataLong mapToLong(ToLongFunction mapper) { + long[] newData = new long[data.length]; + for (int i = 0; i < data.length; i++) { + newData[i] = mapper.applyAsLong((T) data[i]); + } + + return new CqDataLong(newData); + } + + public T get(int i) { + return data[i]; + } + + public T get(CqExpression.Word w) { + return data[w.idx()]; + } + + public Stream stream() { + return Arrays.stream(data); + } + + public int size() { + return data.length; + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataLong.java 
b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataLong.java new file mode 100644 index 00000000..8049631e --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataLong.java @@ -0,0 +1,27 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import java.util.Arrays; +import java.util.stream.LongStream; + +public class CqDataLong { + private final long[] data; + + public CqDataLong(long[] data) { + this.data = data; + } + + public long get(int i) { + return data[i]; + } + public long get(CqExpression.Word w) { + return data[w.idx()]; + } + + public LongStream stream() { + return Arrays.stream(data); + } + + public int size() { + return data.length; + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqExpression.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqExpression.java new file mode 100644 index 00000000..e9972526 --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqExpression.java @@ -0,0 +1,170 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import java.util.List; +import java.util.StringJoiner; +import java.util.stream.Stream; + +/** Expression in a parsed index service query + * + */ +public sealed interface CqExpression { + + Stream stream(); + + /** @see nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates */ + long visit(LongVisitor visitor); + /** @see nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates */ + double visit(DoubleVisitor visitor); + /** @see nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates */ + int visit(IntVisitor visitor); + /** @see nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates */ + boolean visit(BoolVisitor visitor); + + T visit(ObjectVisitor visitor); + + static 
CqExpression empty() { + return new Or(List.of()); + } + + + record And(List parts) implements CqExpression { + @Override + public Stream stream() { + return parts.stream().flatMap(CqExpression::stream); + } + + @Override + public long visit(LongVisitor visitor) { + return visitor.onAnd(parts); + } + + @Override + public double visit(DoubleVisitor visitor) { + return visitor.onAnd(parts); + } + + @Override + public int visit(IntVisitor visitor) { + return visitor.onAnd(parts); + } + + @Override + public boolean visit(BoolVisitor visitor) { + return visitor.onAnd(parts); + } + + @Override + public T visit(ObjectVisitor visitor) { return visitor.onAnd(parts); } + + public String toString() { + StringJoiner sj = new StringJoiner(", ", "And[ ", "]"); + parts.forEach(part -> sj.add(part.toString())); + return sj.toString(); + } + + } + + record Or(List parts) implements CqExpression { + @Override + public Stream stream() { + return parts.stream().flatMap(CqExpression::stream); + } + + @Override + public long visit(LongVisitor visitor) { + return visitor.onOr(parts); + } + + @Override + public double visit(DoubleVisitor visitor) { + return visitor.onOr(parts); + } + + @Override + public int visit(IntVisitor visitor) { + return visitor.onOr(parts); + } + + @Override + public boolean visit(BoolVisitor visitor) { + return visitor.onOr(parts); + } + + @Override + public T visit(ObjectVisitor visitor) { return visitor.onOr(parts); } + + public String toString() { + StringJoiner sj = new StringJoiner(", ", "Or[ ", "]"); + parts.forEach(part -> sj.add(part.toString())); + return sj.toString(); + } + + + } + + record Word(int idx) implements CqExpression { + @Override + public Stream stream() { + return Stream.of(this); + } + + @Override + public long visit(LongVisitor visitor) { + return visitor.onLeaf(idx); + } + + @Override + public double visit(DoubleVisitor visitor) { + return visitor.onLeaf(idx); + } + + @Override + public int visit(IntVisitor visitor) { + return 
visitor.onLeaf(idx); + } + + @Override + public boolean visit(BoolVisitor visitor) { + return visitor.onLeaf(idx); + } + + @Override + public T visit(ObjectVisitor visitor) { return visitor.onLeaf(idx); } + + @Override + public String toString() { + return Integer.toString(idx); + } + } + + interface LongVisitor { + long onAnd(List parts); + long onOr(List parts); + long onLeaf(int idx); + } + + interface IntVisitor { + int onAnd(List parts); + int onOr(List parts); + int onLeaf(int idx); + } + + interface BoolVisitor { + boolean onAnd(List parts); + boolean onOr(List parts); + boolean onLeaf(int idx); + } + + interface DoubleVisitor { + double onAnd(List parts); + double onOr(List parts); + double onLeaf(int idx); + } + + interface ObjectVisitor { + T onAnd(List parts); + T onOr(List parts); + T onLeaf(int idx); + } + +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java new file mode 100644 index 00000000..209acbee --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java @@ -0,0 +1,46 @@ +package nu.marginalia.api.searchquery.model.compiled.aggregate; + +import it.unimi.dsi.fastutil.longs.LongSet; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; + +import java.util.ArrayList; +import java.util.List; +import java.util.function.*; + +public class CompiledQueryAggregates { + /** Compiled query aggregate that for a single boolean that treats or-branches as logical OR, + * and and-branches as logical AND operations. Will return true if there exists a path through + * the query where the provided predicate returns true for each item. 
+ */ + static public boolean booleanAggregate(CompiledQuery query, Predicate predicate) { + return query.root.visit(new CqBooleanAggregate(query, predicate)); + } + + + /** Compiled query aggregate that for a 64b bitmask that treats or-branches as logical OR, + * and and-branches as logical AND operations. + */ + public static long longBitmaskAggregate(CompiledQuery query, ToLongFunction operator) { + return query.root.visit(new CqLongBitmaskOperator(query, operator)); + } + + + /** Apply the operator to each leaf node, then return the highest minimum value found along any path */ + public static int intMaxMinAggregate(CompiledQuery query, ToIntFunction operator) { + return query.root.visit(new CqIntMaxMinOperator(query, operator)); + } + + /** Apply the operator to each leaf node, and then return the highest sum of values possible + * through each branch in the compiled query. + * + */ + public static double doubleSumAggregate(CompiledQuery query, ToDoubleFunction operator) { + return query.root.visit(new CqDoubleSumOperator(query, operator)); + } + + /** Enumerate all possible paths through the compiled query */ + public static List queriesAggregate(CompiledQueryLong query) { + return new ArrayList<>(query.root().visit(new CqQueryPathsOperator(query))); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqBooleanAggregate.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqBooleanAggregate.java new file mode 100644 index 00000000..05ebf4c7 --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqBooleanAggregate.java @@ -0,0 +1,40 @@ +package nu.marginalia.api.searchquery.model.compiled.aggregate; + +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CqExpression; + +import java.util.List; +import java.util.function.IntPredicate; 
+import java.util.function.Predicate; + +public class CqBooleanAggregate implements CqExpression.BoolVisitor { + + private final IntPredicate predicate; + + public CqBooleanAggregate(CompiledQuery query, Predicate objPred) { + this.predicate = idx -> objPred.test(query.at(idx)); + } + + @Override + public boolean onAnd(List parts) { + for (var part : parts) { + if (!part.visit(this)) // short-circuit + return false; + } + return true; + } + + @Override + public boolean onOr(List parts) { + for (var part : parts) { + if (part.visit(this)) // short-circuit + return true; + } + return false; + } + + @Override + public boolean onLeaf(int idx) { + return predicate.test(idx); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqDoubleSumOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqDoubleSumOperator.java new file mode 100644 index 00000000..23d1904e --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqDoubleSumOperator.java @@ -0,0 +1,40 @@ +package nu.marginalia.api.searchquery.model.compiled.aggregate; + +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CqExpression; + +import java.util.List; +import java.util.function.IntToDoubleFunction; +import java.util.function.ToDoubleFunction; + +public class CqDoubleSumOperator implements CqExpression.DoubleVisitor { + + private final IntToDoubleFunction operator; + + public CqDoubleSumOperator(CompiledQuery query, ToDoubleFunction operator) { + this.operator = idx -> operator.applyAsDouble(query.at(idx)); + } + + @Override + public double onAnd(List parts) { + double value = 0; + for (var part : parts) { + value += part.visit(this); + } + return value; + } + + @Override + public double onOr(List parts) { + double value = parts.getFirst().visit(this); + for (int i = 1; i < 
parts.size(); i++) { + value = Math.max(value, parts.get(i).visit(this)); + } + return value; + } + + @Override + public double onLeaf(int idx) { + return operator.applyAsDouble(idx); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java new file mode 100644 index 00000000..b3ec86bb --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java @@ -0,0 +1,41 @@ +package nu.marginalia.api.searchquery.model.compiled.aggregate; + +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CqExpression; + +import java.util.List; +import java.util.function.IntUnaryOperator; +import java.util.function.ToIntFunction; + +public class CqIntMaxMinOperator implements CqExpression.IntVisitor { + + private final IntUnaryOperator operator; + + + public CqIntMaxMinOperator(CompiledQuery query, ToIntFunction operator) { + this.operator = idx -> operator.applyAsInt(query.at(idx)); + } + + @Override + public int onAnd(List parts) { + int value = parts.getFirst().visit(this); + for (int i = 1; i < parts.size(); i++) { + value = Math.min(value, parts.get(i).visit(this)); + } + return value; + } + + @Override + public int onOr(List parts) { + int value = parts.getFirst().visit(this); + for (int i = 1; i < parts.size(); i++) { + value = Math.max(value, parts.get(i).visit(this)); + } + return value; + } + + @Override + public int onLeaf(int idx) { + return operator.applyAsInt(idx); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqLongBitmaskOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqLongBitmaskOperator.java new file mode 100644 
index 00000000..d9a4804b --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqLongBitmaskOperator.java @@ -0,0 +1,40 @@ +package nu.marginalia.api.searchquery.model.compiled.aggregate; + +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CqExpression; + +import java.util.List; +import java.util.function.IntToLongFunction; +import java.util.function.ToLongFunction; + +public class CqLongBitmaskOperator implements CqExpression.LongVisitor { + + private final IntToLongFunction operator; + + public CqLongBitmaskOperator(CompiledQuery query, ToLongFunction operator) { + this.operator = idx-> operator.applyAsLong(query.at(idx)); + } + + @Override + public long onAnd(List parts) { + long value = ~0L; + for (var part : parts) { + value &= part.visit(this); + } + return value; + } + + @Override + public long onOr(List parts) { + long value = 0L; + for (var part : parts) { + value |= part.visit(this); + } + return value; + } + + @Override + public long onLeaf(int idx) { + return operator.applyAsLong(idx); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqQueryPathsOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqQueryPathsOperator.java new file mode 100644 index 00000000..2339104e --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqQueryPathsOperator.java @@ -0,0 +1,75 @@ +package nu.marginalia.api.searchquery.model.compiled.aggregate; + +import it.unimi.dsi.fastutil.longs.LongArraySet; +import it.unimi.dsi.fastutil.longs.LongOpenHashSet; +import it.unimi.dsi.fastutil.longs.LongSet; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.CqExpression; + +import java.util.ArrayList; +import 
java.util.List; + +public class CqQueryPathsOperator implements CqExpression.ObjectVisitor> { + private final CompiledQueryLong query; + + public CqQueryPathsOperator(CompiledQueryLong query) { + this.query = query; + } + + @Override + public List onAnd(List parts) { + return parts.stream() + .map(expr -> expr.visit(this)) + .reduce(List.of(), this::combineAnd); + } + + private List combineAnd(List a, List b) { + // No-op cases + if (a.isEmpty()) + return b; + if (b.isEmpty()) + return a; + + // Simple cases + if (a.size() == 1) { + b.forEach(set -> set.addAll(a.getFirst())); + return b; + } + else if (b.size() == 1) { + a.forEach(set -> set.addAll(b.getFirst())); + return a; + } + + // Case where we AND two ORs + List ret = new ArrayList<>(); + + for (var aPart : a) { + for (var bPart : b) { + LongSet set = new LongOpenHashSet(aPart.size() + bPart.size()); + set.addAll(aPart); + set.addAll(bPart); + ret.add(set); + } + } + + return ret; + } + + @Override + public List onOr(List parts) { + List ret = new ArrayList<>(); + + for (var part : parts) { + ret.addAll(part.visit(this)); + } + + return ret; + } + + @Override + public List onLeaf(int idx) { + var set = new LongArraySet(1); + set.add(query.at(idx)); + return List.of(set); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/QueryResponse.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/QueryResponse.java index 80e5b61a..1834c08f 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/QueryResponse.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/QueryResponse.java @@ -13,10 +13,6 @@ public record QueryResponse(SearchSpecification specs, String domain) { public Set getAllKeywords() { - Set keywords = new HashSet<>(100); - for (var sq : specs.subqueries) { - keywords.addAll(sq.searchTermsInclude); - } - return keywords; + return new 
HashSet<>(specs.query.searchTermsInclude); } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSubquery.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java similarity index 76% rename from code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSubquery.java rename to code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java index 3798ae89..9dd10396 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSubquery.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java @@ -13,9 +13,12 @@ import java.util.stream.Collectors; @AllArgsConstructor @With @EqualsAndHashCode -public class SearchSubquery { +public class SearchQuery { - /** These terms must be present in the document and are used in ranking*/ + /** An infix style expression that encodes the required terms in the query */ + public final String compiledQuery; + + /** All terms that appear in {@see compiledQuery} */ public final List searchTermsInclude; /** These terms must be absent from the document */ @@ -33,7 +36,8 @@ public class SearchSubquery { @Deprecated // why does this exist? 
private double value = 0; - public SearchSubquery() { + public SearchQuery() { + this.compiledQuery = ""; this.searchTermsInclude = new ArrayList<>(); this.searchTermsExclude = new ArrayList<>(); this.searchTermsAdvice = new ArrayList<>(); @@ -41,11 +45,13 @@ public class SearchSubquery { this.searchTermCoherences = new ArrayList<>(); } - public SearchSubquery(List searchTermsInclude, - List searchTermsExclude, - List searchTermsAdvice, - List searchTermsPriority, - List> searchTermCoherences) { + public SearchQuery(String compiledQuery, + List searchTermsInclude, + List searchTermsExclude, + List searchTermsAdvice, + List searchTermsPriority, + List> searchTermCoherences) { + this.compiledQuery = compiledQuery; this.searchTermsInclude = searchTermsInclude; this.searchTermsExclude = searchTermsExclude; this.searchTermsAdvice = searchTermsAdvice; @@ -54,7 +60,7 @@ public class SearchSubquery { } @Deprecated // why does this exist? - public SearchSubquery setValue(double value) { + public SearchQuery setValue(double value) { if (Double.isInfinite(value) || Double.isNaN(value)) { this.value = Double.MAX_VALUE; } else { @@ -66,7 +72,7 @@ public class SearchSubquery { @Override public String toString() { StringBuilder sb = new StringBuilder(); - if (!searchTermsInclude.isEmpty()) sb.append("include=").append(searchTermsInclude.stream().collect(Collectors.joining(",", "[", "] "))); + if (!compiledQuery.isEmpty()) sb.append("compiledQuery=").append(compiledQuery); if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] "))); if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] "))); if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] "))); diff --git 
a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSpecification.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSpecification.java index be2a6895..bbb5b7ae 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSpecification.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSpecification.java @@ -10,7 +10,7 @@ import java.util.List; @ToString @Getter @Builder @With @AllArgsConstructor public class SearchSpecification { - public List subqueries; + public SearchQuery query; /** If present and not empty, limit the search to these domain IDs */ public List domains; diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java index cc02ae28..8f50c9fb 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java @@ -21,9 +21,9 @@ public class SearchResultItem implements Comparable { /** How many other potential results existed in the same domain */ public int resultsFromDomain; - public SearchResultItem(long combinedId, int scoresCount) { + public SearchResultItem(long combinedId) { this.combinedId = combinedId; - this.keywordScores = new ArrayList<>(scoresCount); + this.keywordScores = new ArrayList<>(); } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java index b84dad0b..f5a9fc02 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java 
+++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java @@ -7,19 +7,22 @@ import nu.marginalia.model.idx.DocumentMetadata; import java.util.Objects; public final class SearchResultKeywordScore { + @Deprecated public final int subquery; + public final long termId; public final String keyword; private final long encodedWordMetadata; private final long encodedDocMetadata; private final int htmlFeatures; - public SearchResultKeywordScore(int subquery, - String keyword, + public SearchResultKeywordScore(String keyword, + long termId, long encodedWordMetadata, long encodedDocMetadata, int htmlFeatures) { - this.subquery = subquery; + this.termId = termId; + this.subquery = -1; // FIXME, deprecated this.keyword = keyword; this.encodedWordMetadata = encodedWordMetadata; this.encodedDocMetadata = encodedDocMetadata; diff --git a/code/functions/search-query/api/src/main/protobuf/query-api.proto b/code/functions/search-query/api/src/main/protobuf/query-api.proto index f5ec5e8d..606b18f8 100644 --- a/code/functions/search-query/api/src/main/protobuf/query-api.proto +++ b/code/functions/search-query/api/src/main/protobuf/query-api.proto @@ -52,7 +52,7 @@ message RpcTemporalBias { /* Index service query request */ message RpcIndexQuery { - repeated RpcSubquery subqueries = 1; + RpcQuery query = 1; repeated int32 domains = 2; // (optional) A list of domain IDs to consider string searchSetIdentifier = 3; // (optional) A named set of domains to consider string humanQuery = 4; // The search query as the user entered it @@ -102,12 +102,11 @@ message RpcRawResultItem { /* Information about how well a keyword matches a query */ message RpcResultKeywordScore { - int32 subquery = 1; // index of the subquery this keyword relates to - string keyword = 2; // the keyword - int64 encodedWordMetadata = 3; // bit encoded word metadata - int64 encodedDocMetadata = 4; // bit encoded document metadata - bool hasPriorityTerms = 5; // true 
if this word is important to the document - int32 htmlFeatures = 6; // bit encoded document features + string keyword = 1; // the keyword + int64 encodedWordMetadata = 2; // bit encoded word metadata + int64 encodedDocMetadata = 3; // bit encoded document metadata + bool hasPriorityTerms = 4; // true if this word is important to the document + int32 htmlFeatures = 5; // bit encoded document features } /* Query execution parameters */ @@ -137,12 +136,13 @@ message RpcResultRankingParameters { } /* Defines a single subquery */ -message RpcSubquery { +message RpcQuery { repeated string include = 1; // These terms must be present repeated string exclude = 2; // These terms must be absent repeated string advice = 3; // These terms must be present, but do not affect ranking repeated string priority = 4; // These terms are not mandatory, but affect ranking positively if they are present repeated RpcCoherences coherences = 5; // Groups of terms that must exist in proximity of each other + string compiledQuery = 6; // Compiled query in infix notation } /* Defines a group of search terms that must exist in close proximity within the document */ diff --git a/code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParserTest.java b/code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParserTest.java new file mode 100644 index 00000000..47983820 --- /dev/null +++ b/code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParserTest.java @@ -0,0 +1,79 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.*; + +class CompiledQueryParserTest { + + @Test + public void testEmpty() { + assertEquals(CqExpression.empty(), CompiledQueryParser.parse("").root); + assertEquals(CqExpression.empty(), CompiledQueryParser.parse("( )").root); + 
assertEquals(CqExpression.empty(), CompiledQueryParser.parse("( | )").root); + assertEquals(CqExpression.empty(), CompiledQueryParser.parse("| ( | ) |").root); + } + + @Test + public void testSingleWord() { + CompiledQuery q = CompiledQueryParser.parse("foo"); + assertEquals(w(q, "foo"), q.root); + } + + @Test + public void testAndTwoWords() { + CompiledQuery q = CompiledQueryParser.parse("foo bar"); + assertEquals(and(w(q, "foo"), w(q,"bar")), q.root); + } + + @Test + public void testOrTwoWords() { + CompiledQuery q = CompiledQueryParser.parse("foo | bar"); + assertEquals(or(w(q, "foo"), w(q,"bar")), q.root); + } + + @Test + public void testOrAndWords() { + CompiledQuery q = CompiledQueryParser.parse("foo | bar baz"); + assertEquals(or(w(q,"foo"), and(w(q,"bar"), w(q,"baz"))), q.root); + } + + @Test + public void testAndAndOrAndAndWords() { + CompiledQuery q = CompiledQueryParser.parse("foo foobar | bar baz"); + assertEquals(or( + and(w(q, "foo"), w(q, "foobar")), + and(w(q, "bar"), w(q, "baz"))) + , q.root); + } + @Test + public void testComplex1() { + CompiledQuery q = CompiledQueryParser.parse("foo ( bar | baz ) quux"); + assertEquals(and(w(q,"foo"), or(w(q, "bar"), w(q, "baz")), w(q, "quux")), q.root); + } + @Test + public void testComplex2() { + CompiledQuery q = CompiledQueryParser.parse("( ( ( a ) b ) c ) d"); + assertEquals(and(and(and(w(q, "a"), w(q, "b")), w(q, "c")), w(q, "d")), q.root); + } + + @Test + public void testNested() { + CompiledQuery q = CompiledQueryParser.parse("( ( ( a ) ) )"); + assertEquals(w(q,"a"), q.root); + } + + private CqExpression.Word w(CompiledQuery query, String word) { + return new CqExpression.Word(query.indices().filter(idx -> word.equals(query.at(idx))).findAny().orElseThrow()); + } + + private CqExpression and(CqExpression... parts) { + return new CqExpression.And(List.of(parts)); + } + + private CqExpression or(CqExpression... 
parts) { + return new CqExpression.Or(List.of(parts)); + } +} \ No newline at end of file diff --git a/code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregatesTest.java b/code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregatesTest.java new file mode 100644 index 00000000..c3e36180 --- /dev/null +++ b/code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregatesTest.java @@ -0,0 +1,35 @@ +package nu.marginalia.api.searchquery.model.compiled.aggregate; + +import static nu.marginalia.api.searchquery.model.compiled.CompiledQueryParser.parse; +import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.*; +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.*; + +class CompiledQueryAggregatesTest { + + @Test + void booleanAggregates() { + assertFalse(booleanAggregate(parse("false"), Boolean::parseBoolean)); + assertTrue(booleanAggregate(parse("true"), Boolean::parseBoolean)); + assertFalse(booleanAggregate(parse("false true"), Boolean::parseBoolean)); + assertTrue(booleanAggregate(parse("( true ) | ( true false )"), Boolean::parseBoolean)); + assertTrue(booleanAggregate(parse("( false ) | ( true )"), Boolean::parseBoolean)); + assertTrue(booleanAggregate(parse("( true false ) | ( true true )"), Boolean::parseBoolean)); + assertFalse(booleanAggregate(parse("( true false ) | ( true false )"), Boolean::parseBoolean)); + } + + @Test + void intMaxMinAggregates() { + assertEquals(5, intMaxMinAggregate(parse("5"), Integer::parseInt)); + assertEquals(3, intMaxMinAggregate(parse("5 3"), Integer::parseInt)); + assertEquals(6, intMaxMinAggregate(parse("5 3 | 6 7"), Integer::parseInt)); + } + + @Test + void doubleSumAggregates() { + assertEquals(5, (int) doubleSumAggregate(parse("5"), Double::parseDouble)); + assertEquals(8, (int) 
doubleSumAggregate(parse("5 3"), Double::parseDouble)); + assertEquals(13, (int) doubleSumAggregate(parse("1 ( 5 3 | 2 10 )"), Double::parseDouble)); + } +} \ No newline at end of file diff --git a/code/functions/search-query/api/test/nu/marginalia/index/client/IndexProtobufCodecTest.java b/code/functions/search-query/api/test/nu/marginalia/index/client/IndexProtobufCodecTest.java index 1782765d..e93f715c 100644 --- a/code/functions/search-query/api/test/nu/marginalia/index/client/IndexProtobufCodecTest.java +++ b/code/functions/search-query/api/test/nu/marginalia/index/client/IndexProtobufCodecTest.java @@ -1,7 +1,7 @@ package nu.marginalia.index.client; import nu.marginalia.api.searchquery.IndexProtobufCodec; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.SpecificationLimit; @@ -35,14 +35,15 @@ class IndexProtobufCodecTest { } @Test public void testSubqery() { - verifyIsIdentityTransformation(new SearchSubquery( + verifyIsIdentityTransformation(new SearchQuery( + "qs", List.of("a", "b"), List.of("c", "d"), List.of("e", "f"), List.of("g", "h"), List.of(List.of("i", "j"), List.of("k")) ), - s -> IndexProtobufCodec.convertSearchSubquery(IndexProtobufCodec.convertSearchSubquery(s)) + s -> IndexProtobufCodec.convertRpcQuery(IndexProtobufCodec.convertRpcQuery(s)) ); } private void verifyIsIdentityTransformation(T val, Function transformation) { diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java index 3c0e5219..55467b4f 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java +++ 
b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java @@ -2,18 +2,16 @@ package nu.marginalia.functions.searchquery.svc; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.LanguageModels; import nu.marginalia.api.searchquery.model.query.SearchSpecification; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; -import nu.marginalia.util.language.EnglishDictionary; +import nu.marginalia.functions.searchquery.query_parser.QueryExpansion; import nu.marginalia.language.WordPatterns; import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.query.ProcessedQuery; import nu.marginalia.functions.searchquery.query_parser.QueryParser; import nu.marginalia.functions.searchquery.query_parser.token.Token; import nu.marginalia.functions.searchquery.query_parser.token.TokenType; -import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -26,15 +24,14 @@ import java.util.List; public class QueryFactory { private final Logger logger = LoggerFactory.getLogger(getClass()); - private static final int RETAIN_QUERY_VARIANT_COUNT = 5; private final QueryParser queryParser = new QueryParser(); + private final QueryExpansion queryExpansion; @Inject - public QueryFactory(LanguageModels lm, - TermFrequencyDict dict, - EnglishDictionary englishDictionary) + public QueryFactory(QueryExpansion queryExpansion) { + this.queryExpansion = queryExpansion; } @@ -49,8 +46,6 @@ public class QueryFactory { List searchTermsHuman = new ArrayList<>(); List problems = new ArrayList<>(); - String domain = null; - List basicQuery = queryParser.parse(query); if (basicQuery.size() >= 12) { @@ -74,19 +69,8 @@ public class QueryFactory { t.visit(qualityLimits); } -// var 
queryPermutations = queryPermutation.permuteQueriesNew(basicQuery); - List subqueries = new ArrayList<>(); QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(basicQuery); - domain = termsAccumulator.domain; - -// for (var parts : queryPermutations) { -// QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(basicQuery); -// -// domain = termsAccumulator.domain; -// -// SearchSubquery subquery = termsAccumulator.createSubquery(); -// subqueries.add(subquery); -// } + String domain = termsAccumulator.domain; List domainIds = params.domainIds(); @@ -97,7 +81,18 @@ public class QueryFactory { } var specsBuilder = SearchSpecification.builder() - .subqueries(subqueries) + .query( + new SearchQuery( + queryExpansion.expandQuery( + termsAccumulator.searchTermsInclude + ), + termsAccumulator.searchTermsInclude, + termsAccumulator.searchTermsExclude, + termsAccumulator.searchTermsAdvice, + termsAccumulator.searchTermsPriority, + termsAccumulator.searchTermCoherences + ) + ) .humanQuery(query) .quality(qualityLimits.qualityLimit) .year(qualityLimits.year) @@ -111,12 +106,9 @@ public class QueryFactory { SearchSpecification specs = specsBuilder.build(); - for (var sq : specs.subqueries) { - sq.searchTermsAdvice.addAll(params.tacitAdvice()); - sq.searchTermsPriority.addAll(params.tacitPriority()); - sq.searchTermsInclude.addAll(params.tacitIncludes()); - sq.searchTermsExclude.addAll(params.tacitExcludes()); - } + specs.query.searchTermsAdvice.addAll(params.tacitAdvice()); + specs.query.searchTermsPriority.addAll(params.tacitPriority()); + specs.query.searchTermsExclude.addAll(params.tacitExcludes()); return new ProcessedQuery(specs, searchTermsHuman, domain); } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java index e4def0d0..cc3a7e56 100644 --- 
a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java @@ -1,6 +1,6 @@ package nu.marginalia.functions.searchquery.svc; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.language.WordPatterns; import nu.marginalia.functions.searchquery.query_parser.token.Token; import nu.marginalia.functions.searchquery.query_parser.token.TokenVisitor; @@ -9,7 +9,7 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -/** @see SearchSubquery */ +/** @see SearchQuery */ public class QuerySearchTermsAccumulator implements TokenVisitor { public List searchTermsExclude = new ArrayList<>(); public List searchTermsInclude = new ArrayList<>(); @@ -19,10 +19,6 @@ public class QuerySearchTermsAccumulator implements TokenVisitor { public String domain; - public SearchSubquery createSubquery() { - return new SearchSubquery(searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchTermCoherences); - } - public QuerySearchTermsAccumulator(List parts) { for (Token t : parts) { t.visit(this); diff --git a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java index 24131143..132944c4 100644 --- a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java +++ b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java @@ -3,12 +3,13 @@ package nu.marginalia.query.svc; import nu.marginalia.WmsaHome; import nu.marginalia.api.searchquery.model.query.SearchSpecification; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; +import nu.marginalia.functions.searchquery.query_parser.QueryExpansion; import 
nu.marginalia.functions.searchquery.svc.QueryFactory; import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.SpecificationLimit; import nu.marginalia.index.query.limit.SpecificationLimitType; -import nu.marginalia.util.language.EnglishDictionary; +import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.junit.jupiter.api.BeforeAll; @@ -27,11 +28,9 @@ public class QueryFactoryTest { public static void setUpAll() throws IOException { var lm = WmsaHome.getLanguageModels(); - var tfd = new TermFrequencyDict(lm); - queryFactory = new QueryFactory(lm, - tfd, - new EnglishDictionary(tfd) + queryFactory = new QueryFactory( + new QueryExpansion(new TermFrequencyDict(lm), new NgramLexicon(lm)) ); } @@ -112,17 +111,15 @@ public class QueryFactoryTest { { // the is a stopword, so it should generate an ngram search term var specs = parseAndGetSpecs("\"the shining\""); - assertEquals(List.of("the_shining"), specs.subqueries.iterator().next().searchTermsInclude); - assertEquals(List.of(), specs.subqueries.iterator().next().searchTermsAdvice); - assertEquals(List.of(), specs.subqueries.iterator().next().searchTermCoherences); + assertEquals("the_shining", specs.query.compiledQuery); } { // tde isn't a stopword, so we should get the normal behavior var specs = parseAndGetSpecs("\"tde shining\""); - assertEquals(List.of("tde", "shining"), specs.subqueries.iterator().next().searchTermsInclude); - assertEquals(List.of("tde_shining"), specs.subqueries.iterator().next().searchTermsAdvice); - assertEquals(List.of(List.of("tde", "shining")), specs.subqueries.iterator().next().searchTermCoherences); + assertEquals("tde shining", specs.query.compiledQuery); + assertEquals(List.of("tde_shining"), specs.query.searchTermsAdvice); + assertEquals(List.of(List.of("tde", "shining")), 
specs.query.searchTermCoherences); } } @@ -150,8 +147,18 @@ public class QueryFactoryTest { @Test public void testPriorityTerm() { - var subquery = parseAndGetSpecs("physics ?tld:edu").subqueries.iterator().next(); + var subquery = parseAndGetSpecs("physics ?tld:edu").query; assertEquals(List.of("tld:edu"), subquery.searchTermsPriority); - assertEquals(List.of("physics"), subquery.searchTermsInclude); + assertEquals("physics", subquery.compiledQuery); + } + + @Test + public void testExpansion() { + + long start = System.currentTimeMillis(); + var subquery = parseAndGetSpecs("elden ring mechanical keyboard slackware linux duke nukem 3d").query; + System.out.println("Time: " + (System.currentTimeMillis() - start)); + System.out.println(subquery.compiledQuery); + } } \ No newline at end of file diff --git a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexEntrySource.java b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexEntrySource.java index 37c79941..7c12563b 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexEntrySource.java +++ b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexEntrySource.java @@ -46,7 +46,7 @@ public class ReverseIndexEntrySource implements EntrySource { return; for (int ri = entrySize, wi=1; ri < buffer.end ; ri+=entrySize, wi++) { - buffer.data[wi] = buffer.data[ri]; + buffer.data.set(wi, buffer.data.get(ri)); } buffer.end /= entrySize; diff --git a/code/index/java/nu/marginalia/index/IndexGrpcService.java b/code/index/java/nu/marginalia/index/IndexGrpcService.java index a47c4684..b675f749 100644 --- a/code/index/java/nu/marginalia/index/IndexGrpcService.java +++ b/code/index/java/nu/marginalia/index/IndexGrpcService.java @@ -9,14 +9,14 @@ import io.prometheus.client.Histogram; import it.unimi.dsi.fastutil.longs.LongArrayList; import lombok.SneakyThrows; import nu.marginalia.api.searchquery.*; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import 
nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.query.SearchSpecification; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; import nu.marginalia.api.searchquery.model.results.*; import nu.marginalia.array.buffer.LongQueryBuffer; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.model.SearchParameters; import nu.marginalia.index.model.SearchTerms; -import nu.marginalia.index.model.SearchTermsUtil; import nu.marginalia.index.query.IndexQuery; import nu.marginalia.index.query.IndexSearchBudget; import nu.marginalia.index.results.IndexResultValuatorService; @@ -143,7 +143,6 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { .setEncodedWordMetadata(score.encodedWordMetadata()) .setKeyword(score.keyword) .setHtmlFeatures(score.htmlFeatures()) - .setSubquery(score.subquery) ); } @@ -203,7 +202,9 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { return new SearchResultSet(List.of()); } - ResultRankingContext rankingContext = createRankingContext(params.rankingParams, params.subqueries); + ResultRankingContext rankingContext = createRankingContext(params.rankingParams, + params.compiledQuery, + params.compiledQueryIds); var queryExecution = new QueryExecution(rankingContext, params.fetchSize); @@ -255,14 +256,10 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { /** Execute a search query */ public SearchResultSet run(SearchParameters parameters) throws SQLException, InterruptedException { - for (var subquery : parameters.subqueries) { - var terms = new SearchTerms(subquery); - if (terms.isEmpty()) - continue; + var terms = new SearchTerms(parameters.query, parameters.compiledQueryIds); - for (var indexQuery : index.createQueries(terms, parameters.queryParams)) { - workerPool.execute(new IndexLookup(indexQuery, parameters.budget)); - } + for (var indexQuery : index.createQueries(terms, 
parameters.queryParams)) { + workerPool.execute(new IndexLookup(indexQuery, parameters.budget)); } for (int i = 0; i < indexValuationThreads; i++) { @@ -327,7 +324,9 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { buffer.reset(); query.getMoreResults(buffer); - results.addElements(0, buffer.data, 0, buffer.end); + for (int i = 0; i < buffer.end; i++) { + results.add(buffer.data.get(i)); + } if (results.size() < 512) { enqueueResults(new CombinedDocIdList(results)); @@ -413,8 +412,13 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { } - private ResultRankingContext createRankingContext(ResultRankingParameters rankingParams, List subqueries) { - final var termToId = SearchTermsUtil.getAllIncludeTerms(subqueries); + private ResultRankingContext createRankingContext(ResultRankingParameters rankingParams, + CompiledQuery query, + CompiledQueryLong compiledQueryIds) + { + Map termToId = new HashMap<>(query.size()); + query.indices().forEach(id -> termToId.put(query.at(id), compiledQueryIds.at(id))); + final Map termFrequencies = new HashMap<>(termToId.size()); final Map prioFrequencies = new HashMap<>(termToId.size()); diff --git a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java index ea78739c..3846bad8 100644 --- a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java +++ b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java @@ -38,6 +38,13 @@ public class CombinedIndexReader { return new IndexQueryBuilderImpl(reverseIndexFullReader, reverseIndexPriorityReader, query); } + public QueryFilterStepIf hasWordFull(long termId) { + return reverseIndexFullReader.also(termId); + } + public QueryFilterStepIf hasWordPrio(long termId) { + return reverseIndexPriorityReader.also(termId); + } + /** Creates a query builder for terms in the priority index */ public IndexQueryBuilder findPriorityWord(long wordId) { diff --git 
a/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java b/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java index 825728ae..33ca033e 100644 --- a/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java +++ b/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java @@ -1,9 +1,11 @@ package nu.marginalia.index.index; +import java.util.List; import gnu.trove.set.hash.TLongHashSet; import nu.marginalia.index.ReverseIndexReader; import nu.marginalia.index.query.IndexQuery; import nu.marginalia.index.query.IndexQueryBuilder; +import nu.marginalia.index.query.filter.QueryFilterAnyOf; import nu.marginalia.index.query.filter.QueryFilterStepIf; public class IndexQueryBuilderImpl implements IndexQueryBuilder { @@ -66,6 +68,20 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder { return this; } + public IndexQueryBuilder addInclusionFilterAny(List filterSteps) { + if (filterSteps.isEmpty()) + return this; + + if (filterSteps.size() == 1) { + query.addInclusionFilter(filterSteps.getFirst()); + } + else { + query.addInclusionFilter(new QueryFilterAnyOf(filterSteps)); + } + + return this; + } + public IndexQuery build() { return query; } diff --git a/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java b/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java new file mode 100644 index 00000000..a465bd86 --- /dev/null +++ b/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java @@ -0,0 +1,78 @@ +package nu.marginalia.index.index; + +import it.unimi.dsi.fastutil.longs.LongArrayList; +import it.unimi.dsi.fastutil.longs.LongSet; + +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; + +class QueryBranchWalker { + public final long[] priorityOrder; + public final List paths; + public final long termId; + + private QueryBranchWalker(long[] priorityOrder, List paths, long termId) { + this.priorityOrder = priorityOrder; + this.paths = paths; + 
this.termId = termId; + } + + public boolean atEnd() { + return priorityOrder.length == 0; + } + + public static List create(long[] priorityOrder, List paths) { + + List ret = new ArrayList<>(); + List remainingPaths = new LinkedList<>(paths); + + remainingPaths.removeIf(LongSet::isEmpty); + + for (int i = 0; i < priorityOrder.length; i++) { + long prio = priorityOrder[i]; + + var it = remainingPaths.iterator(); + List pathsForPrio = new ArrayList<>(); + + while (it.hasNext()) { + var path = it.next(); + + if (path.contains(prio)) { + path.remove(prio); + pathsForPrio.add(path); + it.remove(); + } + } + + if (!pathsForPrio.isEmpty()) { + LongArrayList remainingPrios = new LongArrayList(pathsForPrio.size()); + + for (var p : priorityOrder) { + for (var path : pathsForPrio) { + if (path.contains(p)) { + remainingPrios.add(p); + break; + } + } + } + + ret.add(new QueryBranchWalker(remainingPrios.elements(), pathsForPrio, prio)); + } + } + + if (!remainingPaths.isEmpty()) { + System.out.println("Dropping: " + remainingPaths); + } + + return ret; + } + + public List next() { + if (atEnd()) + return List.of(); + + return create(priorityOrder, paths); + } + +} diff --git a/code/index/java/nu/marginalia/index/index/StatefulIndex.java b/code/index/java/nu/marginalia/index/index/StatefulIndex.java index a49e740e..0f55c0c8 100644 --- a/code/index/java/nu/marginalia/index/index/StatefulIndex.java +++ b/code/index/java/nu/marginalia/index/index/StatefulIndex.java @@ -2,6 +2,13 @@ package nu.marginalia.index.index; import com.google.inject.Inject; import com.google.inject.Singleton; +import it.unimi.dsi.fastutil.longs.LongOpenHashSet; +import it.unimi.dsi.fastutil.longs.LongSet; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; +import nu.marginalia.index.query.filter.QueryFilterAllOf; +import nu.marginalia.index.query.filter.QueryFilterAnyOf; +import 
nu.marginalia.index.query.filter.QueryFilterStepIf; import nu.marginalia.index.results.model.ids.CombinedDocIdList; import nu.marginalia.index.results.model.ids.DocMetadataList; import nu.marginalia.index.model.QueryParams; @@ -14,12 +21,13 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; +import java.util.*; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; +import java.util.function.LongFunction; +import java.util.function.Predicate; +import java.util.stream.Collectors; /** This class delegates SearchIndexReader and deals with the stateful nature of the index, * i.e. it may be possible to reconstruct the index and load a new set of data. @@ -105,6 +113,61 @@ public class StatefulIndex { return combinedIndexReader != null && combinedIndexReader.isLoaded(); } + private Predicate containsOnly(long[] permitted) { + LongSet permittedTerms = new LongOpenHashSet(permitted); + return permittedTerms::containsAll; + } + + private List createBuilders(CompiledQueryLong query, + LongFunction builderFactory, + long[] termPriority) { + List paths = CompiledQueryAggregates.queriesAggregate(query); + + // Remove any paths that do not contain all prioritized terms, as this means + // the term is missing from the index and can never be found + paths.removeIf(containsOnly(termPriority).negate()); + + List helpers = QueryBranchWalker.create(termPriority, paths); + List builders = new ArrayList<>(); + + for (var helper : helpers) { + var builder = builderFactory.apply(helper.termId); + + builders.add(builder); + + if (helper.atEnd()) + continue; + + var filters = helper.next().stream() + .map(this::createFilter) + .toList(); + + builder.addInclusionFilterAny(filters); + } + + return builders; + } + + private QueryFilterStepIf createFilter(QueryBranchWalker helper) { + 
var selfCondition = combinedIndexReader.hasWordFull(helper.termId); + if (helper.atEnd()) + return selfCondition; + + var nextSteps = helper.next(); + var nextFilters = nextSteps.stream() + .map(this::createFilter) + .map(filter -> new QueryFilterAllOf(List.of(selfCondition, filter))) + .collect(Collectors.toList()); + + if (nextFilters.isEmpty()) + return selfCondition; + + if (nextFilters.size() == 1) + return nextFilters.getFirst(); + + + return new QueryFilterAnyOf(nextFilters); + } public List createQueries(SearchTerms terms, QueryParams params) { @@ -117,40 +180,13 @@ public class StatefulIndex { final long[] orderedIncludesPrio = terms.sortedDistinctIncludes(this::compareKeywordsPrio); List queryHeads = new ArrayList<>(10); + + queryHeads.addAll(createBuilders(terms.compiledQuery(), combinedIndexReader::findFullWord, orderedIncludes)); + queryHeads.addAll(createBuilders(terms.compiledQuery(), combinedIndexReader::findPriorityWord, orderedIncludesPrio)); + List queries = new ArrayList<>(10); - // To ensure that good results are discovered, create separate query heads for the priority index that - // filter for terms that contain pairs of two search terms - if (orderedIncludesPrio.length > 1) { - for (int i = 0; i + 1 < orderedIncludesPrio.length; i++) { - for (int j = i + 1; j < orderedIncludesPrio.length; j++) { - var entrySource = combinedIndexReader - .findPriorityWord(orderedIncludesPrio[i]) - .alsoPrio(orderedIncludesPrio[j]); - queryHeads.add(entrySource); - } - } - } - - // Next consider entries that appear only once in the priority index - for (var wordId : orderedIncludesPrio) { - queryHeads.add(combinedIndexReader.findPriorityWord(wordId)); - } - - // Finally consider terms in the full index - queryHeads.add(combinedIndexReader.findFullWord(orderedIncludes[0])); - for (var query : queryHeads) { - if (query == null) { - return Collections.emptyList(); - } - - // Note that we can add all includes as filters, even though - // they may not be present in 
the query head, as the query builder - // will ignore redundant include filters: - for (long orderedInclude : orderedIncludes) { - query = query.alsoFull(orderedInclude); - } for (long term : terms.excludes()) { query = query.notFull(term); @@ -161,6 +197,7 @@ public class StatefulIndex { queries.add(query.addInclusionFilter(combinedIndexReader.filterForParams(params)).build()); } + return queries; } diff --git a/code/index/java/nu/marginalia/index/model/SearchParameters.java b/code/index/java/nu/marginalia/index/model/SearchParameters.java index 7db25341..f0e851e5 100644 --- a/code/index/java/nu/marginalia/index/model/SearchParameters.java +++ b/code/index/java/nu/marginalia/index/model/SearchParameters.java @@ -2,16 +2,16 @@ package nu.marginalia.index.model; import nu.marginalia.api.searchquery.IndexProtobufCodec; import nu.marginalia.api.searchquery.RpcIndexQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryParser; import nu.marginalia.api.searchquery.model.query.SearchSpecification; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.index.query.IndexSearchBudget; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.searchset.SearchSet; -import java.util.ArrayList; -import java.util.List; - import static nu.marginalia.api.searchquery.IndexProtobufCodec.convertSpecLimit; public class SearchParameters { @@ -21,13 +21,16 @@ public class SearchParameters { */ public final int fetchSize; public final IndexSearchBudget budget; - public final List subqueries; + public final SearchQuery query; public final QueryParams queryParams; public final ResultRankingParameters rankingParams; public final int 
limitByDomain; public final int limitTotal; + public final CompiledQuery compiledQuery; + public final CompiledQueryLong compiledQueryIds; + // mutable: /** @@ -40,7 +43,7 @@ public class SearchParameters { this.fetchSize = limits.fetchSize(); this.budget = new IndexSearchBudget(limits.timeoutMs()); - this.subqueries = specsSet.subqueries; + this.query = specsSet.query; this.limitByDomain = limits.resultsByDomain(); this.limitTotal = limits.resultsTotal(); @@ -52,6 +55,9 @@ public class SearchParameters { searchSet, specsSet.queryStrategy); + compiledQuery = CompiledQueryParser.parse(this.query.compiledQuery); + compiledQueryIds = compiledQuery.mapToLong(SearchTermsUtil::getWordId); + rankingParams = specsSet.rankingParams; } @@ -63,11 +69,8 @@ public class SearchParameters { // The time budget is halved because this is the point when we start to // wrap up the search and return the results. this.budget = new IndexSearchBudget(limits.timeoutMs() / 2); + this.query = IndexProtobufCodec.convertRpcQuery(request.getQuery()); - this.subqueries = new ArrayList<>(request.getSubqueriesCount()); - for (int i = 0; i < request.getSubqueriesCount(); i++) { - this.subqueries.add(IndexProtobufCodec.convertSearchSubquery(request.getSubqueries(i))); - } this.limitByDomain = limits.resultsByDomain(); this.limitTotal = limits.resultsTotal(); @@ -79,9 +82,13 @@ public class SearchParameters { searchSet, QueryStrategy.valueOf(request.getQueryStrategy())); + compiledQuery = CompiledQueryParser.parse(this.query.compiledQuery); + compiledQueryIds = compiledQuery.mapToLong(SearchTermsUtil::getWordId); + rankingParams = IndexProtobufCodec.convertRankingParameterss(request.getParameters()); } + public long getDataCost() { return dataCost; } diff --git a/code/index/java/nu/marginalia/index/model/SearchTerms.java b/code/index/java/nu/marginalia/index/model/SearchTerms.java index c32b1aa3..307e4179 100644 --- a/code/index/java/nu/marginalia/index/model/SearchTerms.java +++ 
b/code/index/java/nu/marginalia/index/model/SearchTerms.java @@ -4,7 +4,8 @@ import it.unimi.dsi.fastutil.longs.LongArrayList; import it.unimi.dsi.fastutil.longs.LongComparator; import it.unimi.dsi.fastutil.longs.LongList; import it.unimi.dsi.fastutil.longs.LongOpenHashSet; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import java.util.ArrayList; import java.util.List; @@ -18,34 +19,39 @@ public final class SearchTerms { private final LongList priority; private final List coherences; + private final CompiledQueryLong compiledQueryIds; + public SearchTerms( LongList includes, LongList excludes, LongList priority, - List coherences + List coherences, + CompiledQueryLong compiledQueryIds ) { this.includes = includes; this.excludes = excludes; this.priority = priority; this.coherences = coherences; + this.compiledQueryIds = compiledQueryIds; } - public SearchTerms(SearchSubquery subquery) { + public SearchTerms(SearchQuery query, CompiledQueryLong compiledQueryIds) { this(new LongArrayList(), new LongArrayList(), new LongArrayList(), - new ArrayList<>()); + new ArrayList<>(), + compiledQueryIds); - for (var word : subquery.searchTermsInclude) { + for (var word : query.searchTermsInclude) { includes.add(getWordId(word)); } - for (var word : subquery.searchTermsAdvice) { + for (var word : query.searchTermsAdvice) { // This looks like a bug, but it's not includes.add(getWordId(word)); } - for (var coherence : subquery.searchTermCoherences) { + for (var coherence : query.searchTermCoherences) { LongList parts = new LongArrayList(coherence.size()); for (var word : coherence) { @@ -55,10 +61,10 @@ public final class SearchTerms { coherences.add(parts); } - for (var word : subquery.searchTermsExclude) { + for (var word : query.searchTermsExclude) { excludes.add(getWordId(word)); } - for (var word : 
subquery.searchTermsPriority) { + for (var word : query.searchTermsPriority) { priority.add(getWordId(word)); } } @@ -96,6 +102,8 @@ public final class SearchTerms { return coherences; } + public CompiledQueryLong compiledQuery() { return compiledQueryIds; } + @Override public boolean equals(Object obj) { if (obj == this) return true; diff --git a/code/index/java/nu/marginalia/index/model/SearchTermsUtil.java b/code/index/java/nu/marginalia/index/model/SearchTermsUtil.java index 9797ca95..fa516565 100644 --- a/code/index/java/nu/marginalia/index/model/SearchTermsUtil.java +++ b/code/index/java/nu/marginalia/index/model/SearchTermsUtil.java @@ -1,29 +1,9 @@ package nu.marginalia.index.model; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; import nu.marginalia.hash.MurmurHash3_128; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - public class SearchTermsUtil { - /** Extract all include-terms from the specified subqueries, - * and a return a map of the terms and their termIds. - */ - public static Map getAllIncludeTerms(List subqueries) { - Map ret = new HashMap<>(); - - for (var subquery : subqueries) { - for (var include : subquery.searchTermsInclude) { - ret.computeIfAbsent(include, i -> getWordId(include)); - } - } - - return ret; - } - private static final MurmurHash3_128 hasher = new MurmurHash3_128(); /** Translate the word to a unique id. 
*/ diff --git a/code/index/java/nu/marginalia/index/results/IndexMetadataService.java b/code/index/java/nu/marginalia/index/results/IndexMetadataService.java index 1932a5a4..977a87e7 100644 --- a/code/index/java/nu/marginalia/index/results/IndexMetadataService.java +++ b/code/index/java/nu/marginalia/index/results/IndexMetadataService.java @@ -4,7 +4,8 @@ import com.google.inject.Inject; import gnu.trove.map.hash.TObjectLongHashMap; import it.unimi.dsi.fastutil.longs.Long2ObjectArrayMap; import it.unimi.dsi.fastutil.longs.LongArrayList; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.model.SearchTermsUtil; import nu.marginalia.index.results.model.QuerySearchTerms; @@ -13,9 +14,6 @@ import nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds; import nu.marginalia.index.results.model.ids.CombinedDocIdList; import nu.marginalia.index.results.model.ids.TermIdList; -import java.util.ArrayList; -import java.util.List; - import static nu.marginalia.index.results.model.TermCoherenceGroupList.TermCoherenceGroup; import static nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds.DocumentsWithMetadata; @@ -42,43 +40,24 @@ public class IndexMetadataService { return new TermMetadataForCombinedDocumentIds(termdocToMeta); } - public QuerySearchTerms getSearchTerms(List searchTermVariants) { + public QuerySearchTerms getSearchTerms(CompiledQuery compiledQuery, SearchQuery searchQuery) { LongArrayList termIdsList = new LongArrayList(); TObjectLongHashMap termToId = new TObjectLongHashMap<>(10, 0.75f, -1); - for (var subquery : searchTermVariants) { - for (var term : subquery.searchTermsInclude) { - if (termToId.containsKey(term)) { - continue; - } - - long id = SearchTermsUtil.getWordId(term); - termIdsList.add(id); - 
termToId.put(term, id); - } + for (String word : compiledQuery) { + long id = SearchTermsUtil.getWordId(word); + termIdsList.add(id); + termToId.put(word, id); } return new QuerySearchTerms(termToId, new TermIdList(termIdsList), - getTermCoherences(searchTermVariants)); - } - - - private TermCoherenceGroupList getTermCoherences(List searchTermVariants) { - List coherences = new ArrayList<>(); - - for (var subquery : searchTermVariants) { - for (var coh : subquery.searchTermCoherences) { - coherences.add(new TermCoherenceGroup(coh)); - } - - // It's assumed each subquery has identical coherences - break; - } - - return new TermCoherenceGroupList(coherences); + new TermCoherenceGroupList( + searchQuery.searchTermCoherences.stream().map(TermCoherenceGroup::new).toList() + ) + ); } } diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java index 967a600f..3777cf4f 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java @@ -1,10 +1,13 @@ package nu.marginalia.index.results; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.SearchResultItem; import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.index.index.StatefulIndex; +import nu.marginalia.index.model.SearchParameters; import nu.marginalia.index.results.model.ids.CombinedDocIdList; import nu.marginalia.index.model.QueryParams; import nu.marginalia.index.results.model.QuerySearchTerms; @@ 
-23,7 +26,6 @@ import java.util.List; * reasons to cache this data, and performs the calculations */ public class IndexResultValuationContext { private final StatefulIndex statefulIndex; - private final List> searchTermVariants; private final QueryParams queryParams; private final TermMetadataForCombinedDocumentIds termMetadataForCombinedDocumentIds; @@ -31,23 +33,26 @@ public class IndexResultValuationContext { private final ResultRankingContext rankingContext; private final ResultValuator searchResultValuator; + private final CompiledQuery compiledQuery; + private final CompiledQueryLong compiledQueryIds; public IndexResultValuationContext(IndexMetadataService metadataService, ResultValuator searchResultValuator, CombinedDocIdList ids, StatefulIndex statefulIndex, ResultRankingContext rankingContext, - List subqueries, - QueryParams queryParams + SearchParameters params ) { this.statefulIndex = statefulIndex; this.rankingContext = rankingContext; this.searchResultValuator = searchResultValuator; - this.searchTermVariants = subqueries.stream().map(sq -> sq.searchTermsInclude).distinct().toList(); - this.queryParams = queryParams; + this.queryParams = params.queryParams; + this.compiledQuery = params.compiledQuery; + this.compiledQueryIds = params.compiledQueryIds; + + this.searchTerms = metadataService.getSearchTerms(params.compiledQuery, params.query); - this.searchTerms = metadataService.getSearchTerms(subqueries); this.termMetadataForCombinedDocumentIds = metadataService.getTermMetadataForDocuments(ids, searchTerms.termIdsAll); } @@ -65,68 +70,39 @@ public class IndexResultValuationContext { long docMetadata = statefulIndex.getDocumentMetadata(docId); int htmlFeatures = statefulIndex.getHtmlFeatures(docId); - int maxFlagsCount = 0; - boolean anyAllSynthetic = false; - int maxPositionsSet = 0; + SearchResultItem searchResult = new SearchResultItem(docId); - SearchResultItem searchResult = new SearchResultItem(docId, - 
searchTermVariants.stream().mapToInt(List::size).sum()); + SearchResultKeywordScore[] scores = compiledQuery.indices().mapToObj(idx -> + new SearchResultKeywordScore( + compiledQuery.at(idx), + compiledQueryIds.at(idx), + termMetadataForCombinedDocumentIds.getTermMetadata( + compiledQueryIds.at(idx), combinedId + ), + docMetadata, + htmlFeatures) + ) + .toArray(SearchResultKeywordScore[]::new); - for (int querySetId = 0; - querySetId < searchTermVariants.size(); - querySetId++) - { - var termList = searchTermVariants.get(querySetId); + // DANGER: IndexResultValuatorService assumes that searchResult.keywordScores has this specific order, as it needs + // to be able to re-construct its own CompiledQuery for re-ranking the results. This is + // a very flimsy assumption. + searchResult.keywordScores.addAll(List.of(scores)); - SearchResultKeywordScore[] termScoresForSet = new SearchResultKeywordScore[termList.size()]; + CompiledQuery queryGraphScores = new CompiledQuery<>(compiledQuery.root, scores); - boolean synthetic = true; + boolean allSynthetic = !CompiledQueryAggregates.booleanAggregate(queryGraphScores, score -> !score.hasTermFlag(WordFlags.Synthetic)); + int flagsCount = CompiledQueryAggregates.intMaxMinAggregate(queryGraphScores, score -> Long.bitCount(score.encodedWordMetadata() & flagsFilterMask)); + int positionsCount = CompiledQueryAggregates.intMaxMinAggregate(queryGraphScores, SearchResultKeywordScore::positionCount); - for (int termIdx = 0; termIdx < termList.size(); termIdx++) { - String searchTerm = termList.get(termIdx); - - long termMetadata = termMetadataForCombinedDocumentIds.getTermMetadata( - searchTerms.getIdForTerm(searchTerm), - combinedId - ); - - var score = new SearchResultKeywordScore( - querySetId, - searchTerm, - termMetadata, - docMetadata, - htmlFeatures - ); - - synthetic &= WordFlags.Synthetic.isPresent(termMetadata); - - searchResult.keywordScores.add(score); - - termScoresForSet[termIdx] = score; - } - - if 
(!meetsQueryStrategyRequirements(termScoresForSet, queryParams.queryStrategy())) { - continue; - } - - int minFlagsCount = 8; - int minPositionsSet = 4; - - for (var termScore : termScoresForSet) { - final int flagCount = Long.bitCount(termScore.encodedWordMetadata() & flagsFilterMask); - minFlagsCount = Math.min(minFlagsCount, flagCount); - minPositionsSet = Math.min(minPositionsSet, termScore.positionCount()); - } - - maxFlagsCount = Math.max(maxFlagsCount, minFlagsCount); - maxPositionsSet = Math.max(maxPositionsSet, minPositionsSet); - anyAllSynthetic |= synthetic; + if (!meetsQueryStrategyRequirements(queryGraphScores, queryParams.queryStrategy())) { + return null; } - if (maxFlagsCount == 0 && !anyAllSynthetic && maxPositionsSet == 0) + if (flagsCount == 0 && !allSynthetic && positionsCount == 0) return null; - double score = searchResultValuator.calculateSearchResultValue(searchResult.keywordScores, + double score = searchResultValuator.calculateSearchResultValue(queryGraphScores, 5000, // use a dummy value here as it's not present in the index rankingContext); @@ -135,20 +111,17 @@ public class IndexResultValuationContext { return searchResult; } - private boolean meetsQueryStrategyRequirements(SearchResultKeywordScore[] termSet, QueryStrategy queryStrategy) { + private boolean meetsQueryStrategyRequirements(CompiledQuery queryGraphScores, + QueryStrategy queryStrategy) + { if (queryStrategy == QueryStrategy.AUTO || queryStrategy == QueryStrategy.SENTENCE || queryStrategy == QueryStrategy.TOPIC) { return true; } - for (var keyword : termSet) { - if (!meetsQueryStrategyRequirements(keyword, queryParams.queryStrategy())) { - return false; - } - } - - return true; + return CompiledQueryAggregates.booleanAggregate(queryGraphScores, + docs -> meetsQueryStrategyRequirements(docs, queryParams.queryStrategy())); } private boolean meetsQueryStrategyRequirements(SearchResultKeywordScore termScore, QueryStrategy queryStrategy) { diff --git 
a/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java b/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java index 51e59c63..f1dabea4 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java @@ -4,10 +4,11 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import gnu.trove.list.TLongList; import gnu.trove.list.array.TLongArrayList; -import it.unimi.dsi.fastutil.longs.LongArrayList; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.SearchResultItem; +import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.model.SearchParameters; import nu.marginalia.index.results.model.ids.CombinedDocIdList; @@ -19,8 +20,6 @@ import org.slf4j.LoggerFactory; import java.sql.SQLException; import java.util.*; -import java.util.function.Consumer; -import java.util.stream.Collectors; @Singleton public class IndexResultValuatorService { @@ -44,8 +43,8 @@ public class IndexResultValuatorService { } public List rankResults(SearchParameters params, - ResultRankingContext rankingContext, - CombinedDocIdList resultIds) + ResultRankingContext rankingContext, + CombinedDocIdList resultIds) { final var evaluator = createValuationContext(params, rankingContext, resultIds); @@ -70,8 +69,7 @@ public class IndexResultValuatorService { resultIds, statefulIndex, rankingContext, - params.subqueries, - params.queryParams); + params); } @@ -96,12 +94,13 @@ public class IndexResultValuatorService { item.resultsFromDomain = domainCountFilter.getCount(item); } - return decorateAndRerank(resultsList, rankingContext); + 
return decorateAndRerank(resultsList, params.compiledQuery, rankingContext); } /** Decorate the result items with additional information from the link database * and calculate an updated ranking with the additional information */ public List decorateAndRerank(List rawResults, + CompiledQuery compiledQuery, ResultRankingContext rankingContext) throws SQLException { @@ -125,13 +124,22 @@ continue; } - resultItems.add(createCombinedItem(result, docData, rankingContext)); + // Reconstruct the SearchResultKeywordScore CompiledQuery for re-valuation + // + // CAVEAT: This hinges on a very fragile assumption that IndexResultValuationContext puts them in the same + // order as the data for the CompiledQuery. + CompiledQuery resultQuery = + new CompiledQuery<>(compiledQuery.root, result.keywordScores.toArray(SearchResultKeywordScore[]::new)); + + + resultItems.add(createCombinedItem(result, docData, resultQuery, rankingContext)); } return resultItems; } private DecoratedSearchResultItem createCombinedItem(SearchResultItem result, DocdbUrlDetail docData, + CompiledQuery resultQuery, ResultRankingContext rankingContext) { return new DecoratedSearchResultItem( result, @@ -144,7 +152,7 @@ docData.pubYear(), docData.dataHash(), docData.wordsTotal(), - resultValuator.calculateSearchResultValue(result.keywordScores, docData.wordsTotal(), rankingContext) + resultValuator.calculateSearchResultValue(resultQuery, docData.wordsTotal(), rankingContext) ); } diff --git a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java index 6c67559d..05ff83d2 100644 --- a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java +++ b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java @@ -1,5 +1,6 @@ package nu.marginalia.ranking.results; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; import 
nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; @@ -33,14 +34,17 @@ public class ResultValuator { this.termCoherenceFactor = termCoherenceFactor; } - public double calculateSearchResultValue(List scores, + public double calculateSearchResultValue(CompiledQuery scores, int length, ResultRankingContext ctx) { - int sets = numberOfSets(scores); + if (scores.size() == 0) + return Double.MAX_VALUE; + if (length < 0) + length = 5000; - long documentMetadata = documentMetadata(scores); - int features = htmlFeatures(scores); + long documentMetadata = scores.at(0).encodedDocMetadata(); + int features = scores.at(0).htmlFeatures(); var rankingParams = ctx.params; int rank = DocumentMetadata.decodeRank(documentMetadata); @@ -75,32 +79,16 @@ public class ResultValuator { + temporalBias + flagsPenalty; - double bestTcf = 0; - double bestBM25F = 0; - double bestBM25P = 0; - double bestBM25PN = 0; - - for (int set = 0; set < sets; set++) { - ResultKeywordSet keywordSet = createKeywordSet(scores, set); - - if (keywordSet.isEmpty()) - continue; - - bestTcf = Math.max(bestTcf, rankingParams.tcfWeight * termCoherenceFactor.calculate(keywordSet)); - bestBM25P = Math.max(bestBM25P, rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, keywordSet, ctx)); - bestBM25F = Math.max(bestBM25F, rankingParams.bm25FullWeight * bm25Factor.calculateBm25(rankingParams.fullParams, keywordSet, length, ctx)); - if (keywordSet.hasNgram()) { - bestBM25PN = Math.max(bestBM25PN, rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, keywordSet, ctx)); - } - } - + double bestTcf = rankingParams.tcfWeight * termCoherenceFactor.calculate(scores); + double bestBM25F = rankingParams.bm25FullWeight * bm25Factor.calculateBm25(rankingParams.prioParams, scores, length, ctx); + double 
bestBM25P = rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, scores, ctx); double overallPartPositive = Math.max(0, overallPart); double overallPartNegative = -Math.min(0, overallPart); // Renormalize to 0...15, where 0 is the best possible score; // this is a historical artifact of the original ranking function - return normalize(1.5 * bestTcf + bestBM25F + bestBM25P + 0.25 * bestBM25PN + overallPartPositive, overallPartNegative); + return normalize(1.5 * bestTcf + bestBM25F + bestBM25P + overallPartPositive, overallPartNegative); } private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) { diff --git a/code/index/java/nu/marginalia/ranking/results/factors/Bm25Factor.java b/code/index/java/nu/marginalia/ranking/results/factors/Bm25Factor.java index 335b5fa8..bc13671e 100644 --- a/code/index/java/nu/marginalia/ranking/results/factors/Bm25Factor.java +++ b/code/index/java/nu/marginalia/ranking/results/factors/Bm25Factor.java @@ -1,10 +1,11 @@ package nu.marginalia.ranking.results.factors; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; import nu.marginalia.api.searchquery.model.results.Bm25Parameters; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.model.idx.WordFlags; -import nu.marginalia.ranking.results.ResultKeywordSet; public class Bm25Factor { private static final int AVG_LENGTH = 5000; @@ -13,43 +14,33 @@ public class Bm25Factor { * * @see Bm25Parameters */ - public double calculateBm25(Bm25Parameters bm25Parameters, ResultKeywordSet keywordSet, int length, ResultRankingContext ctx) { + public double calculateBm25(Bm25Parameters bm25Parameters, CompiledQuery scores, int length, ResultRankingContext ctx) { final int docCount = ctx.termFreqDocCount(); - 
if (length <= 0) - length = AVG_LENGTH; - - double sum = 0.; - - for (var keyword : keywordSet.keywords()) { + return CompiledQueryAggregates.doubleSumAggregate(scores, keyword -> { double count = keyword.positionCount(); int freq = ctx.frequency(keyword.keyword); - sum += invFreq(docCount, freq) * f(bm25Parameters.k(), bm25Parameters.b(), count, length); - } - - return sum; + return invFreq(docCount, freq) * f(bm25Parameters.k(), bm25Parameters.b(), count, length); + }); } /** Bm25 calculation, except instead of counting positions in the document, * the number of relevance signals for the term is counted instead. */ - public double calculateBm25Prio(Bm25Parameters bm25Parameters, ResultKeywordSet keywordSet, ResultRankingContext ctx) { + public double calculateBm25Prio(Bm25Parameters bm25Parameters, CompiledQuery scores, ResultRankingContext ctx) { final int docCount = ctx.termFreqDocCount(); - double sum = 0.; - - for (var keyword : keywordSet.keywords()) { + return CompiledQueryAggregates.doubleSumAggregate(scores, keyword -> { double count = evaluatePriorityScore(keyword); int freq = ctx.priorityFrequency(keyword.keyword); // note we override b to zero for priority terms as they are independent of document length - sum += invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0); - } + return invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0); + }); - return sum; } private static double evaluatePriorityScore(SearchResultKeywordScore keyword) { diff --git a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java b/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java index f956ce88..71159c58 100644 --- a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java +++ b/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java @@ -1,14 +1,16 @@ package nu.marginalia.ranking.results.factors; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; 
+import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; +import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.model.idx.WordMetadata; -import nu.marginalia.ranking.results.ResultKeywordSet; /** Rewards documents where terms appear frequently within the same sentences */ public class TermCoherenceFactor { - public double calculate(ResultKeywordSet keywordSet) { - long mask = combinedMask(keywordSet); + public double calculate(CompiledQuery scores) { + long mask = CompiledQueryAggregates.longBitmaskAggregate(scores, score -> score.positions() & WordMetadata.POSITIONS_MASK); return bitsSetFactor(mask); } @@ -19,14 +21,5 @@ public class TermCoherenceFactor { return Math.pow(bitsSetInMask/(float) WordMetadata.POSITIONS_COUNT, 0.25); } - long combinedMask(ResultKeywordSet keywordSet) { - long mask = WordMetadata.POSITIONS_MASK; - - for (var keyword : keywordSet.keywords()) { - mask &= keyword.positions(); - } - - return mask; - } } \ No newline at end of file diff --git a/code/index/query/java/nu/marginalia/index/query/IndexQueryBuilder.java b/code/index/query/java/nu/marginalia/index/query/IndexQueryBuilder.java index 68a88625..74ebdea1 100644 --- a/code/index/query/java/nu/marginalia/index/query/IndexQueryBuilder.java +++ b/code/index/query/java/nu/marginalia/index/query/IndexQueryBuilder.java @@ -2,6 +2,8 @@ package nu.marginalia.index.query; import nu.marginalia.index.query.filter.QueryFilterStepIf; +import java.util.List; + /** Builds a query. *

    * Note: The query builder may omit predicates that are deemed redundant. @@ -21,6 +23,7 @@ public interface IndexQueryBuilder { IndexQueryBuilder notFull(long termId); IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep); + IndexQueryBuilder addInclusionFilterAny(List filterStep); IndexQuery build(); } diff --git a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAllOf.java b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAllOf.java new file mode 100644 index 00000000..8c20fe98 --- /dev/null +++ b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAllOf.java @@ -0,0 +1,57 @@ +package nu.marginalia.index.query.filter; + +import nu.marginalia.array.buffer.LongQueryBuffer; + +import java.util.List; +import java.util.StringJoiner; + +public class QueryFilterAllOf implements QueryFilterStepIf { + private final List steps; + + public QueryFilterAllOf(List steps) { + this.steps = steps; + } + + public double cost() { + double prod = 1.; + + for (var step : steps) { + double cost = step.cost(); + if (cost > 1.0) { + prod *= Math.log(cost); + } + else { + prod += cost; + } + } + + return prod; + } + + @Override + public boolean test(long value) { + for (var step : steps) { + if (!step.test(value)) + return false; + } + return true; + } + + + public void apply(LongQueryBuffer buffer) { + if (steps.isEmpty()) + return; + + for (var step : steps) { + step.apply(buffer); + } + } + + public String describe() { + StringJoiner sj = new StringJoiner(",", "[All Of: ", "]"); + for (var step : steps) { + sj.add(step.describe()); + } + return sj.toString(); + } +} diff --git a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAnyOf.java b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAnyOf.java index c9ee2c6e..2d177645 100644 --- a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAnyOf.java +++ 
b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAnyOf.java @@ -2,7 +2,6 @@ package nu.marginalia.index.query.filter; import nu.marginalia.array.buffer.LongQueryBuffer; -import java.util.Arrays; import java.util.List; import java.util.StringJoiner; @@ -14,7 +13,7 @@ public class QueryFilterAnyOf implements QueryFilterStepIf { } public double cost() { - return steps.stream().mapToDouble(QueryFilterStepIf::cost).average().orElse(0.); + return steps.stream().mapToDouble(QueryFilterStepIf::cost).sum(); } @Override @@ -31,31 +30,23 @@ public class QueryFilterAnyOf implements QueryFilterStepIf { if (steps.isEmpty()) return; - int start; + int start = 0; int end = buffer.end; - steps.getFirst().apply(buffer); - - // The filter functions will partition the data in the buffer from 0 to END, - // and update END to the length of the retained items, keeping the retained - // items sorted but making no guarantees about the rejected half - // - // Therefore, we need to re-sort the rejected side, and to satisfy the - // constraint that the data is sorted up to END, finally sort it again. - // - // This sorting may seem like it's slower, but filter.apply(...) 
is - // typically much faster than iterating over filter.test(...); so this - // is more than made up for - - for (int fi = 1; fi < steps.size(); fi++) + for (var step : steps) { - start = buffer.end; - Arrays.sort(buffer.data, start, end); - buffer.startFilterForRange(start, end); - steps.get(fi).apply(buffer); + var slice = buffer.slice(start, end); + slice.data.quickSort(0, slice.size()); + + step.apply(slice); + start += slice.end; } - Arrays.sort(buffer.data, 0, buffer.end); + buffer.data.quickSort(0, start); + + // Special finalization + buffer.reset(); + buffer.end = start; } public String describe() { diff --git a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterLetThrough.java b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterLetThrough.java index ed02dd6d..77f503cf 100644 --- a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterLetThrough.java +++ b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterLetThrough.java @@ -16,7 +16,7 @@ public class QueryFilterLetThrough implements QueryFilterStepIf { } public double cost() { - return 0.; + return 1.; } public String describe() { diff --git a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterNoPass.java b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterNoPass.java index 1bcd04ae..502e7c4c 100644 --- a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterNoPass.java +++ b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterNoPass.java @@ -15,7 +15,7 @@ public class QueryFilterNoPass implements QueryFilterStepIf { } public double cost() { - return 0.; + return 1.; } public String describe() { diff --git a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterStepExcludeFromPredicate.java b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterStepExcludeFromPredicate.java index 92c8c972..0d715863 100644 --- 
a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterStepExcludeFromPredicate.java +++ b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterStepExcludeFromPredicate.java @@ -16,7 +16,7 @@ public class QueryFilterStepExcludeFromPredicate implements QueryFilterStepIf { @Override public double cost() { - return 0; + return 1; } @Override diff --git a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterStepFromPredicate.java b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterStepFromPredicate.java index 56f08b71..9cd51d7a 100644 --- a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterStepFromPredicate.java +++ b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterStepFromPredicate.java @@ -16,7 +16,7 @@ public class QueryFilterStepFromPredicate implements QueryFilterStepIf { @Override public double cost() { - return 0; + return 1; } @Override diff --git a/code/index/query/test/nu/marginalia/index/query/filter/QueryFilterStepIfTest.java b/code/index/query/test/nu/marginalia/index/query/filter/QueryFilterStepIfTest.java index a7450b11..b2ef1bdb 100644 --- a/code/index/query/test/nu/marginalia/index/query/filter/QueryFilterStepIfTest.java +++ b/code/index/query/test/nu/marginalia/index/query/filter/QueryFilterStepIfTest.java @@ -55,6 +55,32 @@ class QueryFilterStepIfTest { assertArrayEquals(new long[]{8, 10}, buffer.copyData()); } + @Test + public void testSuccessiveApplicationWithAllOf() { + var buffer = createBuffer(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); + var filter1 = new QueryFilterStepFromPredicate(value -> value % 2 == 0); + var filter2 = new QueryFilterStepExcludeFromPredicate(value -> value <= 6); + new QueryFilterAllOf(List.of(filter1, filter2)).apply(buffer); + assertArrayEquals(new long[]{8, 10}, buffer.copyData()); + } + @Test + public void testCombinedOrAnd() { + var buffer = createBuffer(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); + + var filter1 = new QueryFilterStepFromPredicate(value 
-> value % 2 == 0); + var filter2 = new QueryFilterStepFromPredicate(value -> value <= 5); + var filter1_2 = new QueryFilterAllOf(List.of(filter1, filter2)); + + var filter3 = new QueryFilterStepFromPredicate(value -> value % 2 == 1); + var filter4 = new QueryFilterStepFromPredicate(value -> value > 5); + var filter3_4 = new QueryFilterAllOf(List.of(filter3, filter4)); + + var filter12_34 = new QueryFilterAnyOf(List.of(filter1_2, filter3_4)); + + filter12_34.apply(buffer); + + assertArrayEquals(new long[]{2, 4, 7, 9}, buffer.copyData()); + } @Test public void testCombinedApplication() { var buffer = createBuffer(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); diff --git a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java index 634481f4..301b5e19 100644 --- a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java +++ b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java @@ -5,7 +5,7 @@ import com.google.inject.Inject; import lombok.SneakyThrows; import nu.marginalia.IndexLocations; import nu.marginalia.api.searchquery.model.query.SearchSpecification; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.process.control.FakeProcessHeartbeat; @@ -123,9 +123,10 @@ public class IndexQueryServiceIntegrationSmokeTest { .rankingParams(ResultRankingParameters.sensibleDefaults()) .domains(new ArrayList<>()) .searchSetIdentifier("NONE") - .subqueries(List.of(new SearchSubquery( + .query(new SearchQuery( + "2 3 5", List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(), - Collections.emptyList()))).build()); + Collections.emptyList())).build()); int[] idxes = new int[] { 30, 510, 90, 
150, 210, 270, 330, 390, 450 }; long[] ids = IntStream.of(idxes).mapToLong(this::fullId).toArray(); @@ -166,9 +167,13 @@ public class IndexQueryServiceIntegrationSmokeTest { .rankingParams(ResultRankingParameters.sensibleDefaults()) .queryStrategy(QueryStrategy.SENTENCE) .domains(List.of(2)) - .subqueries(List.of(new SearchSubquery( - List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(), - Collections.emptyList()))).build()); + .query(new SearchQuery( + "2 3 5", + List.of("3", "5", "2"), + List.of("4"), + Collections.emptyList(), + Collections.emptyList(), + Collections.emptyList())).build()); int[] idxes = new int[] { 210, 270 }; long[] ids = IntStream.of(idxes).mapToLong(id -> UrlIdCodec.encodeId(id/100, id)).toArray(); long[] actual = rsp.results.stream().mapToLong(i -> i.rawIndexResult.getDocumentId()).toArray(); @@ -202,9 +207,8 @@ public class IndexQueryServiceIntegrationSmokeTest { .queryStrategy(QueryStrategy.SENTENCE) .searchSetIdentifier("NONE") .rankingParams(ResultRankingParameters.sensibleDefaults()) - .subqueries(List.of(new SearchSubquery( - List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), - Collections.emptyList())) + .query( + new SearchQuery("4", List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), Collections.emptyList()) ).build()); diff --git a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java index 6def5bbc..e29f8751 100644 --- a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java +++ b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java @@ -4,7 +4,7 @@ import com.google.inject.Guice; import com.google.inject.Inject; import nu.marginalia.IndexLocations; import nu.marginalia.api.searchquery.model.query.SearchSpecification; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import 
nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.storage.FileStorageService; @@ -35,6 +35,7 @@ import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.index.domainrankings.DomainRankings; import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.service.server.Initialization; +import org.apache.logging.log4j.util.Strings; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -108,7 +109,7 @@ public class IndexQueryServiceIntegrationTest { w("world", WordFlags.Title) ).load(); - var query = basicQuery(builder -> builder.subqueries(justInclude("hello", "world"))); + var query = basicQuery(builder -> builder.query(justInclude("hello", "world"))); executeSearch(query) .expectDocumentsInOrder(d(1,1)); @@ -127,57 +128,51 @@ public class IndexQueryServiceIntegrationTest { ).load(); var queryMissingExclude = basicQuery(builder -> - builder.subqueries(includeAndExclude("hello", "missing"))); + builder.query(includeAndExclude("hello", "missing"))); executeSearch(queryMissingExclude) .expectDocumentsInOrder(d(1,1)); var queryMissingInclude = basicQuery(builder -> - builder.subqueries(justInclude("missing"))); + builder.query(justInclude("missing"))); executeSearch(queryMissingInclude) .expectCount(0); var queryMissingPriority = basicQuery(builder -> - builder.subqueries( - List.of( - new SearchSubquery( - List.of("hello"), - List.of(), - List.of(), - List.of("missing"), - List.of() - ) - ))); + builder.query(new SearchQuery( + "hello", + List.of("hello"), + List.of(), + List.of(), + List.of("missing"), + List.of()) + )); executeSearch(queryMissingPriority) .expectCount(1); var queryMissingAdvice = basicQuery(builder -> - builder.subqueries( - List.of( - new SearchSubquery( - List.of("hello"), - List.of(), - List.of("missing"), 
- List.of(), - List.of() - ) + builder.query( + new SearchQuery("hello", + List.of("hello"), + List.of(), + List.of("missing"), + List.of(), + List.of() ))); executeSearch(queryMissingAdvice) .expectCount(0); var queryMissingCoherence = basicQuery(builder -> - builder.subqueries( - List.of( - new SearchSubquery( - List.of("hello"), - List.of(), - List.of(), - List.of(), - List.of(List.of("missing", "hello")) - ) + builder.query( + new SearchQuery("hello", + List.of("hello"), + List.of(), + List.of(), + List.of(), + List.of(List.of("missing", "hello")) ))); executeSearch(queryMissingCoherence) @@ -202,7 +197,7 @@ public class IndexQueryServiceIntegrationTest { ).load(); - var query = basicQuery(builder -> builder.subqueries(justInclude("hello", "world"))); + var query = basicQuery(builder -> builder.query(justInclude("hello", "world"))); executeSearch(query) .expectDocumentsInOrder(d(1,1)); @@ -234,15 +229,15 @@ public class IndexQueryServiceIntegrationTest { var beforeY2K = basicQuery(builder -> - builder.subqueries(justInclude("hello", "world")) + builder.query(justInclude("hello", "world")) .year(SpecificationLimit.lessThan(2000)) ); var atY2K = basicQuery(builder -> - builder.subqueries(justInclude("hello", "world")) + builder.query(justInclude("hello", "world")) .year(SpecificationLimit.equals(2000)) ); var afterY2K = basicQuery(builder -> - builder.subqueries(justInclude("hello", "world")) + builder.query(justInclude("hello", "world")) .year(SpecificationLimit.greaterThan(2000)) ); @@ -296,11 +291,11 @@ public class IndexQueryServiceIntegrationTest { var domain1 = basicQuery(builder -> - builder.subqueries(justInclude("hello", "world")) + builder.query(justInclude("hello", "world")) .domains(List.of(1)) ); var domain2 = basicQuery(builder -> - builder.subqueries(justInclude("hello", "world")) + builder.query(justInclude("hello", "world")) .domains(List.of(2)) ); @@ -334,7 +329,7 @@ public class IndexQueryServiceIntegrationTest { ).load(); var query = 
basicQuery(builder -> - builder.subqueries(includeAndExclude("hello", "my_darling")) + builder.query(includeAndExclude("hello", "my_darling")) ); executeSearch(query) @@ -403,7 +398,7 @@ public class IndexQueryServiceIntegrationTest { .load(); var rsp = queryService.justQuery( - basicQuery(builder -> builder.subqueries( + basicQuery(builder -> builder.query( // note coherence requriement includeAndCohere("hello", "world") ))); @@ -424,50 +419,53 @@ public class IndexQueryServiceIntegrationTest { .rank(SpecificationLimit.none()) .rankingParams(ResultRankingParameters.sensibleDefaults()) .domains(new ArrayList<>()) - .searchSetIdentifier("NONE") - .subqueries(List.of()); + .searchSetIdentifier("NONE"); return mutator.apply(builder).build(); } - List justInclude(String... includes) { - return List.of(new SearchSubquery( + SearchQuery justInclude(String... includes) { + return new SearchQuery( + Strings.join(List.of(includes), ' '), List.of(includes), List.of(), List.of(), List.of(), List.of() - )); + ); } - List includeAndExclude(List includes, List excludes) { - return List.of(new SearchSubquery( + SearchQuery includeAndExclude(List includes, List excludes) { + return new SearchQuery( + Strings.join(List.of(includes), ' '), includes, excludes, List.of(), List.of(), List.of() - )); + ); } - List includeAndExclude(String include, String exclude) { - return List.of(new SearchSubquery( + SearchQuery includeAndExclude(String include, String exclude) { + return new SearchQuery( + include, List.of(include), List.of(exclude), List.of(), List.of(), List.of() - )); + ); } - List includeAndCohere(String... includes) { - return List.of(new SearchSubquery( + SearchQuery includeAndCohere(String... 
includes) { + return new SearchQuery( + Strings.join(List.of(includes), ' '), List.of(includes), List.of(), List.of(), List.of(), List.of(List.of(includes)) - )); + ); } private MockDataDocument d(int domainId, int ordinal) { return new MockDataDocument(domainId, ordinal); diff --git a/code/index/test/nu/marginalia/index/index/QueryBranchWalkerTest.java b/code/index/test/nu/marginalia/index/index/QueryBranchWalkerTest.java new file mode 100644 index 00000000..8d2f45c8 --- /dev/null +++ b/code/index/test/nu/marginalia/index/index/QueryBranchWalkerTest.java @@ -0,0 +1,59 @@ +package nu.marginalia.index.index; + +import it.unimi.dsi.fastutil.longs.LongArraySet; +import it.unimi.dsi.fastutil.longs.LongSet; +import org.junit.jupiter.api.Test; + +import java.util.Arrays; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.junit.jupiter.api.Assertions.*; + +class QueryBranchWalkerTest { + @Test + public void testNoOverlap() { + var paths = QueryBranchWalker.create( + new long[] { 1, 2 }, + List.of(set(1), set(2)) + ); + assertEquals(2, paths.size()); + assertEquals(Set.of(1L, 2L), paths.stream().map(path -> path.termId).collect(Collectors.toSet())); + } + + @Test + public void testCond() { + var paths = QueryBranchWalker.create( + new long[] { 1, 2, 3, 4 }, + List.of(set(1,2,3), set(1,4,3)) + ); + assertEquals(1, paths.size()); + assertEquals(Set.of(1L), paths.stream().map(path -> path.termId).collect(Collectors.toSet())); + System.out.println(Arrays.toString(paths.getFirst().priorityOrder)); + assertArrayEquals(new long[] { 2, 3, 4 }, paths.getFirst().priorityOrder); + + var next = paths.getFirst().next(); + assertEquals(2, next.size()); + assertEquals(Set.of(2L, 3L), next.stream().map(path -> path.termId).collect(Collectors.toSet())); + Map byId = next.stream().collect(Collectors.toMap(w -> w.termId, w->w)); + assertArrayEquals(new long[] { 3L }, byId.get(2L).priorityOrder ); + 
assertArrayEquals(new long[] { 4L }, byId.get(3L).priorityOrder ); + } + + @Test + public void testNoOverlapFirst() { + var paths = QueryBranchWalker.create( + new long[] { 1, 2, 3 }, + List.of(set(1, 2), set(1, 3)) + ); + assertEquals(1, paths.size()); + assertArrayEquals(new long[] { 2, 3 }, paths.getFirst().priorityOrder); + assertEquals(Set.of(1L), paths.stream().map(path -> path.termId).collect(Collectors.toSet())); + } + + LongSet set(long... args) { + return new LongArraySet(args); + } +} \ No newline at end of file diff --git a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java index 4f5a12cd..948c5857 100644 --- a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java +++ b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java @@ -2,9 +2,10 @@ package nu.marginalia.index.results; import nu.marginalia.api.searchquery.model.results.SearchResultItem; import nu.marginalia.model.id.UrlIdCodec; -import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Test; +import java.util.List; + import static org.junit.jupiter.api.Assertions.*; class IndexResultDomainDeduplicatorTest { @@ -24,7 +25,7 @@ class IndexResultDomainDeduplicatorTest { } SearchResultItem forId(int domain, int ordinal) { - return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 4); + return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), List.of(), 4, Double.NaN); } } \ No newline at end of file diff --git a/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java b/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java index 8f8f7eaa..243ae90d 100644 --- a/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java +++ b/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java @@ -1,5 +1,6 @@ package nu.marginalia.ranking.results; +import 
nu.marginalia.api.searchquery.model.compiled.CompiledQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; @@ -35,21 +36,21 @@ class ResultValuatorTest { ); } - List titleOnlyLowCountSet = List.of( - new SearchResultKeywordScore(0, "bob", + CompiledQuery titleOnlyLowCountSet = CompiledQuery.just( + new SearchResultKeywordScore("bob", 1, wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title)), docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)), 0) ); - List highCountNoTitleSet = List.of( - new SearchResultKeywordScore(0, "bob", + CompiledQuery highCountNoTitleSet = CompiledQuery.just( + new SearchResultKeywordScore("bob", 1, wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh)), docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)), 0) ); - List highCountSubjectSet = List.of( - new SearchResultKeywordScore(0, "bob", + CompiledQuery highCountSubjectSet = CompiledQuery.just( + new SearchResultKeywordScore("bob", 1, wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects)), docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)), 0) @@ -75,7 +76,10 @@ class ResultValuatorTest { System.out.println(highCountSubject); } - private long docMetadata(int topology, int year, int quality, EnumSet flags) { + private long docMetadata(int topology, + int year, + int quality, + EnumSet flags) { return new DocumentMetadata(topology, PubDate.toYearByte(year), quality, flags).encode(); } diff --git a/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java b/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java index a5bca54e..028896d9 100644 --- a/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java +++ 
b/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java @@ -1,9 +1,10 @@ package nu.marginalia.ranking.results.factors; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.bbpc.BrailleBlockPunchCards; import nu.marginalia.model.idx.WordMetadata; -import nu.marginalia.ranking.results.ResultKeywordSet; import org.junit.jupiter.api.Test; import java.util.ArrayList; @@ -20,7 +21,7 @@ class TermCoherenceFactorTest { WordMetadata.POSITIONS_MASK, WordMetadata.POSITIONS_MASK ); - long mask = termCoherenceFactor.combinedMask(allPositionsSet); + long mask = CompiledQueryAggregates.longBitmaskAggregate(allPositionsSet, score -> score.positions() & WordMetadata.POSITIONS_MASK); assertEquals(1.0, termCoherenceFactor.bitsSetFactor(mask), 0.01); @@ -33,7 +34,7 @@ class TermCoherenceFactorTest { 0, 0 ); - long mask = termCoherenceFactor.combinedMask(allPositionsSet); + long mask = CompiledQueryAggregates.longBitmaskAggregate(allPositionsSet, score -> score.positions() & WordMetadata.POSITIONS_MASK); assertEquals(0, termCoherenceFactor.bitsSetFactor(mask), 0.01); @@ -46,7 +47,7 @@ class TermCoherenceFactorTest { List.of(0, 1, 2, 3), List.of(0, 1, 2, 3) ); - long mask = termCoherenceFactor.combinedMask(positions); + long mask = CompiledQueryAggregates.longBitmaskAggregate(positions, score -> score.positions() & WordMetadata.POSITIONS_MASK); printMask(mask); } @@ -57,7 +58,7 @@ class TermCoherenceFactorTest { List.of(55, 54, 53, 52), List.of(55, 54, 53, 52) ); - long mask = termCoherenceFactor.combinedMask(positions); + long mask = CompiledQueryAggregates.longBitmaskAggregate(positions, score -> score.positions() & WordMetadata.POSITIONS_MASK); printMask(mask); } @@ -72,7 +73,7 @@ class TermCoherenceFactorTest { 
System.out.println(BrailleBlockPunchCards.printBits(mask, 48)); } - ResultKeywordSet createSet(List... maskPositions) { + CompiledQuery createSet(List... maskPositions) { long[] positions = new long[maskPositions.length]; for (int i = 0; i < maskPositions.length; i++) { @@ -84,14 +85,14 @@ class TermCoherenceFactorTest { return createSet(positions); } - ResultKeywordSet createSet(long... positionMasks) { + CompiledQuery createSet(long... positionMasks) { List keywords = new ArrayList<>(); for (int i = 0; i < positionMasks.length; i++) { - keywords.add(new SearchResultKeywordScore(0, "", + keywords.add(new SearchResultKeywordScore("", 0, new WordMetadata(positionMasks[i], (byte) 0).encode(), 0, 0)); } - return new ResultKeywordSet(keywords); + return CompiledQuery.just(keywords.toArray(SearchResultKeywordScore[]::new)); } } \ No newline at end of file diff --git a/code/libraries/array/java/nu/marginalia/array/algo/LongArrayBase.java b/code/libraries/array/java/nu/marginalia/array/algo/LongArrayBase.java index 39d9bff7..ab7f18bd 100644 --- a/code/libraries/array/java/nu/marginalia/array/algo/LongArrayBase.java +++ b/code/libraries/array/java/nu/marginalia/array/algo/LongArrayBase.java @@ -1,5 +1,7 @@ package nu.marginalia.array.algo; +import nu.marginalia.array.LongArray; + import java.io.IOException; import java.nio.LongBuffer; import java.nio.channels.FileChannel; @@ -61,6 +63,12 @@ public interface LongArrayBase extends BulkTransferArray { } } + default void get(long start, long end, LongArray buffer, int bufferStart) { + for (int i = 0; i < (end-start); i++) { + buffer.set(i + bufferStart, get(start + i)); + } + } + default void get(long start, LongBuffer buffer) { get(start, start + buffer.remaining(), buffer, buffer.position()); } diff --git a/code/libraries/array/java/nu/marginalia/array/buffer/LongQueryBuffer.java b/code/libraries/array/java/nu/marginalia/array/buffer/LongQueryBuffer.java index 390325ee..d5b44389 100644 --- 
a/code/libraries/array/java/nu/marginalia/array/buffer/LongQueryBuffer.java +++ b/code/libraries/array/java/nu/marginalia/array/buffer/LongQueryBuffer.java @@ -1,5 +1,8 @@ package nu.marginalia.array.buffer; +import nu.marginalia.array.LongArray; +import nu.marginalia.array.LongArrayFactory; + import java.util.Arrays; /** A buffer for long values that can be used to filter and manipulate the data. @@ -17,7 +20,7 @@ import java.util.Arrays; public class LongQueryBuffer { /** Direct access to the data in the buffer, * guaranteed to be populated until `end` */ - public final long[] data; + public final LongArray data; /** Number of items in the data buffer */ public int end; @@ -25,18 +28,27 @@ public class LongQueryBuffer { private int read = 0; private int write = 0; + private LongQueryBuffer(LongArray array, int size) { + this.data = array; + this.end = size; + } + public LongQueryBuffer(int size) { - this.data = new long[size]; + this.data = LongArrayFactory.onHeapConfined(size); this.end = size; } public LongQueryBuffer(long[] data, int size) { - this.data = data; + this.data = LongArrayFactory.onHeapConfined(size); + this.data.set(0, data); + this.end = size; } public long[] copyData() { - return Arrays.copyOf(data, end); + long[] copy = new long[end]; + data.forEach(0, end, (pos, val) -> copy[(int)pos]=val ); + return copy; } public boolean isEmpty() { @@ -48,7 +60,7 @@ public class LongQueryBuffer { } public void reset() { - end = data.length; + end = (int) data.size(); read = 0; write = 0; } @@ -59,12 +71,16 @@ public class LongQueryBuffer { write = 0; } + public LongQueryBuffer slice(int start, int end) { + return new LongQueryBuffer(data.range(start, end), end - start); + } + /* == Filtering methods == */ /** Returns the current value at the read pointer. */ public long currentValue() { - return data[read]; + return data.get(read); } /** Advances the read pointer and returns true if there are more values to read. 
*/ @@ -79,9 +95,9 @@ public class LongQueryBuffer { */ public boolean retainAndAdvance() { if (read != write) { - long tmp = data[write]; - data[write] = data[read]; - data[read] = tmp; + long tmp = data.get(write); + data.set(write, data.get(read)); + data.set(read, tmp); } write++; @@ -117,9 +133,10 @@ public class LongQueryBuffer { write = 0; } - public void startFilterForRange(int pos, int end) { - read = write = pos; - this.end = end; + public void finalizeFiltering(int pos) { + end = write; + read = pos; + write = pos; } /** Retain only unique values in the buffer, and update the end pointer to the new length. @@ -153,7 +170,7 @@ public class LongQueryBuffer { "read = " + read + ",write = " + write + ",end = " + end + - ",data = [" + Arrays.toString(Arrays.copyOf(data, end)) + "]]"; + ",data = [" + Arrays.toString(copyData()) + "]]"; } diff --git a/code/libraries/array/test/nu/marginalia/array/algo/LongArraySearchTest.java b/code/libraries/array/test/nu/marginalia/array/algo/LongArraySearchTest.java index a515917b..fa50045e 100644 --- a/code/libraries/array/test/nu/marginalia/array/algo/LongArraySearchTest.java +++ b/code/libraries/array/test/nu/marginalia/array/algo/LongArraySearchTest.java @@ -143,7 +143,7 @@ class LongArraySearchTest { assertEquals(43, buffer.size()); for (int i = 0; i < 43; i++) { - assertEquals(buffer.data[i], i*3); + assertEquals(buffer.data.get(i), i*3); } } @@ -160,7 +160,7 @@ class LongArraySearchTest { int j = 0; for (int i = 0; i < 43; i++) { if (++j % 3 == 0) j++; - assertEquals(buffer.data[i], j); + assertEquals(buffer.data.get(i), j); } } } \ No newline at end of file diff --git a/code/libraries/btree/java/nu/marginalia/btree/BTreeReader.java b/code/libraries/btree/java/nu/marginalia/btree/BTreeReader.java index 048e0301..bc40bb43 100644 --- a/code/libraries/btree/java/nu/marginalia/btree/BTreeReader.java +++ b/code/libraries/btree/java/nu/marginalia/btree/BTreeReader.java @@ -109,8 +109,8 @@ public class BTreeReader { return 
ip.findData(key); } - public void readData(long[] buf, int n, long pos) { - data.get(pos, pos + n, buf); + public void readData(LongArray buf, int n, long pos) { + data.get(pos, pos + n, buf, 0); } /** Used for querying interlaced data in the btree. diff --git a/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithIndexTest.java b/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithIndexTest.java index 8b65753d..be24de10 100644 --- a/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithIndexTest.java +++ b/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithIndexTest.java @@ -32,7 +32,8 @@ public class BTreeReaderRejectRetainWithIndexTest { @Test public void testRetain() { LongQueryBuffer odds = new LongQueryBuffer(50); - Arrays.setAll(odds.data, i -> 2L*i + 1); + for (int i = 0; i < 50; i++) + odds.data.set(i, 2L*i + 1); BTreeReader reader = new BTreeReader(array, ctx, 0); reader.retainEntries(odds); @@ -46,7 +47,8 @@ public class BTreeReaderRejectRetainWithIndexTest { @Test public void testReject() { LongQueryBuffer odds = new LongQueryBuffer(50); - Arrays.setAll(odds.data, i -> 2L*i + 1); + for (int i = 0; i < 50; i++) + odds.data.set(i, 2L*i + 1); BTreeReader reader = new BTreeReader(array, ctx, 0); reader.rejectEntries(odds); diff --git a/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithoutIndexTest.java b/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithoutIndexTest.java index e5d4dc79..fc3b71df 100644 --- a/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithoutIndexTest.java +++ b/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithoutIndexTest.java @@ -32,7 +32,8 @@ public class BTreeReaderRejectRetainWithoutIndexTest { @Test public void testRetain() { LongQueryBuffer odds = new LongQueryBuffer(50); - Arrays.setAll(odds.data, i -> 2L*i + 1); + for (int i = 0; i < 50; i++) + 
odds.data.set(i, 2L*i + 1); BTreeReader reader = new BTreeReader(array, ctx, 0); reader.retainEntries(odds); @@ -46,7 +47,9 @@ public class BTreeReaderRejectRetainWithoutIndexTest { @Test public void testReject() { LongQueryBuffer odds = new LongQueryBuffer(50); - Arrays.setAll(odds.data, i -> 2L*i + 1); + for (int i = 0; i < 50; i++) + odds.data.set(i, 2L*i + 1); + BTreeReader reader = new BTreeReader(array, ctx, 0); reader.rejectEntries(odds); diff --git a/code/services-application/search-service/java/nu/marginalia/search/SearchQueryParamFactory.java b/code/services-application/search-service/java/nu/marginalia/search/SearchQueryParamFactory.java index 15c8567e..cc28b209 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/SearchQueryParamFactory.java +++ b/code/services-application/search-service/java/nu/marginalia/search/SearchQueryParamFactory.java @@ -1,7 +1,7 @@ package nu.marginalia.search; import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.QueryStrategy; @@ -14,7 +14,7 @@ import java.util.List; public class SearchQueryParamFactory { public QueryParams forRegularSearch(SearchParameters userParams) { - SearchSubquery prototype = new SearchSubquery(); + SearchQuery prototype = new SearchQuery(); var profile = userParams.profile(); profile.addTacitTerms(prototype); diff --git a/code/services-application/search-service/java/nu/marginalia/search/command/SearchAdtechParameter.java b/code/services-application/search-service/java/nu/marginalia/search/command/SearchAdtechParameter.java index 9e8383f3..ce3bf099 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/command/SearchAdtechParameter.java +++ 
b/code/services-application/search-service/java/nu/marginalia/search/command/SearchAdtechParameter.java @@ -1,6 +1,6 @@ package nu.marginalia.search.command; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import javax.annotation.Nullable; import java.util.Arrays; @@ -23,7 +23,7 @@ public enum SearchAdtechParameter { return DEFAULT; } - public void addTacitTerms(SearchSubquery subquery) { + public void addTacitTerms(SearchQuery subquery) { subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms)); } } diff --git a/code/services-application/search-service/java/nu/marginalia/search/command/SearchJsParameter.java b/code/services-application/search-service/java/nu/marginalia/search/command/SearchJsParameter.java index 6c8634ac..8cf6aada 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/command/SearchJsParameter.java +++ b/code/services-application/search-service/java/nu/marginalia/search/command/SearchJsParameter.java @@ -1,6 +1,6 @@ package nu.marginalia.search.command; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import javax.annotation.Nullable; import java.util.Arrays; @@ -25,7 +25,7 @@ public enum SearchJsParameter { return DEFAULT; } - public void addTacitTerms(SearchSubquery subquery) { + public void addTacitTerms(SearchQuery subquery) { subquery.searchTermsExclude.addAll(Arrays.asList(implictExcludeSearchTerms)); } } diff --git a/code/services-application/search-service/java/nu/marginalia/search/model/SearchProfile.java b/code/services-application/search-service/java/nu/marginalia/search/model/SearchProfile.java index 27d9f4aa..955c3fcb 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/model/SearchProfile.java +++ b/code/services-application/search-service/java/nu/marginalia/search/model/SearchProfile.java @@ -2,7 +2,7 @@ 
package nu.marginalia.search.model; import nu.marginalia.index.query.limit.SpecificationLimit; import nu.marginalia.model.crawl.HtmlFeature; -import nu.marginalia.api.searchquery.model.query.SearchSubquery; +import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier; import java.util.Objects; @@ -47,7 +47,7 @@ public enum SearchProfile { return NO_FILTER; } - public void addTacitTerms(SearchSubquery subquery) { + public void addTacitTerms(SearchQuery subquery) { if (this == ACADEMIA) { subquery.searchTermsAdvice.add("special:academia"); } From e3316a3672b3dc494008121bbf7f4471d4c53387 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 5 Apr 2024 13:30:49 +0200 Subject: [PATCH 20/90] (index) Clean up new index query code --- .../model/compiled/CompiledQueryLong.java | 8 + .../model/compiled/CqDataLong.java | 4 + .../aggregate/CompiledQueryAggregates.java | 1 + .../index/index/QueryBranchWalker.java | 74 ++++++--- .../marginalia/index/index/StatefulIndex.java | 153 +++++++++--------- .../marginalia/index/model/SearchTerms.java | 81 ++-------- .../index/query/filter/QueryFilterAllOf.java | 18 ++- .../index/query/filter/QueryFilterAnyOf.java | 41 ++++- .../array/buffer/LongQueryBuffer.java | 6 - 9 files changed, 208 insertions(+), 178 deletions(-) diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java index 639778dc..94fa0e8b 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java @@ -39,4 +39,12 @@ public class CompiledQueryLong implements Iterable { public Iterator iterator() { return stream().iterator(); } + + public long[] copyData() { + return 
data.copyData(); + } + + public boolean isEmpty() { + return data.size() == 0; + } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataLong.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataLong.java index 8049631e..24f76b13 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataLong.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataLong.java @@ -24,4 +24,8 @@ public class CqDataLong { public int size() { return data.length; } + + public long[] copyData() { + return Arrays.copyOf(data, data.length); + } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java index 209acbee..9c4abe72 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java @@ -8,6 +8,7 @@ import java.util.ArrayList; import java.util.List; import java.util.function.*; +/** Contains methods for aggregating across a CompiledQuery or CompiledQueryLong */ public class CompiledQueryAggregates { /** Compiled query aggregate that for a single boolean that treats or-branches as logical OR, * and and-branches as logical AND operations. 
Will return true if there exists a path through diff --git a/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java b/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java index a465bd86..34b04f0a 100644 --- a/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java +++ b/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java @@ -1,13 +1,18 @@ package nu.marginalia.index.index; import it.unimi.dsi.fastutil.longs.LongArrayList; +import it.unimi.dsi.fastutil.longs.LongArraySet; import it.unimi.dsi.fastutil.longs.LongSet; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.LinkedList; import java.util.List; -class QueryBranchWalker { +/** Helper class for index query construction */ +public class QueryBranchWalker { + private static final Logger logger = LoggerFactory.getLogger(QueryBranchWalker.class); public final long[] priorityOrder; public final List paths; public final long termId; @@ -22,56 +27,81 @@ class QueryBranchWalker { return priorityOrder.length == 0; } + /** Group the provided paths by the lowest termId they contain per the provided priorityOrder, + * into a list of QueryBranchWalkers. This can be performed iteratively on the resultant QBW:s + * to traverse the tree via the next() method. + *

    + * The paths can be extracted through the {@link nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates CompiledQueryAggregates} + * queriesAggregate method. + */ public static List create(long[] priorityOrder, List paths) { + if (paths.isEmpty()) + return List.of(); List ret = new ArrayList<>(); List remainingPaths = new LinkedList<>(paths); - remainingPaths.removeIf(LongSet::isEmpty); + List pathsForPrio = new ArrayList<>(); + for (int i = 0; i < priorityOrder.length; i++) { - long prio = priorityOrder[i]; + long termId = priorityOrder[i]; var it = remainingPaths.iterator(); - List pathsForPrio = new ArrayList<>(); while (it.hasNext()) { var path = it.next(); - if (path.contains(prio)) { - path.remove(prio); + if (path.contains(termId)) { + // Remove the current termId from the path + path.remove(termId); + + // Add it to the set of paths associated with the termId pathsForPrio.add(path); + + // Remove it from consideration it.remove(); } } if (!pathsForPrio.isEmpty()) { - LongArrayList remainingPrios = new LongArrayList(pathsForPrio.size()); - - for (var p : priorityOrder) { - for (var path : pathsForPrio) { - if (path.contains(p)) { - remainingPrios.add(p); - break; - } - } - } - - ret.add(new QueryBranchWalker(remainingPrios.elements(), pathsForPrio, prio)); + long[] newPrios = keepRelevantPriorities(priorityOrder, pathsForPrio); + ret.add(new QueryBranchWalker(newPrios, new ArrayList<>(pathsForPrio), termId)); + pathsForPrio.clear(); } } + // This happens if the priorityOrder array doesn't contain all items in the paths, + // in practice only when an index doesn't contain all the search terms, so we can just + // skip those paths if (!remainingPaths.isEmpty()) { - System.out.println("Dropping: " + remainingPaths); + logger.info("Dropping: {}", remainingPaths); } return ret; } - public List next() { - if (atEnd()) - return List.of(); + /** From the provided priorityOrder array, keep the elements that are present in any set in paths 
*/ + private static long[] keepRelevantPriorities(long[] priorityOrder, List paths) { + LongArrayList remainingPrios = new LongArrayList(paths.size()); + // these sets are typically very small so array set is a good choice + LongSet allElements = new LongArraySet(priorityOrder.length); + for (var path : paths) { + allElements.addAll(path); + } + + for (var p : priorityOrder) { + if (allElements.contains(p)) + remainingPrios.add(p); + } + + return remainingPrios.elements(); + } + + /** Convenience method that applies the create() method + * to the priority order and paths associated with this instance */ + public List next() { return create(priorityOrder, paths); } diff --git a/code/index/java/nu/marginalia/index/index/StatefulIndex.java b/code/index/java/nu/marginalia/index/index/StatefulIndex.java index 0f55c0c8..273da2d0 100644 --- a/code/index/java/nu/marginalia/index/index/StatefulIndex.java +++ b/code/index/java/nu/marginalia/index/index/StatefulIndex.java @@ -4,7 +4,6 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import it.unimi.dsi.fastutil.longs.LongSet; -import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; import nu.marginalia.index.query.filter.QueryFilterAllOf; import nu.marginalia.index.query.filter.QueryFilterAnyOf; @@ -25,9 +24,7 @@ import java.util.*; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; -import java.util.function.LongFunction; import java.util.function.Predicate; -import java.util.stream.Collectors; /** This class delegates SearchIndexReader and deals with the stateful nature of the index, * i.e. it may be possible to reconstruct the index and load a new set of data. 
@@ -95,7 +92,6 @@ public class StatefulIndex { logger.error("Uncaught exception", ex); } finally { - lock.unlock(); } @@ -113,62 +109,6 @@ public class StatefulIndex { return combinedIndexReader != null && combinedIndexReader.isLoaded(); } - private Predicate containsOnly(long[] permitted) { - LongSet permittedTerms = new LongOpenHashSet(permitted); - return permittedTerms::containsAll; - } - - private List createBuilders(CompiledQueryLong query, - LongFunction builderFactory, - long[] termPriority) { - List paths = CompiledQueryAggregates.queriesAggregate(query); - - // Remove any paths that do not contain all prioritized terms, as this means - // the term is missing from the index and can never be found - paths.removeIf(containsOnly(termPriority).negate()); - - List helpers = QueryBranchWalker.create(termPriority, paths); - List builders = new ArrayList<>(); - - for (var helper : helpers) { - var builder = builderFactory.apply(helper.termId); - - builders.add(builder); - - if (helper.atEnd()) - continue; - - var filters = helper.next().stream() - .map(this::createFilter) - .toList(); - - builder.addInclusionFilterAny(filters); - } - - return builders; - } - - private QueryFilterStepIf createFilter(QueryBranchWalker helper) { - var selfCondition = combinedIndexReader.hasWordFull(helper.termId); - if (helper.atEnd()) - return selfCondition; - - var nextSteps = helper.next(); - var nextFilters = nextSteps.stream() - .map(this::createFilter) - .map(filter -> new QueryFilterAllOf(List.of(selfCondition, filter))) - .collect(Collectors.toList()); - - if (nextFilters.isEmpty()) - return selfCondition; - - if (nextFilters.size() == 1) - return nextFilters.getFirst(); - - - return new QueryFilterAnyOf(nextFilters); - } - public List createQueries(SearchTerms terms, QueryParams params) { if (!isLoaded()) { @@ -176,29 +116,99 @@ public class StatefulIndex { return Collections.emptyList(); } - final long[] orderedIncludes = terms.sortedDistinctIncludes(this::compareKeywords); 
- final long[] orderedIncludesPrio = terms.sortedDistinctIncludes(this::compareKeywordsPrio); - List queryHeads = new ArrayList<>(10); - queryHeads.addAll(createBuilders(terms.compiledQuery(), combinedIndexReader::findFullWord, orderedIncludes)); - queryHeads.addAll(createBuilders(terms.compiledQuery(), combinedIndexReader::findPriorityWord, orderedIncludesPrio)); + final long[] termPriority = terms.sortedDistinctIncludes(this::compareKeywords); + List paths = CompiledQueryAggregates.queriesAggregate(terms.compiledQuery()); - List queries = new ArrayList<>(10); + // Remove any paths that do not contain all prioritized terms, as this means + // the term is missing from the index and can never be found + paths.removeIf(containsAll(termPriority).negate()); + List helpers = QueryBranchWalker.create(termPriority, paths); + + for (var helper : helpers) { + for (var builder : List.of( + combinedIndexReader.findPriorityWord(helper.termId), + combinedIndexReader.findFullWord(helper.termId) + )) + { + queryHeads.add(builder); + + if (helper.atEnd()) + continue; + + List filterSteps = new ArrayList<>(); + for (var step : helper.next()) { + filterSteps.add(createFilter(step, 0)); + } + builder.addInclusionFilterAny(filterSteps); + } + } + + List ret = new ArrayList<>(10); + + // Add additional conditions to the query heads for (var query : queryHeads) { + // Advice terms are a special case, mandatory but not ranked, and exempt from re-writing + for (long term : terms.advice()) { + query = query.alsoFull(term); + } + for (long term : terms.excludes()) { query = query.notFull(term); } // Run these filter steps last, as they'll worst-case cause as many page faults as there are // items in the buffer - queries.add(query.addInclusionFilter(combinedIndexReader.filterForParams(params)).build()); + ret.add(query.addInclusionFilter(combinedIndexReader.filterForParams(params)).build()); } - return queries; + return ret; + } + + /** Recursively create a filter step based on the QBW and 
its children */ + private QueryFilterStepIf createFilter(QueryBranchWalker walker, int depth) { + final QueryFilterStepIf ownFilterCondition = ownFilterCondition(walker, depth); + + var childSteps = walker.next(); + + if (childSteps.isEmpty()) + return ownFilterCondition; + + List combinedFilters = new ArrayList<>(); + + for (var step : childSteps) { + // Recursion will be limited to a fairly shallow stack depth due to how the queries are constructed. + var childFilter = createFilter(step, depth+1); + combinedFilters.add(new QueryFilterAllOf(ownFilterCondition, childFilter)); + } + + if (combinedFilters.size() == 1) + return combinedFilters.getFirst(); + else + return new QueryFilterAnyOf(combinedFilters); + } + + /** Create a filter condition based on the termId associated with the QBW */ + private QueryFilterStepIf ownFilterCondition(QueryBranchWalker walker, int depth) { + if (depth < 2) { + // At shallow depths we prioritize terms that appear in the priority index, + // to increase the odds we find "good" results before the sand runs out + return new QueryFilterAnyOf( + combinedIndexReader.hasWordPrio(walker.termId), + combinedIndexReader.hasWordFull(walker.termId) + ); + } else { + return combinedIndexReader.hasWordFull(walker.termId); + } + } + + private Predicate containsAll(long[] permitted) { + LongSet permittedTerms = new LongOpenHashSet(permitted); + return permittedTerms::containsAll; } private int compareKeywords(long a, long b) { @@ -208,13 +218,6 @@ public class StatefulIndex { ); } - private int compareKeywordsPrio(long a, long b) { - return Long.compare( - combinedIndexReader.numHitsPrio(a), - combinedIndexReader.numHitsPrio(b) - ); - } - /** Return an array of encoded document metadata longs corresponding to the * document identifiers provided; with metadata for termId. The input array * docs[] *must* be sorted. 
diff --git a/code/index/java/nu/marginalia/index/model/SearchTerms.java b/code/index/java/nu/marginalia/index/model/SearchTerms.java index 307e4179..8115c109 100644 --- a/code/index/java/nu/marginalia/index/model/SearchTerms.java +++ b/code/index/java/nu/marginalia/index/model/SearchTerms.java @@ -3,54 +3,35 @@ package nu.marginalia.index.model; import it.unimi.dsi.fastutil.longs.LongArrayList; import it.unimi.dsi.fastutil.longs.LongComparator; import it.unimi.dsi.fastutil.longs.LongList; -import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.query.SearchQuery; import java.util.ArrayList; import java.util.List; -import java.util.Objects; import static nu.marginalia.index.model.SearchTermsUtil.getWordId; public final class SearchTerms { - private final LongList includes; + private final LongList advice; private final LongList excludes; private final LongList priority; private final List coherences; private final CompiledQueryLong compiledQueryIds; - public SearchTerms( - LongList includes, - LongList excludes, - LongList priority, - List coherences, - CompiledQueryLong compiledQueryIds - ) { - this.includes = includes; - this.excludes = excludes; - this.priority = priority; - this.coherences = coherences; + public SearchTerms(SearchQuery query, + CompiledQueryLong compiledQueryIds) + { + this.excludes = new LongArrayList(); + this.priority = new LongArrayList(); + this.coherences = new ArrayList<>(); + this.advice = new LongArrayList(); this.compiledQueryIds = compiledQueryIds; - } - public SearchTerms(SearchQuery query, CompiledQueryLong compiledQueryIds) { - this(new LongArrayList(), - new LongArrayList(), - new LongArrayList(), - new ArrayList<>(), - compiledQueryIds); - - for (var word : query.searchTermsInclude) { - includes.add(getWordId(word)); - } for (var word : query.searchTermsAdvice) { - // This looks like a bug, but it's not - 
includes.add(getWordId(word)); + advice.add(getWordId(word)); } - for (var coherence : query.searchTermCoherences) { LongList parts = new LongArrayList(coherence.size()); @@ -64,36 +45,29 @@ public final class SearchTerms { for (var word : query.searchTermsExclude) { excludes.add(getWordId(word)); } + for (var word : query.searchTermsPriority) { priority.add(getWordId(word)); } } public boolean isEmpty() { - return includes.isEmpty(); + return compiledQueryIds.isEmpty(); } public long[] sortedDistinctIncludes(LongComparator comparator) { - if (includes.isEmpty()) - return includes.toLongArray(); - - LongList list = new LongArrayList(new LongOpenHashSet(includes)); + LongList list = new LongArrayList(compiledQueryIds.copyData()); list.sort(comparator); return list.toLongArray(); } - public int size() { - return includes.size() + excludes.size() + priority.size(); - } - - public LongList includes() { - return includes; - } public LongList excludes() { return excludes; } - + public LongList advice() { + return advice; + } public LongList priority() { return priority; } @@ -104,29 +78,4 @@ public final class SearchTerms { public CompiledQueryLong compiledQuery() { return compiledQueryIds; } - @Override - public boolean equals(Object obj) { - if (obj == this) return true; - if (obj == null || obj.getClass() != this.getClass()) return false; - var that = (SearchTerms) obj; - return Objects.equals(this.includes, that.includes) && - Objects.equals(this.excludes, that.excludes) && - Objects.equals(this.priority, that.priority) && - Objects.equals(this.coherences, that.coherences); - } - - @Override - public int hashCode() { - return Objects.hash(includes, excludes, priority, coherences); - } - - @Override - public String toString() { - return "SearchTerms[" + - "includes=" + includes + ", " + - "excludes=" + excludes + ", " + - "priority=" + priority + ", " + - "coherences=" + coherences + ']'; - } - } diff --git 
a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAllOf.java b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAllOf.java index 8c20fe98..e9725179 100644 --- a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAllOf.java +++ b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAllOf.java @@ -2,14 +2,28 @@ package nu.marginalia.index.query.filter; import nu.marginalia.array.buffer.LongQueryBuffer; +import java.util.ArrayList; import java.util.List; import java.util.StringJoiner; public class QueryFilterAllOf implements QueryFilterStepIf { - private final List steps; + private final List steps; public QueryFilterAllOf(List steps) { - this.steps = steps; + this.steps = new ArrayList<>(steps.size()); + + for (var step : steps) { + if (step instanceof QueryFilterAllOf allOf) { + this.steps.addAll(allOf.steps); + } + else { + this.steps.add(step); + } + } + } + + public QueryFilterAllOf(QueryFilterStepIf... steps) { + this(List.of(steps)); } public double cost() { diff --git a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAnyOf.java b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAnyOf.java index 2d177645..bea62194 100644 --- a/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAnyOf.java +++ b/code/index/query/java/nu/marginalia/index/query/filter/QueryFilterAnyOf.java @@ -2,14 +2,27 @@ package nu.marginalia.index.query.filter; import nu.marginalia.array.buffer.LongQueryBuffer; +import java.util.ArrayList; import java.util.List; import java.util.StringJoiner; public class QueryFilterAnyOf implements QueryFilterStepIf { - private final List steps; + private final List steps; public QueryFilterAnyOf(List steps) { - this.steps = steps; + this.steps = new ArrayList<>(steps.size()); + + for (var step : steps) { + if (step instanceof QueryFilterAnyOf anyOf) { + this.steps.addAll(anyOf.steps); + } else { + this.steps.add(step); + } + } + } + + public 
QueryFilterAnyOf(QueryFilterStepIf... steps) { + this(List.of(steps)); } public double cost() { @@ -30,23 +43,37 @@ public class QueryFilterAnyOf implements QueryFilterStepIf { if (steps.isEmpty()) return; + if (steps.size() == 1) { + steps.getFirst().apply(buffer); + return; + } + int start = 0; - int end = buffer.end; + final int endOfValidData = buffer.end; // End of valid data range + + // The filters act as a partitioning function, where anything before buffer.end + // is "in", and is guaranteed to be sorted; and anything after buffer.end is "out" + // but no sorting guarantee is provided. + + // To provide a conditional filter, we re-sort the "out" range, slice it and apply filtering to the slice for (var step : steps) { - var slice = buffer.slice(start, end); + var slice = buffer.slice(start, endOfValidData); slice.data.quickSort(0, slice.size()); step.apply(slice); start += slice.end; } - buffer.data.quickSort(0, start); - - // Special finalization + // After we're done, read and write pointers should be 0 and "end" should be the length of valid data, + // normally done through buffer.finalizeFiltering(); but that won't work here buffer.reset(); buffer.end = start; + + // After all filters have been applied, we must re-sort all the retained data + // to uphold the sortedness contract + buffer.data.quickSort(0, buffer.end); } public String describe() { diff --git a/code/libraries/array/java/nu/marginalia/array/buffer/LongQueryBuffer.java b/code/libraries/array/java/nu/marginalia/array/buffer/LongQueryBuffer.java index d5b44389..a0312d36 100644 --- a/code/libraries/array/java/nu/marginalia/array/buffer/LongQueryBuffer.java +++ b/code/libraries/array/java/nu/marginalia/array/buffer/LongQueryBuffer.java @@ -133,12 +133,6 @@ public class LongQueryBuffer { write = 0; } - public void finalizeFiltering(int pos) { - end = write; - read = pos; - write = pos; - } - /** Retain only unique values in the buffer, and update the end pointer to the new length. *

    * The buffer is assumed to be sorted up until the end pointer. From adc90c8f1ecb6945963dd63b4a1d119895123ece Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 5 Apr 2024 18:52:58 +0200 Subject: [PATCH 21/90] (sentence-extractor) Fix resource leak in sentence extractor The code would always re-initialize the static ngramLexicon and rdrposTagger fields with new instances even if they were already instantiated, leading to a ton of unnecessary RAM allocation. The modified behavior checks for nullity before creating a new instance. --- .../language/sentence/SentenceExtractor.java | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java index fd15660f..bb1e3771 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java @@ -60,13 +60,16 @@ public class SentenceExtractor { } synchronized (this) { - ngramLexicon = new NgramLexicon(models); - - try { - rdrposTagger = new RDRPOSTagger(models.posDict, models.posRules); + if (ngramLexicon == null) { + ngramLexicon = new NgramLexicon(models); } - catch (Exception ex) { - throw new IllegalStateException(ex); + + if (rdrposTagger == null) { + try { + rdrposTagger = new RDRPOSTagger(models.posDict, models.posRules); + } catch (Exception ex) { + throw new IllegalStateException(ex); + } } } From 7e216db463ebfb7167478b63c1b17e2a627abe83 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 6 Apr 2024 13:28:14 +0200 Subject: [PATCH 22/90] (index) Add origin trace information for index readers This used to be supported by the system but got lost in refactoring at some point. 
--- .../nu/marginalia/index/ReverseIndexEntrySource.java | 7 +++++-- .../java/nu/marginalia/index/ReverseIndexReader.java | 9 ++++++--- .../test/nu/marginalia/index/ReverseIndexReaderTest.java | 2 +- code/index/java/nu/marginalia/index/IndexFactory.java | 4 ++-- 4 files changed, 14 insertions(+), 8 deletions(-) diff --git a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexEntrySource.java b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexEntrySource.java index 7c12563b..851bf9ab 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexEntrySource.java +++ b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexEntrySource.java @@ -7,6 +7,7 @@ import nu.marginalia.index.query.EntrySource; import static java.lang.Math.min; public class ReverseIndexEntrySource implements EntrySource { + private final String name; private final BTreeReader reader; int pos; @@ -15,9 +16,11 @@ public class ReverseIndexEntrySource implements EntrySource { final int entrySize; private final long wordId; - public ReverseIndexEntrySource(BTreeReader reader, + public ReverseIndexEntrySource(String name, + BTreeReader reader, int entrySize, long wordId) { + this.name = name; this.reader = reader; this.entrySize = entrySize; this.wordId = wordId; @@ -60,6 +63,6 @@ public class ReverseIndexEntrySource implements EntrySource { @Override public String indexName() { - return "Full:" + Long.toHexString(wordId); + return name + ":" + Long.toHexString(wordId); } } diff --git a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java index f37420dd..e37de80d 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java +++ b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java @@ -25,8 +25,11 @@ public class ReverseIndexReader { private final long wordsDataOffset; private final Logger logger = 
LoggerFactory.getLogger(getClass()); private final BTreeReader wordsBTreeReader; + private final String name; + + public ReverseIndexReader(String name, Path words, Path documents) throws IOException { + this.name = name; - public ReverseIndexReader(Path words, Path documents) throws IOException { if (!Files.exists(words) || !Files.exists(documents)) { this.words = null; this.documents = null; @@ -84,7 +87,7 @@ public class ReverseIndexReader { if (offset < 0) return new EmptyEntrySource(); - return new ReverseIndexEntrySource(createReaderNew(offset), 2, wordId); + return new ReverseIndexEntrySource(name, createReaderNew(offset), 2, wordId); } public QueryFilterStepIf also(long wordId) { @@ -92,7 +95,7 @@ public class ReverseIndexReader { if (offset < 0) return new QueryFilterNoPass(); - return new ReverseIndexRetainFilter(createReaderNew(offset), "full", wordId); + return new ReverseIndexRetainFilter(createReaderNew(offset), name, wordId); } public QueryFilterStepIf not(long wordId) { diff --git a/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java b/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java index e6b76249..ed8b4193 100644 --- a/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java +++ b/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java @@ -102,7 +102,7 @@ class ReverseIndexReaderTest { preindex.finalizeIndex(docsFile, wordsFile); preindex.delete(); - return new ReverseIndexReader(wordsFile, docsFile); + return new ReverseIndexReader("test", wordsFile, docsFile); } } \ No newline at end of file diff --git a/code/index/java/nu/marginalia/index/IndexFactory.java b/code/index/java/nu/marginalia/index/IndexFactory.java index 48911546..a1d2f5a5 100644 --- a/code/index/java/nu/marginalia/index/IndexFactory.java +++ b/code/index/java/nu/marginalia/index/IndexFactory.java @@ -41,14 +41,14 @@ public class IndexFactory { public ReverseIndexReader 
getReverseIndexReader() throws IOException { - return new ReverseIndexReader( + return new ReverseIndexReader("full", ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.CURRENT), ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.CURRENT) ); } public ReverseIndexReader getReverseIndexPrioReader() throws IOException { - return new ReverseIndexReader( + return new ReverseIndexReader("prio", ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.CURRENT), ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.CURRENT) ); From 6cba6aef3bbd2aed9da13a87cd25929c9b4cca42 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 6 Apr 2024 14:34:15 +0200 Subject: [PATCH 23/90] (minor) Remove dead code --- .../ranking/results/ResultKeywordSet.java | 26 ----------- .../ranking/results/ResultValuator.java | 45 ------------------- .../search/model/ClusteredUrlDetails.java | 2 +- 3 files changed, 1 insertion(+), 72 deletions(-) delete mode 100644 code/index/java/nu/marginalia/ranking/results/ResultKeywordSet.java diff --git a/code/index/java/nu/marginalia/ranking/results/ResultKeywordSet.java b/code/index/java/nu/marginalia/ranking/results/ResultKeywordSet.java deleted file mode 100644 index 19405dcb..00000000 --- a/code/index/java/nu/marginalia/ranking/results/ResultKeywordSet.java +++ /dev/null @@ -1,26 +0,0 @@ -package nu.marginalia.ranking.results; - - -import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; - -import java.util.List; - -public record ResultKeywordSet(List keywords) { - - public int length() { - return keywords.size(); - } - public boolean isEmpty() { return length() == 0; } - public boolean hasNgram() { - for (var word : 
keywords) { - if (word.keyword.contains("_")) { - return true; - } - } - return false; - } - @Override - public String toString() { - return "%s[%s]".formatted(getClass().getSimpleName(), keywords); - } -} diff --git a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java index 05ff83d2..862978c9 100644 --- a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java +++ b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java @@ -147,51 +147,6 @@ public class ResultValuator { return (int) -penalty; } - private long documentMetadata(List rawScores) { - for (var score : rawScores) { - return score.encodedDocMetadata(); - } - return 0; - } - - private int htmlFeatures(List rawScores) { - for (var score : rawScores) { - return score.htmlFeatures(); - } - return 0; - } - - private ResultKeywordSet createKeywordSet(List rawScores, - int thisSet) - { - List scoresList = new ArrayList<>(); - - for (var score : rawScores) { - if (score.subquery != thisSet) - continue; - - // Don't consider synthetic keywords for ranking, these are keywords that don't - // have counts. E.g. 
"tld:edu" - if (score.isKeywordSpecial()) - continue; - - scoresList.add(score); - } - - return new ResultKeywordSet(scoresList); - - } - - private int numberOfSets(List scores) { - int maxSet = 0; - - for (var score : scores) { - maxSet = Math.max(maxSet, score.subquery); - } - - return 1 + maxSet; - } - public static double normalize(double value, double penalty) { if (value < 0) value = 0; diff --git a/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java b/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java index 6dd7390d..6abe7cd1 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java +++ b/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java @@ -46,7 +46,7 @@ public class ClusteredUrlDetails implements Comparable { return urlDetails.resultItem.keywordScores.stream() .filter(score -> !score.keyword.contains(":")) .collect(Collectors.toMap( - score -> score.subquery, + score -> -1, // FIXME score -> score.hasTermFlag(WordFlags.Title) | score.hasTermFlag(WordFlags.ExternalLink) | score.hasTermFlag(WordFlags.UrlDomain) From 4fb86ac6927130a42e08ff2cb19a76c19558d9da Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 7 Apr 2024 11:24:30 +0200 Subject: [PATCH 24/90] (search) Fix outdated assumptions about the results We no longer break the query into "sets" of search terms and need to adapt the code to not use this assumption. For the API service, we'll simulate the old behavior to keep the API stable. For the search service, we'll introduce a new way of calculating positions through tree aggregation. 
--- .../api/searchquery/QueryProtobufCodec.java | 2 + .../aggregate/CompiledQueryAggregates.java | 5 ++ .../aggregate/CqPositionsOperator.java | 79 +++++++++++++++++++ .../results/DecoratedSearchResultItem.java | 3 + .../results/SearchResultKeywordScore.java | 12 +-- .../api/src/main/protobuf/query-api.proto | 1 + .../nu/marginalia/index/IndexGrpcService.java | 1 + .../results/IndexResultValuatorService.java | 21 +++++ .../nu/marginalia/api/ApiSearchOperator.java | 26 +++--- .../search/model/ClusteredUrlDetails.java | 63 +++++++-------- .../search/svc/SearchQueryIndexService.java | 25 +----- 11 files changed, 165 insertions(+), 73 deletions(-) create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java index f0113870..b705917e 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java @@ -121,6 +121,7 @@ public class QueryProtobufCodec { results.getPubYear(), // ??, results.getDataHash(), results.getWordsTotal(), + results.getBestPositions(), results.getRankingScore() ); } @@ -202,6 +203,7 @@ public class QueryProtobufCodec { rpcDecoratedResultItem.getPubYear(), rpcDecoratedResultItem.getDataHash(), rpcDecoratedResultItem.getWordsTotal(), + rpcDecoratedResultItem.getBestPositions(), rpcDecoratedResultItem.getRankingScore() ); } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java index 9c4abe72..0ab0647d 100644 --- 
a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java @@ -44,4 +44,9 @@ public class CompiledQueryAggregates { public static List queriesAggregate(CompiledQueryLong query) { return new ArrayList<>(query.root().visit(new CqQueryPathsOperator(query))); } + + /** Using the bitwise AND operator, aggregate all possible combined values of the long generated by the provided operator */ + public static LongSet positionsAggregate(CompiledQuery query, ToLongFunction operator) { + return query.root().visit(new CqPositionsOperator(query, operator)); + } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java new file mode 100644 index 00000000..19db2d4b --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java @@ -0,0 +1,79 @@ +package nu.marginalia.api.searchquery.model.compiled.aggregate; + +import it.unimi.dsi.fastutil.longs.LongArraySet; +import it.unimi.dsi.fastutil.longs.LongOpenHashSet; +import it.unimi.dsi.fastutil.longs.LongSet; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CqExpression; + +import java.util.List; +import java.util.function.IntToLongFunction; +import java.util.function.ToLongFunction; + +public class CqPositionsOperator implements CqExpression.ObjectVisitor { + private final IntToLongFunction operator; + + public CqPositionsOperator(CompiledQuery query, ToLongFunction operator) { + this.operator = idx -> operator.applyAsLong(query.at(idx)); + } + + @Override + public LongSet onAnd(List parts) { + LongSet ret = new 
LongArraySet(); + + for (var part : parts) { + ret = comineSets(ret, part.visit(this)); + } + + return ret; + } + + private LongSet comineSets(LongSet a, LongSet b) { + if (a.isEmpty()) + return b; + if (b.isEmpty()) + return a; + + LongSet ret = newSet(a.size() * b.size()); + + var ai = a.longIterator(); + + while (ai.hasNext()) { + long aval = ai.nextLong(); + + var bi = b.longIterator(); + while (bi.hasNext()) { + ret.add(aval & bi.nextLong()); + } + } + + return ret; + } + + @Override + public LongSet onOr(List parts) { + LongSet ret = newSet(parts.size()); + + for (var part : parts) { + ret.addAll(part.visit(this)); + } + + return ret; + } + + @Override + public LongSet onLeaf(int idx) { + var set = newSet(1); + set.add(operator.applyAsLong(idx)); + return set; + } + + /** Allocate a new set suitable for a collection with the provided cardinality */ + private LongSet newSet(int cardinality) { + if (cardinality < 8) + return new LongArraySet(cardinality); + else + return new LongOpenHashSet(cardinality); + } + +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java index b099dc01..df48ea64 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java @@ -30,6 +30,7 @@ public class DecoratedSearchResultItem implements Comparable next.termScore <= scoreLimit) - .toList(); - } + this.rest.removeIf(urlDetail -> { + if (urlDetail.termScore > scoreLimit) + return false; + + for (var keywordScore : urlDetail.resultItem.keywordScores) { + if (keywordScore.isKeywordSpecial()) + continue; + if (keywordScore.positionCount() == 0) + continue; + + if (keywordScore.hasTermFlag(WordFlags.Title)) + return false; + if 
(keywordScore.hasTermFlag(WordFlags.ExternalLink)) + return false; + if (keywordScore.hasTermFlag(WordFlags.UrlDomain)) + return false; + if (keywordScore.hasTermFlag(WordFlags.UrlPath)) + return false; + if (keywordScore.hasTermFlag(WordFlags.Subjects)) + return false; + } + + return true; + }); } - private boolean isEligbleForInclusion(UrlDetails urlDetails) { - return urlDetails.resultItem.keywordScores.stream() - .filter(score -> !score.keyword.contains(":")) - .collect(Collectors.toMap( - score -> -1, // FIXME - score -> score.hasTermFlag(WordFlags.Title) - | score.hasTermFlag(WordFlags.ExternalLink) - | score.hasTermFlag(WordFlags.UrlDomain) - | score.hasTermFlag(WordFlags.UrlPath) - | score.hasTermFlag(WordFlags.Subjects) - , - (a, b) -> a && b - )) - .containsValue(Boolean.TRUE); - } public ClusteredUrlDetails(@NotNull UrlDetails onlyFirst) { this.first = onlyFirst; diff --git a/code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java b/code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java index 785c8952..6dc7b83b 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java +++ b/code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java @@ -88,7 +88,7 @@ public class SearchQueryIndexService { DomainIndexingState.ACTIVE, detail.rankingScore, // termScore detail.resultsFromDomain(), - getPositionsString(detail.rawIndexResult), + getPositionsString(detail), detail.rawIndexResult, detail.rawIndexResult.keywordScores )); @@ -97,27 +97,8 @@ public class SearchQueryIndexService { return ret; } - private String getPositionsString(SearchResultItem resultItem) { - Int2LongArrayMap positionsPerSet = new Int2LongArrayMap(8); - - for (var score : resultItem.keywordScores) { - if (!score.isKeywordRegular()) { - continue; - } - positionsPerSet.merge(score.subquery(), score.positions(), 
this::and); - } - - long bits = positionsPerSet.values().longStream().reduce(this::or).orElse(0); - - return BrailleBlockPunchCards.printBits(bits, 56); + private String getPositionsString(DecoratedSearchResultItem resultItem) { + return BrailleBlockPunchCards.printBits(resultItem.bestPositions, 56); } - - private long and(long a, long b) { - return a & b; - } - private long or(long a, long b) { - return a | b; - } - } From 491d6bec46a9b1a4b48d5f6e7add91dd935c97d3 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 10 Apr 2024 16:58:05 +0200 Subject: [PATCH 25/90] (term-freq-exporter) Extract ngrams in term-frequency-exporter --- .../java/nu/marginalia/extractor/TermFrequencyExporter.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java b/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java index df1e56a9..bdb7362a 100644 --- a/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java +++ b/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java @@ -127,6 +127,10 @@ public class TermFrequencyExporter implements ExporterIf { for (var word : sent) { words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8))); } + + for (var ngram : sent.ngramStemmed) { + words.add(longHash(ngram.getBytes())); + } } synchronized (counts) { From 6bfe04b609726bac327ad99795ce7a6b5533d44a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 10 Apr 2024 17:11:23 +0200 Subject: [PATCH 26/90] (term-freq-exporter) Reduce thread count and memory usage --- .../data-extractors/build.gradle | 1 + .../extractor/TermFrequencyExporter.java | 27 +++++++++---------- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/code/features-convert/data-extractors/build.gradle b/code/features-convert/data-extractors/build.gradle index 73aebd49..69ae1388 100644 --- 
a/code/features-convert/data-extractors/build.gradle +++ b/code/features-convert/data-extractors/build.gradle @@ -21,6 +21,7 @@ dependencies { implementation project(':code:common:model') implementation project(':code:libraries:language-processing') implementation project(':code:libraries:term-frequency-dict') + implementation project(':code:libraries:blocking-thread-pool') implementation project(':code:features-crawl:link-parser') implementation project(':code:features-convert:anchor-keywords') implementation project(':code:process-models:crawling-model') diff --git a/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java b/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java index bdb7362a..1e1a2cd5 100644 --- a/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java +++ b/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java @@ -14,6 +14,7 @@ import nu.marginalia.process.log.WorkLog; import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.model.FileStorage; import nu.marginalia.storage.model.FileStorageId; +import nu.marginalia.util.SimpleBlockingThreadPool; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.slf4j.Logger; @@ -53,27 +54,23 @@ public class TermFrequencyExporter implements ExporterIf { TLongIntHashMap counts = new TLongIntHashMap(100_000_000, 0.7f, -1, -1); AtomicInteger docCount = new AtomicInteger(); - try (ForkJoinPool fjp = new ForkJoinPool(Math.max(2, Runtime.getRuntime().availableProcessors() / 2))) { + SimpleBlockingThreadPool sjp = new SimpleBlockingThreadPool("exporter", Math.clamp(2, 16, Runtime.getRuntime().availableProcessors() / 2), 4); + Path crawlerLogFile = inputDir.resolve("crawler.log"); - Path crawlerLogFile = inputDir.resolve("crawler.log"); + for (var item : WorkLog.iterable(crawlerLogFile)) { + if (Thread.interrupted()) { + 
sjp.shutDownNow(); - for (var item : WorkLog.iterable(crawlerLogFile)) { - if (Thread.interrupted()) { - fjp.shutdownNow(); - - throw new InterruptedException(); - } - - Path crawlDataPath = inputDir.resolve(item.relPath()); - fjp.execute(() -> processFile(crawlDataPath, counts, docCount, se.get())); + throw new InterruptedException(); } - while (!fjp.isQuiescent()) { - if (fjp.awaitQuiescence(10, TimeUnit.SECONDS)) - break; - } + Path crawlDataPath = inputDir.resolve(item.relPath()); + sjp.submitQuietly(() -> processFile(crawlDataPath, counts, docCount, se.get())); } + sjp.shutDown(); + sjp.awaitTermination(10, TimeUnit.DAYS); + var tmpFile = Files.createTempFile(destStorage.asPath(), "freqs", ".dat.tmp", PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); From 8bf7d090fd2aac4edbe41b697d37390916b726b6 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 11 Apr 2024 17:20:13 +0200 Subject: [PATCH 27/90] (qs) Clean up parsing code using new record matching --- .../searchquery/model/query/SearchQuery.java | 2 +- .../query_parser/ExpansionStrategy.java | 7 - .../query_parser/QueryExpansion.java | 10 + .../searchquery/query_parser/QueryParser.java | 158 +++++++++------ .../query_parser/QueryTokenizer.java | 31 +-- .../query_parser/token/QueryToken.java | 86 ++++++++ .../searchquery/query_parser/token/Token.java | 49 ----- .../query_parser/token/TokenType.java | 34 ---- .../query_parser/token/TokenVisitor.java | 14 -- .../searchquery/svc/QueryFactory.java | 183 +++++++++++++----- .../svc/QueryLimitsAccumulator.java | 93 --------- .../svc/QuerySearchTermsAccumulator.java | 105 ---------- .../util/transform_list/TransformList.java | 9 + 13 files changed, 349 insertions(+), 432 deletions(-) delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/ExpansionStrategy.java create mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/QueryToken.java 
delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/Token.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenType.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenVisitor.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryLimitsAccumulator.java delete mode 100644 code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java index 9dd10396..ffe02868 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java @@ -72,7 +72,7 @@ public class SearchQuery { @Override public String toString() { StringBuilder sb = new StringBuilder(); - if (!compiledQuery.isEmpty()) sb.append("compiledQuery=").append(compiledQuery); + if (!compiledQuery.isEmpty()) sb.append("compiledQuery=").append(compiledQuery).append(", "); if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] "))); if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] "))); if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] "))); diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/ExpansionStrategy.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/ExpansionStrategy.java 
deleted file mode 100644 index 20ebffd1..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/ExpansionStrategy.java +++ /dev/null @@ -1,7 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser; - -import nu.marginalia.functions.searchquery.query_parser.model.QWordGraph; - -public interface ExpansionStrategy { - void expand(QWordGraph graph); -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java index 6415751b..052516d8 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java @@ -15,6 +15,9 @@ import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.IntStream; +/** Responsible for expanding a query, that is creating alternative branches of query execution + * to increase the number of results + */ public class QueryExpansion { private static final PorterStemmer ps = new PorterStemmer(); private final TermFrequencyDict dict; @@ -94,6 +97,10 @@ public class QueryExpansion { } } + /** Create an alternative interpretation of the query that replaces a sequence of words + * with a word n-gram. This makes it so that when possible, the order of words in the document + * matches the order of the words in the query. 
+ */ public void createSegments(QWordGraph graph) { List nodes = new ArrayList<>(); @@ -115,4 +122,7 @@ public class QueryExpansion { } } + public interface ExpansionStrategy { + void expand(QWordGraph graph); + } } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java index bbaf5c87..3f92a594 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java @@ -1,8 +1,7 @@ package nu.marginalia.functions.searchquery.query_parser; +import nu.marginalia.functions.searchquery.query_parser.token.QueryToken; import nu.marginalia.language.WordPatterns; -import nu.marginalia.functions.searchquery.query_parser.token.Token; -import nu.marginalia.functions.searchquery.query_parser.token.TokenType; import nu.marginalia.util.transform_list.TransformList; import java.util.List; @@ -11,95 +10,126 @@ public class QueryParser { private final QueryTokenizer tokenizer = new QueryTokenizer(); - public List parse(String query) { - List basicTokens = tokenizer.tokenizeQuery(query); + public List parse(String query) { + List basicTokens = tokenizer.tokenizeQuery(query); - TransformList list = new TransformList<>(basicTokens); + TransformList list = new TransformList<>(basicTokens); list.transformEach(QueryParser::handleQuoteTokens); list.transformEach(QueryParser::trimLiterals); list.transformEachPair(QueryParser::createNegatedTerms); list.transformEachPair(QueryParser::createPriorityTerms); list.transformEach(QueryParser::handleSpecialOperations); - list.scanAndTransform(TokenType.LPAREN, TokenType.RPAREN, QueryParser::handleAdvisoryTerms); + list.scanAndTransform(QueryToken.LParen.class::isInstance, QueryToken.RParen.class::isInstance, QueryParser::handleAdvisoryTerms); + 
list.transformEach(QueryParser::normalizeDomainName); return list.getBackingList(); } - private static void handleQuoteTokens(TransformList.Entity entity) { - var t = entity.value(); - if (t.type == TokenType.QUOT) { - entity.replace(new Token(TokenType.QUOT_TERM, - t.str.replaceAll("\\s+", WordPatterns.WORD_TOKEN_JOINER), - t.displayStr)); - } - } - - private static void trimLiterals(TransformList.Entity entity) { + private static void normalizeDomainName(TransformList.Entity entity) { var t = entity.value(); - if (t.type == TokenType.LITERAL_TERM - && (t.str.endsWith(":") || t.str.endsWith(".")) - && t.str.length() > 1) { - entity.replace(new Token(TokenType.LITERAL_TERM, t.str.substring(0, t.str.length() - 1), t.displayStr)); + if (!(t instanceof QueryToken.LiteralTerm)) + return; + + if (t.str().startsWith("site:")) { + entity.replace(new QueryToken.LiteralTerm(t.str().toLowerCase(), t.displayStr())); } } - private static void createNegatedTerms(TransformList.Entity first, TransformList.Entity second) { - var t = first.value(); - var tn = second.value(); - - if (t.type == TokenType.MINUS && tn.type == TokenType.LITERAL_TERM) { - first.remove(); - second.replace(new Token(TokenType.EXCLUDE_TERM, tn.str, "-" + tn.str)); - } - } - - private static void createPriorityTerms(TransformList.Entity first, TransformList.Entity second) { - var t = first.value(); - var tn = second.value(); - - if (t.type == TokenType.QMARK && tn.type == TokenType.LITERAL_TERM) { - first.remove(); - second.replace(new Token(TokenType.PRIORTY_TERM, tn.str, "?" 
+ tn.str)); - } - } - - private static void handleSpecialOperations(TransformList.Entity entity) { + private static void handleQuoteTokens(TransformList.Entity entity) { var t = entity.value(); - if (t.type != TokenType.LITERAL_TERM) { + + if (!(t instanceof QueryToken.Quot)) { return; } - if (t.str.startsWith("q") && t.str.matches("q[=><]\\d+")) { - entity.replace(new Token(TokenType.QUALITY_TERM, t.str.substring(1), t.displayStr)); - } else if (t.str.startsWith("near:")) { - entity.replace(new Token(TokenType.NEAR_TERM, t.str.substring(5), t.displayStr)); - } else if (t.str.startsWith("year") && t.str.matches("year[=><]\\d{4}")) { - entity.replace(new Token(TokenType.YEAR_TERM, t.str.substring(4), t.displayStr)); - } else if (t.str.startsWith("size") && t.str.matches("size[=><]\\d+")) { - entity.replace(new Token(TokenType.SIZE_TERM, t.str.substring(4), t.displayStr)); - } else if (t.str.startsWith("rank") && t.str.matches("rank[=><]\\d+")) { - entity.replace(new Token(TokenType.RANK_TERM, t.str.substring(4), t.displayStr)); - } else if (t.str.startsWith("qs=")) { - entity.replace(new Token(TokenType.QS_TERM, t.str.substring(3), t.displayStr)); - } else if (t.str.contains(":")) { - entity.replace(new Token(TokenType.ADVICE_TERM, t.str, t.displayStr)); - } + entity.replace(new QueryToken.QuotTerm( + t.str().replaceAll("\\s+", WordPatterns.WORD_TOKEN_JOINER), + t.displayStr())); } - private static void handleAdvisoryTerms(TransformList.Entity entity) { + private static void trimLiterals(TransformList.Entity entity) { var t = entity.value(); - if (t.type == TokenType.LPAREN) { - entity.remove(); - } else if (t.type == TokenType.RPAREN) { - entity.remove(); - } else if (t.type == TokenType.LITERAL_TERM) { - entity.replace(new Token(TokenType.ADVICE_TERM, t.str, "(" + t.str + ")")); + + if (!(t instanceof QueryToken.LiteralTerm lt)) + return; + + String str = lt.str(); + if (str.isBlank()) + return; + + if (str.endsWith(":") || str.endsWith(".")) { + 
entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 1), lt.displayStr())); + } + + } + + private static void createNegatedTerms(TransformList.Entity first, TransformList.Entity second) { + var t = first.value(); + var tn = second.value(); + + if (!(t instanceof QueryToken.Minus)) + return; + if (!(tn instanceof QueryToken.LiteralTerm) && !(tn instanceof QueryToken.AdviceTerm)) + return; + + first.remove(); + + second.replace(new QueryToken.ExcludeTerm(tn.str(), "-" + tn.displayStr())); + } + + private static void createPriorityTerms(TransformList.Entity first, TransformList.Entity second) { + var t = first.value(); + var tn = second.value(); + + if (!(t instanceof QueryToken.QMark)) + return; + if (!(tn instanceof QueryToken.LiteralTerm) && !(tn instanceof QueryToken.AdviceTerm)) + return; + + var replacement = new QueryToken.PriorityTerm(tn.str(), "?" + tn.displayStr()); + + first.remove(); + second.replace(replacement); + } + + private static void handleSpecialOperations(TransformList.Entity entity) { + var t = entity.value(); + if (!(t instanceof QueryToken.LiteralTerm)) { + return; + } + + String str = t.str(); + + if (str.startsWith("q") && str.matches("q[=><]\\d+")) { + entity.replace(new QueryToken.QualityTerm(str.substring(1))); + } else if (str.startsWith("near:")) { + entity.replace(new QueryToken.NearTerm(str.substring(5))); + } else if (str.startsWith("year") && str.matches("year[=><]\\d{4}")) { + entity.replace(new QueryToken.YearTerm(str.substring(4))); + } else if (str.startsWith("size") && str.matches("size[=><]\\d+")) { + entity.replace(new QueryToken.SizeTerm(str.substring(4))); + } else if (str.startsWith("rank") && str.matches("rank[=><]\\d+")) { + entity.replace(new QueryToken.RankTerm(str.substring(4))); + } else if (str.startsWith("qs=")) { + entity.replace(new QueryToken.QsTerm(str.substring(3))); + } else if (str.contains(":")) { + entity.replace(new QueryToken.AdviceTerm(str, t.displayStr())); } } + private static 
void handleAdvisoryTerms(TransformList.Entity entity) { + var t = entity.value(); + if (t instanceof QueryToken.LParen) { + entity.remove(); + } else if (t instanceof QueryToken.RParen) { + entity.remove(); + } else if (t instanceof QueryToken.LiteralTerm) { + entity.replace(new QueryToken.AdviceTerm(t.str(), "(" + t.displayStr() + ")")); + } + } } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java index b7b0a2b7..b12d68a9 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java @@ -1,7 +1,6 @@ package nu.marginalia.functions.searchquery.query_parser; -import nu.marginalia.functions.searchquery.query_parser.token.Token; -import nu.marginalia.functions.searchquery.query_parser.token.TokenType; +import nu.marginalia.functions.searchquery.query_parser.token.QueryToken; import nu.marginalia.language.encoding.AsciiFlattener; import java.util.ArrayList; @@ -11,8 +10,8 @@ import java.util.regex.Pattern; public class QueryTokenizer { private static final Pattern noisePattern = Pattern.compile("[,\\s]"); - public List tokenizeQuery(String rawQuery) { - List tokens = new ArrayList<>(); + public List tokenizeQuery(String rawQuery) { + List tokens = new ArrayList<>(); String query = AsciiFlattener.flattenUnicode(rawQuery); query = noisePattern.matcher(query).replaceAll(" "); @@ -21,26 +20,27 @@ public class QueryTokenizer { int chr = query.charAt(i); if ('(' == chr) { - tokens.add(new Token(TokenType.LPAREN, "(", "(")); + tokens.add(new QueryToken.LParen()); } else if (')' == chr) { - tokens.add(new Token(TokenType.RPAREN, ")", ")")); + tokens.add(new QueryToken.RParen()); } else if ('"' == chr) { int end = query.indexOf('"', i+1); + if (end == -1) 
{ end = query.length(); } - tokens.add(new Token(TokenType.QUOT, - query.substring(i+1, end).toLowerCase(), - query.substring(i, Math.min(query.length(), end+1)))); + + tokens.add(new QueryToken.Quot(query.substring(i + 1, end).toLowerCase())); + i = end; } else if ('-' == chr) { - tokens.add(new Token(TokenType.MINUS, "-")); + tokens.add(new QueryToken.Minus()); } else if ('?' == chr) { - tokens.add(new Token(TokenType.QMARK, "?")); + tokens.add(new QueryToken.QMark()); } else if (Character.isSpaceChar(chr)) { // @@ -52,9 +52,12 @@ public class QueryTokenizer { if (query.charAt(end) == ' ' || query.charAt(end) == ')') break; } - tokens.add(new Token(TokenType.LITERAL_TERM, - query.substring(i, end).toLowerCase(), - query.substring(i, end))); + + String displayStr = query.substring(i, end); + String str = displayStr.toLowerCase(); + + tokens.add(new QueryToken.LiteralTerm(str, displayStr)); + i = end-1; } } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/QueryToken.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/QueryToken.java new file mode 100644 index 00000000..b11fe370 --- /dev/null +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/QueryToken.java @@ -0,0 +1,86 @@ +package nu.marginalia.functions.searchquery.query_parser.token; + + +public sealed interface QueryToken { + String str(); + String displayStr(); + + record LiteralTerm(String str, String displayStr) implements QueryToken {} + record QuotTerm(String str, String displayStr) implements QueryToken {} + record ExcludeTerm(String str, String displayStr) implements QueryToken {} + record AdviceTerm(String str, String displayStr) implements QueryToken {} + record PriorityTerm(String str, String displayStr) implements QueryToken {} + + record QualityTerm(String str) implements QueryToken { + public String displayStr() { + return "q" + str; + } + } + record 
YearTerm(String str) implements QueryToken { + public String displayStr() { + return "year" + str; + } + } + record SizeTerm(String str) implements QueryToken { + public String displayStr() { + return "size" + str; + } + } + record RankTerm(String str) implements QueryToken { + public String displayStr() { + return "rank" + str; + } + } + record NearTerm(String str) implements QueryToken { + public String displayStr() { + return "near:" + str; + } + } + + record QsTerm(String str) implements QueryToken { + public String displayStr() { + return "qs" + str; + } + } + + record Quot(String str) implements QueryToken { + public String displayStr() { + return "\"" + str + "\""; + } + } + record Minus() implements QueryToken { + public String str() { + return "-"; + } + public String displayStr() { + return "-"; + } + } + record QMark() implements QueryToken { + public String str() { + return "?"; + } + public String displayStr() { + return "?"; + } + } + record LParen() implements QueryToken { + public String str() { + return "("; + } + public String displayStr() { + return "("; + } + } + record RParen() implements QueryToken { + public String str() { + return ")"; + } + public String displayStr() { + return ")"; + } + } + + record Ignore(String str, String displayStr) implements QueryToken {} + +} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/Token.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/Token.java deleted file mode 100644 index 06c28972..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/Token.java +++ /dev/null @@ -1,49 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.token; - -import lombok.EqualsAndHashCode; -import lombok.ToString; -import lombok.With; - -@ToString -@EqualsAndHashCode -@With -public class Token { - public TokenType type; - public String str; - public final String 
displayStr; - - public Token(TokenType type, String str, String displayStr) { - this.type = type; - this.str = str; - this.displayStr = safeString(displayStr); - } - - - public Token(TokenType type, String str) { - this.type = type; - this.str = str; - this.displayStr = safeString(str); - } - - private static String safeString(String s) { - return s.replaceAll("<", "<") - .replaceAll(">", ">"); - } - - public void visit(TokenVisitor visitor) { - switch (type) { - case QUOT_TERM: visitor.onQuotTerm(this); break; - case EXCLUDE_TERM: visitor.onExcludeTerm(this); break; - case PRIORTY_TERM: visitor.onPriorityTerm(this); break; - case ADVICE_TERM: visitor.onAdviceTerm(this); break; - case LITERAL_TERM: visitor.onLiteralTerm(this); break; - - case YEAR_TERM: visitor.onYearTerm(this); break; - case RANK_TERM: visitor.onRankTerm(this); break; - case SIZE_TERM: visitor.onSizeTerm(this); break; - case QS_TERM: visitor.onQsTerm(this); break; - - case QUALITY_TERM: visitor.onQualityTerm(this); break; - } - } -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenType.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenType.java deleted file mode 100644 index 85d55c35..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenType.java +++ /dev/null @@ -1,34 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.token; - -import java.util.function.Predicate; - -public enum TokenType implements Predicate { - TERM, - - - LITERAL_TERM, - QUOT_TERM, - EXCLUDE_TERM, - ADVICE_TERM, - PRIORTY_TERM, - - QUALITY_TERM, - YEAR_TERM, - SIZE_TERM, - RANK_TERM, - NEAR_TERM, - - QS_TERM, - - QUOT, - MINUS, - QMARK, - LPAREN, - RPAREN, - - IGNORE; - - public boolean test(Token t) { - return t.type == this; - } -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenVisitor.java 
b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenVisitor.java deleted file mode 100644 index 2e14f837..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/TokenVisitor.java +++ /dev/null @@ -1,14 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser.token; - -public interface TokenVisitor { - void onLiteralTerm(Token token); - void onQuotTerm(Token token); - void onExcludeTerm(Token token); - void onPriorityTerm(Token token); - void onAdviceTerm(Token token); - void onYearTerm(Token token); - void onSizeTerm(Token token); - void onRankTerm(Token token); - void onQualityTerm(Token token); - void onQsTerm(Token token); -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java index 55467b4f..26af1bf4 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java @@ -6,18 +6,19 @@ import nu.marginalia.api.searchquery.model.query.SearchSpecification; import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.functions.searchquery.query_parser.QueryExpansion; +import nu.marginalia.functions.searchquery.query_parser.token.QueryToken; +import nu.marginalia.index.query.limit.QueryStrategy; +import nu.marginalia.index.query.limit.SpecificationLimit; import nu.marginalia.language.WordPatterns; import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.query.ProcessedQuery; import nu.marginalia.functions.searchquery.query_parser.QueryParser; -import nu.marginalia.functions.searchquery.query_parser.token.Token; -import 
nu.marginalia.functions.searchquery.query_parser.token.TokenType; +import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; import java.util.List; @Singleton @@ -46,31 +47,89 @@ public class QueryFactory { List searchTermsHuman = new ArrayList<>(); List problems = new ArrayList<>(); - List basicQuery = queryParser.parse(query); + List basicQuery = queryParser.parse(query); if (basicQuery.size() >= 12) { problems.add("Your search query is too long"); basicQuery.clear(); } + List searchTermsExclude = new ArrayList<>(); + List searchTermsInclude = new ArrayList<>(); + List searchTermsAdvice = new ArrayList<>(); + List searchTermsPriority = new ArrayList<>(); + List> searchTermCoherences = new ArrayList<>(); - QueryLimitsAccumulator qualityLimits = new QueryLimitsAccumulator(params); + SpecificationLimit qualityLimit = SpecificationLimit.none(); + SpecificationLimit year = SpecificationLimit.none(); + SpecificationLimit size = SpecificationLimit.none(); + SpecificationLimit rank = SpecificationLimit.none(); + QueryStrategy queryStrategy = QueryStrategy.AUTO; - for (Token t : basicQuery) { - if (t.type == TokenType.QUOT_TERM || t.type == TokenType.LITERAL_TERM) { - if (t.str.startsWith("site:")) { - t.str = normalizeDomainName(t.str); + String domain = null; + + System.out.println(basicQuery); + + for (QueryToken t : basicQuery) { + switch (t) { + case QueryToken.QuotTerm(String str, String displayStr) -> { + analyzeSearchTerm(problems, str, displayStr); + searchTermsHuman.addAll(Arrays.asList(displayStr.replace("\"", "").split("\\s+"))); + + String[] parts = StringUtils.split(str, '_'); + + // Checking for stop words here is a bit of a stop-gap to fix the issue of stop words being + // required in the query (which is a problem because they are not indexed). 
How to do this + // in a clean way is a bit of an open problem that may not get resolved until query-parsing is + // improved. + + if (parts.length > 1 && !anyPartIsStopWord(parts)) { + // Prefer that the actual n-gram is present + searchTermsAdvice.add(str); + + // Require that the terms appear in the same sentence + searchTermCoherences.add(Arrays.asList(parts)); + + // Require that each term exists in the document + // (needed for ranking) + searchTermsInclude.addAll(Arrays.asList(parts)); + } + else { + searchTermsInclude.add(str); + } + } + case QueryToken.LiteralTerm(String str, String displayStr) -> { + analyzeSearchTerm(problems, str, displayStr); + searchTermsHuman.addAll(Arrays.asList(displayStr.split("\\s+"))); + + searchTermsInclude.add(str); } - searchTermsHuman.addAll(toHumanSearchTerms(t)); - analyzeSearchTerm(problems, t); - } - t.visit(qualityLimits); + case QueryToken.ExcludeTerm(String str, String displayStr) -> searchTermsExclude.add(str); + case QueryToken.PriorityTerm(String str, String displayStr) -> searchTermsPriority.add(str); + case QueryToken.AdviceTerm(String str, String displayStr) -> { + searchTermsAdvice.add(str); + + if (str.toLowerCase().startsWith("site:")) { + domain = str.substring("site:".length()); + } + } + + case QueryToken.YearTerm(String str) -> year = parseSpecificationLimit(str); + case QueryToken.SizeTerm(String str) -> size = parseSpecificationLimit(str); + case QueryToken.RankTerm(String str) -> rank = parseSpecificationLimit(str); + case QueryToken.QualityTerm(String str) -> qualityLimit = parseSpecificationLimit(str); + case QueryToken.QsTerm(String str) -> queryStrategy = parseQueryStrategy(str); + + default -> {} + } } - QuerySearchTermsAccumulator termsAccumulator = new QuerySearchTermsAccumulator(basicQuery); - String domain = termsAccumulator.domain; + if (searchTermsInclude.isEmpty() && !searchTermsAdvice.isEmpty()) { + searchTermsInclude.addAll(searchTermsAdvice); + searchTermsAdvice.clear(); + } List 
domainIds = params.domainIds(); @@ -80,29 +139,29 @@ public class QueryFactory { limits = limits.forSingleDomain(); } + var searchQuery = new SearchQuery( + queryExpansion.expandQuery( + searchTermsInclude + ), + searchTermsInclude, + searchTermsExclude, + searchTermsAdvice, + searchTermsPriority, + searchTermCoherences + ); + var specsBuilder = SearchSpecification.builder() - .query( - new SearchQuery( - queryExpansion.expandQuery( - termsAccumulator.searchTermsInclude - ), - termsAccumulator.searchTermsInclude, - termsAccumulator.searchTermsExclude, - termsAccumulator.searchTermsAdvice, - termsAccumulator.searchTermsPriority, - termsAccumulator.searchTermCoherences - ) - ) + .query(searchQuery) .humanQuery(query) - .quality(qualityLimits.qualityLimit) - .year(qualityLimits.year) - .size(qualityLimits.size) - .rank(qualityLimits.rank) + .quality(qualityLimit) + .year(year) + .size(size) + .rank(rank) .domains(domainIds) .queryLimits(limits) .searchSetIdentifier(params.identifier()) .rankingParams(ResultRankingParameters.sensibleDefaults()) - .queryStrategy(qualityLimits.queryStrategy); + .queryStrategy(queryStrategy); SearchSpecification specs = specsBuilder.build(); @@ -113,30 +172,52 @@ public class QueryFactory { return new ProcessedQuery(specs, searchTermsHuman, domain); } - private String normalizeDomainName(String str) { - return str.toLowerCase(); - } - - private List toHumanSearchTerms(Token t) { - if (t.type == TokenType.LITERAL_TERM) { - return Arrays.asList(t.displayStr.split("\\s+")); - } - else if (t.type == TokenType.QUOT_TERM) { - return Arrays.asList(t.displayStr.replace("\"", "").split("\\s+")); - - } - return Collections.emptyList(); - } - - private void analyzeSearchTerm(List problems, Token term) { - final String word = term.str; + private void analyzeSearchTerm(List problems, String str, String displayStr) { + final String word = str; if (word.length() < WordPatterns.MIN_WORD_LENGTH) { - problems.add("Search term \"" + term.displayStr + "\" 
too short"); + problems.add("Search term \"" + displayStr + "\" too short"); } if (!word.contains("_") && word.length() >= WordPatterns.MAX_WORD_LENGTH) { - problems.add("Search term \"" + term.displayStr + "\" too long"); + problems.add("Search term \"" + displayStr + "\" too long"); + } + } + private SpecificationLimit parseSpecificationLimit(String str) { + int startChar = str.charAt(0); + + int val = Integer.parseInt(str.substring(1)); + if (startChar == '=') { + return SpecificationLimit.equals(val); + } else if (startChar == '<') { + return SpecificationLimit.lessThan(val); + } else if (startChar == '>') { + return SpecificationLimit.greaterThan(val); + } else { + return SpecificationLimit.none(); } } + private QueryStrategy parseQueryStrategy(String str) { + return switch (str.toUpperCase()) { + case "RF_TITLE" -> QueryStrategy.REQUIRE_FIELD_TITLE; + case "RF_SUBJECT" -> QueryStrategy.REQUIRE_FIELD_SUBJECT; + case "RF_SITE" -> QueryStrategy.REQUIRE_FIELD_SITE; + case "RF_URL" -> QueryStrategy.REQUIRE_FIELD_URL; + case "RF_DOMAIN" -> QueryStrategy.REQUIRE_FIELD_DOMAIN; + case "RF_LINK" -> QueryStrategy.REQUIRE_FIELD_LINK; + case "SENTENCE" -> QueryStrategy.SENTENCE; + case "TOPIC" -> QueryStrategy.TOPIC; + default -> QueryStrategy.AUTO; + }; + } + + + private boolean anyPartIsStopWord(String[] parts) { + for (String part : parts) { + if (WordPatterns.isStopWord(part)) { + return true; + } + } + return false; + } } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryLimitsAccumulator.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryLimitsAccumulator.java deleted file mode 100644 index 1b49bab3..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryLimitsAccumulator.java +++ /dev/null @@ -1,93 +0,0 @@ -package nu.marginalia.functions.searchquery.svc; - -import nu.marginalia.api.searchquery.model.query.QueryParams; -import 
nu.marginalia.index.query.limit.QueryStrategy; -import nu.marginalia.index.query.limit.SpecificationLimit; -import nu.marginalia.functions.searchquery.query_parser.token.Token; -import nu.marginalia.functions.searchquery.query_parser.token.TokenVisitor; - -public class QueryLimitsAccumulator implements TokenVisitor { - public SpecificationLimit qualityLimit; - public SpecificationLimit year; - public SpecificationLimit size; - public SpecificationLimit rank; - - public QueryStrategy queryStrategy = QueryStrategy.AUTO; - - public QueryLimitsAccumulator(QueryParams params) { - qualityLimit = params.quality(); - year = params.year(); - size = params.size(); - rank = params.rank(); - } - - private SpecificationLimit parseSpecificationLimit(String str) { - int startChar = str.charAt(0); - - int val = Integer.parseInt(str.substring(1)); - if (startChar == '=') { - return SpecificationLimit.equals(val); - } else if (startChar == '<') { - return SpecificationLimit.lessThan(val); - } else if (startChar == '>') { - return SpecificationLimit.greaterThan(val); - } else { - return SpecificationLimit.none(); - } - } - - private QueryStrategy parseQueryStrategy(String str) { - return switch (str.toUpperCase()) { - case "RF_TITLE" -> QueryStrategy.REQUIRE_FIELD_TITLE; - case "RF_SUBJECT" -> QueryStrategy.REQUIRE_FIELD_SUBJECT; - case "RF_SITE" -> QueryStrategy.REQUIRE_FIELD_SITE; - case "RF_URL" -> QueryStrategy.REQUIRE_FIELD_URL; - case "RF_DOMAIN" -> QueryStrategy.REQUIRE_FIELD_DOMAIN; - case "RF_LINK" -> QueryStrategy.REQUIRE_FIELD_LINK; - case "SENTENCE" -> QueryStrategy.SENTENCE; - case "TOPIC" -> QueryStrategy.TOPIC; - default -> QueryStrategy.AUTO; - }; - } - - @Override - public void onYearTerm(Token token) { - year = parseSpecificationLimit(token.str); - } - - @Override - public void onSizeTerm(Token token) { - size = parseSpecificationLimit(token.str); - } - - @Override - public void onRankTerm(Token token) { - rank = parseSpecificationLimit(token.str); - } - - @Override 
- public void onQualityTerm(Token token) { - qualityLimit = parseSpecificationLimit(token.str); - } - - @Override - public void onQsTerm(Token token) { - queryStrategy = parseQueryStrategy(token.str); - } - - - @Override - public void onLiteralTerm(Token token) {} - - @Override - public void onQuotTerm(Token token) {} - - @Override - public void onExcludeTerm(Token token) {} - - @Override - public void onPriorityTerm(Token token) {} - - @Override - public void onAdviceTerm(Token token) {} -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java deleted file mode 100644 index cc3a7e56..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QuerySearchTermsAccumulator.java +++ /dev/null @@ -1,105 +0,0 @@ -package nu.marginalia.functions.searchquery.svc; - -import nu.marginalia.api.searchquery.model.query.SearchQuery; -import nu.marginalia.language.WordPatterns; -import nu.marginalia.functions.searchquery.query_parser.token.Token; -import nu.marginalia.functions.searchquery.query_parser.token.TokenVisitor; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -/** @see SearchQuery */ -public class QuerySearchTermsAccumulator implements TokenVisitor { - public List searchTermsExclude = new ArrayList<>(); - public List searchTermsInclude = new ArrayList<>(); - public List searchTermsAdvice = new ArrayList<>(); - public List searchTermsPriority = new ArrayList<>(); - public List> searchTermCoherences = new ArrayList<>(); - - public String domain; - - public QuerySearchTermsAccumulator(List parts) { - for (Token t : parts) { - t.visit(this); - } - - if (searchTermsInclude.isEmpty() && !searchTermsAdvice.isEmpty()) { - searchTermsInclude.addAll(searchTermsAdvice); - searchTermsAdvice.clear(); - } - - } - - @Override - public void 
onLiteralTerm(Token token) { - searchTermsInclude.add(token.str); - } - - @Override - public void onQuotTerm(Token token) { - String[] parts = token.str.split("_"); - - // HACK (2023-05-02 vlofgren) - // - // Checking for stop words here is a bit of a stop-gap to fix the issue of stop words being - // required in the query (which is a problem because they are not indexed). How to do this - // in a clean way is a bit of an open problem that may not get resolved until query-parsing is - // improved. - - if (parts.length > 1 && !anyPartIsStopWord(parts)) { - // Prefer that the actual n-gram is present - searchTermsAdvice.add(token.str); - - // Require that the terms appear in the same sentence - searchTermCoherences.add(Arrays.asList(parts)); - - // Require that each term exists in the document - // (needed for ranking) - searchTermsInclude.addAll(Arrays.asList(parts)); - } - else { - searchTermsInclude.add(token.str); - - } - } - - private boolean anyPartIsStopWord(String[] parts) { - for (String part : parts) { - if (WordPatterns.isStopWord(part)) { - return true; - } - } - return false; - } - - @Override - public void onExcludeTerm(Token token) { - searchTermsExclude.add(token.str); - } - - @Override - public void onPriorityTerm(Token token) { - searchTermsPriority.add(token.str); - } - - @Override - public void onAdviceTerm(Token token) { - searchTermsAdvice.add(token.str); - - if (token.str.toLowerCase().startsWith("site:")) { - domain = token.str.substring("site:".length()); - } - } - - @Override - public void onYearTerm(Token token) {} - @Override - public void onSizeTerm(Token token) {} - @Override - public void onRankTerm(Token token) {} - @Override - public void onQualityTerm(Token token) {} - @Override - public void onQsTerm(Token token) {} -} diff --git a/code/functions/search-query/java/nu/marginalia/util/transform_list/TransformList.java b/code/functions/search-query/java/nu/marginalia/util/transform_list/TransformList.java index 08bc428e..62dd2e0a 
100644 --- a/code/functions/search-query/java/nu/marginalia/util/transform_list/TransformList.java +++ b/code/functions/search-query/java/nu/marginalia/util/transform_list/TransformList.java @@ -80,6 +80,15 @@ public class TransformList { iter.remove(); } } + else if (firstEntity.action == Action.NO_OP) { + if (secondEntry.action == Action.REPLACE) { + backingList.set(iter.nextIndex(), secondEntry.value); + } + else if (secondEntry.action == Action.REMOVE) { + iter.next(); + iter.remove(); + } + } } } From 7dd8c78c6b28bfd3ddc19f9390b45dfc4315aa39 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 11 Apr 2024 18:12:01 +0200 Subject: [PATCH 28/90] (ngrams) Remove the vestigial logic for capturing permutations of n-grams The change also reduces the object churn in NGramLexicon, as this is a very hot method in the converter. --- .../task/ExportSegmentationModelActor.java | 4 +- .../query_parser/QueryExpansion.java | 9 +- .../segmentation/NgramExporterMain.java | 46 --------- .../segmentation/NgramExtractorMain.java | 7 +- .../marginalia/segmentation/NgramLexicon.java | 96 +++++++++---------- .../segmentation/NgramLexiconTest.java | 18 +--- 6 files changed, 60 insertions(+), 120 deletions(-) delete mode 100644 code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExporterMain.java diff --git a/code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java b/code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java index 90baf009..98cf114e 100644 --- a/code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java +++ b/code/execution/java/nu/marginalia/actor/task/ExportSegmentationModelActor.java @@ -21,6 +21,7 @@ public class ExportSegmentationModelActor extends RecordActorPrototype { private final Logger logger = LoggerFactory.getLogger(getClass()); public record Export(String zimFile) implements ActorStep {} + @Override public ActorStep transition(ActorStep self) throws Exception { return 
switch(self) { @@ -29,9 +30,8 @@ public class ExportSegmentationModelActor extends RecordActorPrototype { var storage = storageService.allocateStorage(FileStorageType.EXPORT, "segmentation-model", "Segmentation Model Export " + LocalDateTime.now()); Path countsFile = storage.asPath().resolve("ngram-counts.bin"); - Path permutationsFile = storage.asPath().resolve("ngram-permutations.bin"); - NgramExtractorMain.dumpCounts(Path.of(zimFile), countsFile, permutationsFile); + NgramExtractorMain.dumpCounts(Path.of(zimFile), countsFile); yield new End(); } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java index 052516d8..9c9d81fa 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java @@ -112,10 +112,15 @@ public class QueryExpansion { // Look for known segments within the query for (int length = 2; length < Math.min(10, words.length); length++) { - for (var segment : lexicon.findSegments(length, words)) { + for (var segment : lexicon.findSegmentOffsets(length, words)) { + int start = segment.start(); int end = segment.start() + segment.length(); - var word = IntStream.range(start, end).mapToObj(nodes::get).map(QWord::word).collect(Collectors.joining("_")); + + var word = IntStream.range(start, end) + .mapToObj(nodes::get) + .map(QWord::word) + .collect(Collectors.joining("_")); graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word); } diff --git a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExporterMain.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExporterMain.java deleted file mode 100644 index ee6d2cd5..00000000 --- 
a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExporterMain.java +++ /dev/null @@ -1,46 +0,0 @@ -package nu.marginalia.segmentation; - -import nu.marginalia.LanguageModels; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.Arrays; -import java.util.Scanner; - -public class NgramExporterMain { - - public static void main(String... args) throws IOException { - trial(); - } - - static void trial() throws IOException { - NgramLexicon lexicon = new NgramLexicon( - LanguageModels.builder() - .segments(Path.of("/home/vlofgren/ngram-counts.bin")) - .build() - ); - - System.out.println("Loaded!"); - - var scanner = new Scanner(System.in); - for (;;) { - System.out.println("Enter a sentence: "); - String line = scanner.nextLine(); - System.out.println("."); - if (line == null) - break; - - String[] terms = BasicSentenceExtractor.getStemmedParts(line); - System.out.println("."); - - for (int i = 2; i< 8; i++) { - lexicon.findSegments(i, terms).forEach(p -> { - System.out.println(STR."\{Arrays.toString(p.project(terms))}: \{p.count()}"); - }); - } - - } - } - - -} diff --git a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java index 577aee6e..3f29c74c 100644 --- a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java +++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java @@ -115,8 +115,7 @@ public class NgramExtractorMain { } public static void dumpCounts(Path zimInputFile, - Path countsOutputFile, - Path permutationsOutputFile + Path countsOutputFile ) throws IOException, InterruptedException { ZIMReader reader = new ZIMReader(new ZIMFile(zimInputFile.toString())); @@ -143,9 +142,6 @@ public class NgramExtractorMain { for (var hash : orderedHashes) { lexicon.incOrdered(hash); } - for (var hash : unorderedHashes) { 
- lexicon.addUnordered(hash); - } } }); @@ -153,7 +149,6 @@ public class NgramExtractorMain { } lexicon.saveCounts(countsOutputFile); - lexicon.savePermutations(permutationsOutputFile); } } diff --git a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java index 91cee314..e7dc1017 100644 --- a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java +++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java @@ -21,10 +21,8 @@ import java.util.List; @Singleton public class NgramLexicon { private final Long2IntOpenCustomHashMap counts; - private final LongOpenHashSet permutations = new LongOpenHashSet(); private static final HasherGroup orderedHasher = HasherGroup.ordered(); - private static final HasherGroup unorderedHasher = HasherGroup.unordered(); @Inject public NgramLexicon(LanguageModels models) { @@ -48,16 +46,57 @@ public class NgramLexicon { } public List findSegmentsStrings(int minLength, int maxLength, String... parts) { - List segments = new ArrayList<>(); + List segments = new ArrayList<>(); for (int i = minLength; i <= maxLength; i++) { segments.addAll(findSegments(i, parts)); } - return segments.stream().map(seg -> seg.project(parts)).toList(); + return segments; } - public List findSegments(int length, String... parts) { + public List findSegments(int length, String... 
parts) { + // Don't look for ngrams longer than the sentence + if (parts.length < length) return List.of(); + + List positions = new ArrayList<>(); + + // Hash the parts + long[] hashes = new long[parts.length]; + for (int i = 0; i < hashes.length; i++) { + hashes[i] = HasherGroup.hash(parts[i]); + } + + long ordered = 0; + int i = 0; + + // Prepare by combining up to length hashes + for (; i < length; i++) { + ordered = orderedHasher.apply(ordered, hashes[i]); + } + + // Slide the window and look for matches + for (;; i++) { + int ct = counts.get(ordered); + + if (ct > 0) { + positions.add(Arrays.copyOfRange(parts, i - length, length)); + } + + if (i >= hashes.length) + break; + + // Remove the oldest hash and add the new one + ordered = orderedHasher.replace(ordered, + hashes[i], + hashes[i - length], + length); + } + + return positions; + } + + public List findSegmentOffsets(int length, String... parts) { // Don't look for ngrams longer than the sentence if (parts.length < length) return List.of(); @@ -70,13 +109,11 @@ public class NgramLexicon { } long ordered = 0; - long unordered = 0; int i = 0; // Prepare by combining up to length hashes for (; i < length; i++) { ordered = orderedHasher.apply(ordered, hashes[i]); - unordered = unorderedHasher.apply(unordered, hashes[i]); } // Slide the window and look for matches @@ -84,10 +121,7 @@ public class NgramLexicon { int ct = counts.get(ordered); if (ct > 0) { - positions.add(new SentenceSegment(i - length, length, ct, PositionType.NGRAM)); - } - else if (permutations.contains(unordered)) { - positions.add(new SentenceSegment(i - length, length, 0, PositionType.PERMUTATION)); + positions.add(new SentenceSegment(i - length, length, ct)); } if (i >= hashes.length) @@ -98,10 +132,6 @@ public class NgramLexicon { hashes[i], hashes[i - length], length); - unordered = unorderedHasher.replace(unordered, - hashes[i], - hashes[i - length], - length); } return positions; @@ -110,20 +140,6 @@ public class NgramLexicon { 
public void incOrdered(long hashOrdered) { counts.addTo(hashOrdered, 1); } - public void addUnordered(long hashUnordered) { - permutations.add(hashUnordered); - } - - - public void loadPermutations(Path path) throws IOException { - try (var dis = new DataInputStream(Files.newInputStream(path))) { - long size = dis.readInt(); - - for (int i = 0; i < size; i++) { - permutations.add(dis.readLong()); - } - } - } public void saveCounts(Path file) throws IOException { try (var dos = new DataOutputStream(Files.newOutputStream(file, @@ -142,37 +158,17 @@ public class NgramLexicon { }); } } - public void savePermutations(Path file) throws IOException { - try (var dos = new DataOutputStream(Files.newOutputStream(file, - StandardOpenOption.CREATE, - StandardOpenOption.TRUNCATE_EXISTING, - StandardOpenOption.WRITE))) { - dos.writeInt(counts.size()); - permutations.forEach(v -> { - try { - dos.writeLong(v); - } catch (IOException e) { - throw new RuntimeException(e); - } - }); - } - } public void clear() { - permutations.clear(); counts.clear(); } - public record SentenceSegment(int start, int length, int count, PositionType type) { + public record SentenceSegment(int start, int length, int count) { public String[] project(String... parts) { return Arrays.copyOfRange(parts, start, start + length); } } - enum PositionType { - NGRAM, PERMUTATION - } - private static class KeyIsAlreadyHashStrategy implements LongHash.Strategy { @Override public int hashCode(long l) { diff --git a/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java index d5065959..351ce869 100644 --- a/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java +++ b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java @@ -14,7 +14,6 @@ class NgramLexiconTest { void addNgram(String... 
ngram) { lexicon.incOrdered(HasherGroup.ordered().rollingHash(ngram)); - lexicon.addUnordered(HasherGroup.unordered().rollingHash(ngram)); } @Test @@ -26,25 +25,16 @@ class NgramLexiconTest { String[] sent = { "hello", "world", "rye", "bread" }; var segments = lexicon.findSegments(2, "hello", "world", "rye", "bread"); - assertEquals(3, segments.size()); + assertEquals(2, segments.size()); - for (int i = 0; i < 3; i++) { + for (int i = 0; i < 2; i++) { var segment = segments.get(i); switch (i) { case 0 -> { - assertArrayEquals(new String[]{"hello", "world"}, segment.project(sent)); - assertEquals(1, segment.count()); - assertEquals(NgramLexicon.PositionType.NGRAM, segment.type()); + assertArrayEquals(new String[]{"hello", "world"}, segment); } case 1 -> { - assertArrayEquals(new String[]{"world", "rye"}, segment.project(sent)); - assertEquals(0, segment.count()); - assertEquals(NgramLexicon.PositionType.PERMUTATION, segment.type()); - } - case 2 -> { - assertArrayEquals(new String[]{"rye", "bread"}, segment.project(sent)); - assertEquals(1, segment.count()); - assertEquals(NgramLexicon.PositionType.NGRAM, segment.type()); + assertArrayEquals(new String[]{"rye", "bread"}, segment); } } } From 55f627ed4c24331e2a400dd57a3abfa4e0e6e4db Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 11 Apr 2024 18:50:21 +0200 Subject: [PATCH 29/90] (index) Clean up the code --- .../marginalia/index/ReverseIndexReader.java | 56 ++++++++++++------- .../index/index/CombinedIndexReader.java | 1 + .../index/index/IndexQueryBuilderImpl.java | 13 +---- .../index/index/QueryBranchWalker.java | 2 +- .../marginalia/index/index/StatefulIndex.java | 39 +++++++------ .../index/query/IndexQueryBuilder.java | 9 +-- 6 files changed, 64 insertions(+), 56 deletions(-) diff --git a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java index e37de80d..72feb7fd 100644 --- 
a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java +++ b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java @@ -68,8 +68,12 @@ public class ReverseIndexReader { } - long wordOffset(long wordId) { - long idx = wordsBTreeReader.findEntry(wordId); + /** Calculate the offset of the word in the documents. + * If the return-value is negative, the term does not exist + * in the index. + */ + long wordOffset(long termId) { + long idx = wordsBTreeReader.findEntry(termId); if (idx < 0) return -1L; @@ -77,37 +81,43 @@ public class ReverseIndexReader { return words.get(wordsDataOffset + idx + 1); } - public EntrySource documents(long wordId) { + public EntrySource documents(long termId) { if (null == words) { logger.warn("Reverse index is not ready, dropping query"); return new EmptyEntrySource(); } - long offset = wordOffset(wordId); + long offset = wordOffset(termId); - if (offset < 0) return new EmptyEntrySource(); + if (offset < 0) // No documents + return new EmptyEntrySource(); - return new ReverseIndexEntrySource(name, createReaderNew(offset), 2, wordId); + return new ReverseIndexEntrySource(name, createReaderNew(offset), 2, termId); } - public QueryFilterStepIf also(long wordId) { - long offset = wordOffset(wordId); + /** Create a filter step requiring the specified termId to exist in the documents */ + public QueryFilterStepIf also(long termId) { + long offset = wordOffset(termId); - if (offset < 0) return new QueryFilterNoPass(); + if (offset < 0) // No documents + return new QueryFilterNoPass(); - return new ReverseIndexRetainFilter(createReaderNew(offset), name, wordId); + return new ReverseIndexRetainFilter(createReaderNew(offset), name, termId); } - public QueryFilterStepIf not(long wordId) { - long offset = wordOffset(wordId); + /** Create a filter step requiring the specified termId to be absent from the documents */ + public QueryFilterStepIf not(long termId) { + long offset = wordOffset(termId); - if (offset < 
0) return new QueryFilterLetThrough(); + if (offset < 0) // No documents + return new QueryFilterLetThrough(); return new ReverseIndexRejectFilter(createReaderNew(offset)); } - public int numDocuments(long wordId) { - long offset = wordOffset(wordId); + /** Return the number of documents with the termId in the index */ + public int numDocuments(long termId) { + long offset = wordOffset(termId); if (offset < 0) return 0; @@ -115,15 +125,20 @@ public class ReverseIndexReader { return createReaderNew(offset).numEntries(); } + /** Create a BTreeReader for the document offset associated with a termId */ private BTreeReader createReaderNew(long offset) { - return new BTreeReader(documents, ReverseIndexParameters.docsBTreeContext, offset); + return new BTreeReader( + documents, + ReverseIndexParameters.docsBTreeContext, + offset); } - public long[] getTermMeta(long wordId, long[] docIds) { - long offset = wordOffset(wordId); + public long[] getTermMeta(long termId, long[] docIds) { + long offset = wordOffset(termId); if (offset < 0) { - logger.debug("Missing offset for word {}", wordId); + // This is likely a bug in the code, but we can't throw an exception here + logger.debug("Missing offset for word {}", termId); return new long[docIds.length]; } @@ -136,10 +151,9 @@ public class ReverseIndexReader { private boolean isUniqueAndSorted(long[] ids) { if (ids.length == 0) return true; - long prev = ids[0]; for (int i = 1; i < ids.length; i++) { - if(ids[i] <= prev) + if(ids[i] <= ids[i-1]) return false; } diff --git a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java index 3846bad8..27a631f5 100644 --- a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java +++ b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java @@ -41,6 +41,7 @@ public class CombinedIndexReader { public QueryFilterStepIf hasWordFull(long termId) { return reverseIndexFullReader.also(termId); } 
+ public QueryFilterStepIf hasWordPrio(long termId) { return reverseIndexPriorityReader.also(termId); } diff --git a/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java b/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java index 33ca033e..0f63fdbc 100644 --- a/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java +++ b/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java @@ -36,7 +36,7 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder { return this; } - public IndexQueryBuilder alsoFull(long termId) { + public IndexQueryBuilder also(long termId) { if (alreadyConsideredTerms.add(termId)) { query.addInclusionFilter(reverseIndexFullReader.also(termId)); @@ -45,16 +45,7 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder { return this; } - public IndexQueryBuilder alsoPrio(long termId) { - - if (alreadyConsideredTerms.add(termId)) { - query.addInclusionFilter(reverseIndexPrioReader.also(termId)); - } - - return this; - } - - public IndexQueryBuilder notFull(long termId) { + public IndexQueryBuilder not(long termId) { query.addInclusionFilter(reverseIndexFullReader.not(termId)); diff --git a/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java b/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java index 34b04f0a..ffaa5176 100644 --- a/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java +++ b/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java @@ -75,7 +75,7 @@ public class QueryBranchWalker { // in practice only when an index doesn't contain all the search terms, so we can just // skip those paths if (!remainingPaths.isEmpty()) { - logger.info("Dropping: {}", remainingPaths); + logger.debug("Dropping: {}", remainingPaths); } return ret; diff --git a/code/index/java/nu/marginalia/index/index/StatefulIndex.java b/code/index/java/nu/marginalia/index/index/StatefulIndex.java index 273da2d0..ae7b1353 100644 --- 
a/code/index/java/nu/marginalia/index/index/StatefulIndex.java +++ b/code/index/java/nu/marginalia/index/index/StatefulIndex.java @@ -125,59 +125,65 @@ public class StatefulIndex { // the term is missing from the index and can never be found paths.removeIf(containsAll(termPriority).negate()); - List helpers = QueryBranchWalker.create(termPriority, paths); + List walkers = QueryBranchWalker.create(termPriority, paths); - for (var helper : helpers) { + for (var walker : walkers) { for (var builder : List.of( - combinedIndexReader.findPriorityWord(helper.termId), - combinedIndexReader.findFullWord(helper.termId) + combinedIndexReader.findPriorityWord(walker.termId), + combinedIndexReader.findFullWord(walker.termId) )) { queryHeads.add(builder); - if (helper.atEnd()) - continue; + if (walker.atEnd()) + continue; // Single term search query + // Add filter steps for the remaining combinations of terms List filterSteps = new ArrayList<>(); - for (var step : helper.next()) { + for (var step : walker.next()) { filterSteps.add(createFilter(step, 0)); } builder.addInclusionFilterAny(filterSteps); } } - List ret = new ArrayList<>(10); // Add additional conditions to the query heads for (var query : queryHeads) { // Advice terms are a special case, mandatory but not ranked, and exempt from re-writing for (long term : terms.advice()) { - query = query.alsoFull(term); + query = query.also(term); } for (long term : terms.excludes()) { - query = query.notFull(term); + query = query.not(term); } // Run these filter steps last, as they'll worst-case cause as many page faults as there are // items in the buffer - ret.add(query.addInclusionFilter(combinedIndexReader.filterForParams(params)).build()); + query.addInclusionFilter(combinedIndexReader.filterForParams(params)); } - - return ret; + return queryHeads + .stream() + .map(IndexQueryBuilder::build) + .toList(); } /** Recursively create a filter step based on the QBW and its children */ private QueryFilterStepIf 
createFilter(QueryBranchWalker walker, int depth) { + + // Create a filter for the current termId final QueryFilterStepIf ownFilterCondition = ownFilterCondition(walker, depth); var childSteps = walker.next(); - - if (childSteps.isEmpty()) + if (childSteps.isEmpty()) // no children, and so we're satisfied with just a single filter condition return ownFilterCondition; + // If there are children, we append the filter conditions for each child as an anyOf condition + // to the current filter condition + List combinedFilters = new ArrayList<>(); for (var step : childSteps) { @@ -186,6 +192,7 @@ public class StatefulIndex { combinedFilters.add(new QueryFilterAllOf(ownFilterCondition, childFilter)); } + // Flatten the filter conditions if there's only one branch if (combinedFilters.size() == 1) return combinedFilters.getFirst(); else @@ -196,7 +203,7 @@ public class StatefulIndex { private QueryFilterStepIf ownFilterCondition(QueryBranchWalker walker, int depth) { if (depth < 2) { // At shallow depths we prioritize terms that appear in the priority index, - // to increase the odds we find "good" results before the sand runs out + // to increase the odds we find "good" results before the execution timer runs out return new QueryFilterAnyOf( combinedIndexReader.hasWordPrio(walker.termId), combinedIndexReader.hasWordFull(walker.termId) diff --git a/code/index/query/java/nu/marginalia/index/query/IndexQueryBuilder.java b/code/index/query/java/nu/marginalia/index/query/IndexQueryBuilder.java index 74ebdea1..855309fa 100644 --- a/code/index/query/java/nu/marginalia/index/query/IndexQueryBuilder.java +++ b/code/index/query/java/nu/marginalia/index/query/IndexQueryBuilder.java @@ -11,16 +11,11 @@ import java.util.List; public interface IndexQueryBuilder { /** Filters documents that also contain termId, within the full index. */ - IndexQueryBuilder alsoFull(long termId); - - /** - * Filters documents that also contain the termId, within the priority index. 
- */ - IndexQueryBuilder alsoPrio(long termIds); + IndexQueryBuilder also(long termId); /** Excludes documents that contain termId, within the full index */ - IndexQueryBuilder notFull(long termId); + IndexQueryBuilder not(long termId); IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep); IndexQueryBuilder addInclusionFilterAny(List filterStep); From a0d9e66ff751b6b71b16d544289defe5d4d40aac Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 12 Apr 2024 10:13:25 +0200 Subject: [PATCH 30/90] (ngram) Fix index range in NgramLexicon to an avoid exception --- .../java/nu/marginalia/segmentation/NgramLexicon.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java index e7dc1017..7a6beeb8 100644 --- a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java +++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java @@ -80,7 +80,7 @@ public class NgramLexicon { int ct = counts.get(ordered); if (ct > 0) { - positions.add(Arrays.copyOfRange(parts, i - length, length)); + positions.add(Arrays.copyOfRange(parts, i - length, i)); } if (i >= hashes.length) From c96da0ce1e50b09e629cb13c517e550ebd5337cb Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 12 Apr 2024 17:44:14 +0200 Subject: [PATCH 31/90] (segmentation) Pick best segmentation using |s|^|s|-style normalization This is better than doing all segmentations possible at the same time. 
--- .../query_parser/QueryExpansion.java | 72 ++++++++++++++++--- .../query/svc/QueryFactoryTest.java | 17 +++++ 2 files changed, 78 insertions(+), 11 deletions(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java index 9c9d81fa..80d8c8f3 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java @@ -9,8 +9,7 @@ import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.apache.commons.lang3.StringUtils; -import java.util.ArrayList; -import java.util.List; +import java.util.*; import java.util.regex.Pattern; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -110,21 +109,72 @@ public class QueryExpansion { String[] words = nodes.stream().map(QWord::stemmed).toArray(String[]::new); - // Look for known segments within the query + // Grab all segments + + List allSegments = new ArrayList<>(); for (int length = 2; length < Math.min(10, words.length); length++) { - for (var segment : lexicon.findSegmentOffsets(length, words)) { + allSegments.addAll(lexicon.findSegmentOffsets(length, words)); + } + allSegments.sort(Comparator.comparing(NgramLexicon.SentenceSegment::start)); - int start = segment.start(); - int end = segment.start() + segment.length(); + if (allSegments.isEmpty()) { + return; + } - var word = IntStream.range(start, end) - .mapToObj(nodes::get) - .map(QWord::word) - .collect(Collectors.joining("_")); + Set bestSegmentation = + findBestSegmentation(allSegments); - graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word); + for (var segment : bestSegmentation) { + + int start = segment.start(); + int end = segment.start() + 
segment.length(); + + var word = IntStream.range(start, end) + .mapToObj(nodes::get) + .map(QWord::word) + .collect(Collectors.joining("_")); + + System.out.println(word); + + graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word); + } + + } + + private Set findBestSegmentation(List allSegments) { + Set bestSet = Set.of(); + double bestScore = Double.MIN_VALUE; + + for (int i = 0; i < allSegments.size(); i++) { + Set parts = new HashSet<>(); + parts.add(allSegments.get(i)); + + outer: + for (int j = i+1; j < allSegments.size(); j++) { + var candidate = allSegments.get(j); + for (var part : parts) { + if (part.overlaps(candidate)) { + continue outer; + } + } + parts.add(candidate); + } + + double score = 0.; + for (var part : parts) { + // |s|^|s|-normalization per M Hagen et al + double normFactor = Math.pow(part.count(), part.count()); + + score += normFactor * part.count(); + } + + if (bestScore < score) { + bestScore = score; + bestSet = parts; } } + + return bestSet; } public interface ExpansionStrategy { diff --git a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java index 132944c4..622130b7 100644 --- a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java +++ b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java @@ -16,6 +16,8 @@ import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; import java.util.List; import static org.junit.jupiter.api.Assertions.assertEquals; @@ -52,6 +54,21 @@ public class QueryFactoryTest { ResultRankingParameters.TemporalBias.NONE)).specs; } + + @Test + void qsec10() { + try (var lines = Files.lines(Path.of("/home/vlofgren/Exports/qsec10/webis-qsec-10-training-set/webis-qsec-10-training-set-queries.txt"))) { + lines.limit(1000).forEach(line -> { + String[] 
parts = line.split("\t"); + if (parts.length == 2) { + System.out.println(parseAndGetSpecs(parts[1]).getQuery().compiledQuery); + } + }); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + @Test public void testParseNoSpecials() { var year = parseAndGetSpecs("in the year 2000").year; From 150ee21f3c4eda5482523e0f67a986ffb4a3facd Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 12 Apr 2024 17:45:06 +0200 Subject: [PATCH 32/90] (ngram) Clean up ngram lexicon code This is both an optimization that removes some GC churn, as well as a clean-up of the code that removes references to outdated concepts. --- .../marginalia/segmentation/NgramLexicon.java | 82 +++++++++---------- .../segmentation/NgramLexiconTest.java | 5 +- 2 files changed, 44 insertions(+), 43 deletions(-) diff --git a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java index 7a6beeb8..5a82ab3e 100644 --- a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java +++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java @@ -4,7 +4,6 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import it.unimi.dsi.fastutil.longs.Long2IntOpenCustomHashMap; import it.unimi.dsi.fastutil.longs.LongHash; -import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import nu.marginalia.LanguageModels; import java.io.BufferedInputStream; @@ -45,55 +44,54 @@ public class NgramLexicon { counts = new Long2IntOpenCustomHashMap(100_000_000, new KeyIsAlreadyHashStrategy()); } - public List findSegmentsStrings(int minLength, int maxLength, String... parts) { + public List findSegmentsStrings(int minLength, + int maxLength, + String... 
parts) + { List segments = new ArrayList<>(); - for (int i = minLength; i <= maxLength; i++) { - segments.addAll(findSegments(i, parts)); - } - - return segments; - } - - public List findSegments(int length, String... parts) { - // Don't look for ngrams longer than the sentence - if (parts.length < length) return List.of(); - - List positions = new ArrayList<>(); - // Hash the parts long[] hashes = new long[parts.length]; for (int i = 0; i < hashes.length; i++) { hashes[i] = HasherGroup.hash(parts[i]); } - long ordered = 0; + for (int i = minLength; i <= maxLength; i++) { + findSegments(segments, i, parts, hashes); + } + + return segments; + } + + public void findSegments(List positions, + int length, + String[] parts, + long[] hashes) + { + // Don't look for ngrams longer than the sentence + if (parts.length < length) return; + + long hash = 0; int i = 0; // Prepare by combining up to length hashes for (; i < length; i++) { - ordered = orderedHasher.apply(ordered, hashes[i]); + hash = orderedHasher.apply(hash, hashes[i]); } // Slide the window and look for matches - for (;; i++) { - int ct = counts.get(ordered); - - if (ct > 0) { + for (;;) { + if (counts.get(hash) > 0) { positions.add(Arrays.copyOfRange(parts, i - length, i)); } - if (i >= hashes.length) + if (i < hashes.length) { + hash = orderedHasher.replace(hash, hashes[i], hashes[i - length], length); + i++; + } else { break; - - // Remove the oldest hash and add the new one - ordered = orderedHasher.replace(ordered, - hashes[i], - hashes[i - length], - length); + } } - - return positions; } public List findSegmentOffsets(int length, String... 
parts) { @@ -108,30 +106,28 @@ public class NgramLexicon { hashes[i] = HasherGroup.hash(parts[i]); } - long ordered = 0; + long hash = 0; int i = 0; // Prepare by combining up to length hashes for (; i < length; i++) { - ordered = orderedHasher.apply(ordered, hashes[i]); + hash = orderedHasher.apply(hash, hashes[i]); } // Slide the window and look for matches - for (;; i++) { - int ct = counts.get(ordered); + for (;;) { + int ct = counts.get(hash); if (ct > 0) { positions.add(new SentenceSegment(i - length, length, ct)); } - if (i >= hashes.length) + if (i < hashes.length) { + hash = orderedHasher.replace(hash, hashes[i], hashes[i - length], length); + i++; + } else { break; - - // Remove the oldest hash and add the new one - ordered = orderedHasher.replace(ordered, - hashes[i], - hashes[i - length], - length); + } } return positions; @@ -167,6 +163,10 @@ public class NgramLexicon { public String[] project(String... parts) { return Arrays.copyOfRange(parts, start, start + length); } + + public boolean overlaps(SentenceSegment other) { + return start < other.start + other.length && start + length > other.start; + } } private static class KeyIsAlreadyHashStrategy implements LongHash.Strategy { diff --git a/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java index 351ce869..f5068d07 100644 --- a/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java +++ b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java @@ -3,6 +3,8 @@ package nu.marginalia.segmentation; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import java.util.List; + import static org.junit.jupiter.api.Assertions.*; class NgramLexiconTest { @@ -22,8 +24,7 @@ class NgramLexiconTest { addNgram("rye", "bread"); addNgram("rye", "world"); - String[] sent = { "hello", "world", "rye", 
"bread" }; - var segments = lexicon.findSegments(2, "hello", "world", "rye", "bread"); + List segments = lexicon.findSegmentsStrings(2, 2, "hello", "world", "rye", "bread"); assertEquals(2, segments.size()); From 5531ed632a4df9c37f019a67ba0aa5070f871512 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 12 Apr 2024 17:45:26 +0200 Subject: [PATCH 33/90] (query, minor) Remove debug statement --- .../nu/marginalia/functions/searchquery/svc/QueryFactory.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java index 26af1bf4..15596d5c 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java @@ -68,8 +68,6 @@ public class QueryFactory { String domain = null; - System.out.println(basicQuery); - for (QueryToken t : basicQuery) { switch (t) { case QueryToken.QuotTerm(String str, String displayStr) -> { From e23359bae929da4be43c1ad2c5a58dff6468b328 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 12 Apr 2024 17:52:55 +0200 Subject: [PATCH 34/90] (query, minor) Remove debug statement --- .../functions/searchquery/query_parser/QueryExpansion.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java index 80d8c8f3..efdaf328 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java @@ -134,8 +134,6 @@ public class QueryExpansion { .map(QWord::word) .collect(Collectors.joining("_")); 
- System.out.println(word); - graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word); } From a0b3634cb6e5abc6c6027bccf49957b311a43507 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 12 Apr 2024 18:08:31 +0200 Subject: [PATCH 35/90] (ngram) Only extract frequencies of title words, but use the body to increment the counters... The sign of the counter is used to indicate whether a term has appeared as title. Until it's seen in the title, it's provisionally saved as a negative count. --- .../segmentation/NgramExtractorMain.java | 69 ++++++++----------- .../marginalia/segmentation/NgramLexicon.java | 25 +++++-- 2 files changed, 48 insertions(+), 46 deletions(-) diff --git a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java index 3f29c74c..02e2a881 100644 --- a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java +++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java @@ -8,10 +8,7 @@ import org.openzim.ZIMTypes.ZIMFile; import org.openzim.ZIMTypes.ZIMReader; import java.io.IOException; -import java.io.PrintWriter; -import java.nio.file.Files; import java.nio.file.Path; -import java.nio.file.StandardOpenOption; import java.util.ArrayList; import java.util.List; import java.util.concurrent.Executors; @@ -22,14 +19,20 @@ public class NgramExtractorMain { public static void main(String... 
args) { } - private static List getNgramTerms(String title, Document document) { + private static List getNgramTitleTerms(String title) { List terms = new ArrayList<>(); // Add the title - if (title.contains(" ")) { + if (title.contains(" ")) { // Only add multi-word titles since we're chasing ngrams terms.add(title.toLowerCase()); } + return cleanTerms(terms); + } + + private static List getNgramBodyTerms(Document document) { + List terms = new ArrayList<>(); + // Grab all internal links document.select("a[href]").forEach(e -> { var href = e.attr("href"); @@ -54,6 +57,10 @@ public class NgramExtractorMain { terms.add(text); }); + return cleanTerms(terms); + } + + private static List cleanTerms(List terms) { // Trim the discovered terms terms.replaceAll(s -> { @@ -85,35 +92,6 @@ public class NgramExtractorMain { return terms; } - public static void dumpNgramsList( - Path zimFile, - Path ngramFile - ) throws IOException, InterruptedException { - ZIMReader reader = new ZIMReader(new ZIMFile(zimFile.toString())); - - PrintWriter printWriter = new PrintWriter(Files.newOutputStream(ngramFile, - StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE)); - - LongOpenHashSet known = new LongOpenHashSet(); - - try (var executor = Executors.newWorkStealingPool()) { - reader.forEachArticles((title, body) -> { - executor.submit(() -> { - var terms = getNgramTerms(title, Jsoup.parse(body)); - synchronized (known) { - for (String term : terms) { - if (known.add(hash.hashNearlyASCII(term))) { - printWriter.println(term); - } - } - } - }); - - }, p -> true); - } - printWriter.close(); - } - public static void dumpCounts(Path zimInputFile, Path countsOutputFile ) throws IOException, InterruptedException @@ -123,24 +101,31 @@ public class NgramExtractorMain { NgramLexicon lexicon = new NgramLexicon(); var orderedHasher = HasherGroup.ordered(); - var unorderedHasher = HasherGroup.unordered(); try (var executor = Executors.newWorkStealingPool()) { 
reader.forEachArticles((title, body) -> { executor.submit(() -> { - LongArrayList orderedHashes = new LongArrayList(); - LongArrayList unorderedHashes = new LongArrayList(); + LongArrayList orderedHashesTitle = new LongArrayList(); + LongArrayList orderedHashesBody = new LongArrayList(); - for (var sent : getNgramTerms(title, Jsoup.parse(body))) { + for (var sent : getNgramTitleTerms(title)) { String[] terms = BasicSentenceExtractor.getStemmedParts(sent); - orderedHashes.add(orderedHasher.rollingHash(terms)); - unorderedHashes.add(unorderedHasher.rollingHash(terms)); + orderedHashesTitle.add(orderedHasher.rollingHash(terms)); + } + + for (var sent : getNgramBodyTerms(Jsoup.parse(body))) { + String[] terms = BasicSentenceExtractor.getStemmedParts(sent); + + orderedHashesBody.add(orderedHasher.rollingHash(terms)); } synchronized (lexicon) { - for (var hash : orderedHashes) { - lexicon.incOrdered(hash); + for (var hash : orderedHashesTitle) { + lexicon.incOrderedTitle(hash); + } + for (var hash : orderedHashesBody) { + lexicon.incOrderedBody(hash); } } }); diff --git a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java index 5a82ab3e..e831e25b 100644 --- a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java +++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java @@ -42,6 +42,7 @@ public class NgramLexicon { public NgramLexicon() { counts = new Long2IntOpenCustomHashMap(100_000_000, new KeyIsAlreadyHashStrategy()); + counts.defaultReturnValue(0); } public List findSegmentsStrings(int minLength, @@ -133,8 +134,22 @@ public class NgramLexicon { return positions; } - public void incOrdered(long hashOrdered) { - counts.addTo(hashOrdered, 1); + public void incOrderedTitle(long hashOrdered) { + int value = counts.get(hashOrdered); + + if (value < 0) value = -value + 1; + else value ++; + + 
counts.put(hashOrdered, value); + } + + public void incOrderedBody(long hashOrdered) { + int value = counts.get(hashOrdered); + + if (value <= 0) value --; + else value ++; + + counts.put(hashOrdered, value); } public void saveCounts(Path file) throws IOException { @@ -146,8 +161,10 @@ public class NgramLexicon { counts.forEach((k, v) -> { try { - dos.writeLong(k); - dos.writeInt(v); + if (v > 0) { + dos.writeLong(k); + dos.writeInt(v); + } } catch (IOException e) { throw new RuntimeException(e); } From cb505f98ef992cc8818e3c30acd04a3f20b6e018 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 13 Apr 2024 17:07:23 +0200 Subject: [PATCH 36/90] (ngram) Use simple blocking pool instead of FJP; split on underscores in article names. --- .../term-frequency-dict/build.gradle | 1 + .../segmentation/NgramExtractorMain.java | 63 ++++++++++--------- 2 files changed, 36 insertions(+), 28 deletions(-) diff --git a/code/libraries/term-frequency-dict/build.gradle b/code/libraries/term-frequency-dict/build.gradle index 67fb44ae..3a9a4d8d 100644 --- a/code/libraries/term-frequency-dict/build.gradle +++ b/code/libraries/term-frequency-dict/build.gradle @@ -23,6 +23,7 @@ dependencies { implementation project(':code:common:config') implementation project(':code:libraries:easy-lsh') implementation project(':code:libraries:array') + implementation project(':code:libraries:blocking-thread-pool') implementation libs.bundles.slf4j implementation libs.notnull diff --git a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java index 02e2a881..270117da 100644 --- a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java +++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java @@ -2,6 +2,7 @@ package nu.marginalia.segmentation; import it.unimi.dsi.fastutil.longs.*; import 
nu.marginalia.hash.MurmurHash3_128; +import nu.marginalia.util.SimpleBlockingThreadPool; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.openzim.ZIMTypes.ZIMFile; @@ -11,12 +12,12 @@ import java.io.IOException; import java.nio.file.Path; import java.util.ArrayList; import java.util.List; -import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; public class NgramExtractorMain { - static MurmurHash3_128 hash = new MurmurHash3_128(); - - public static void main(String... args) { + public static void main(String... args) throws IOException, InterruptedException { + dumpCounts(Path.of("/home/vlofgren/Exports/wikipedia_en_all_nopic_2024-02.zim"), + Path.of("/tmp/ngram-counts.bin")); } private static List getNgramTitleTerms(String title) { @@ -102,36 +103,42 @@ public class NgramExtractorMain { var orderedHasher = HasherGroup.ordered(); - try (var executor = Executors.newWorkStealingPool()) { - reader.forEachArticles((title, body) -> { - executor.submit(() -> { - LongArrayList orderedHashesTitle = new LongArrayList(); - LongArrayList orderedHashesBody = new LongArrayList(); + var pool = new SimpleBlockingThreadPool("ngram-extractor", + Math.clamp(2, Runtime.getRuntime().availableProcessors(), 32), + 32 + ); - for (var sent : getNgramTitleTerms(title)) { - String[] terms = BasicSentenceExtractor.getStemmedParts(sent); + reader.forEachArticles((title, body) -> { + pool.submitQuietly(() -> { + LongArrayList orderedHashesTitle = new LongArrayList(); + LongArrayList orderedHashesBody = new LongArrayList(); - orderedHashesTitle.add(orderedHasher.rollingHash(terms)); + String normalizedTitle = title.replace('_', ' '); + + for (var sent : getNgramTitleTerms(normalizedTitle)) { + String[] terms = BasicSentenceExtractor.getStemmedParts(sent); + orderedHashesTitle.add(orderedHasher.rollingHash(terms)); + } + + for (var sent : getNgramBodyTerms(Jsoup.parse(body))) { + String[] terms = BasicSentenceExtractor.getStemmedParts(sent); + 
orderedHashesBody.add(orderedHasher.rollingHash(terms)); + } + + synchronized (lexicon) { + for (var hash : orderedHashesTitle) { + lexicon.incOrderedTitle(hash); } - - for (var sent : getNgramBodyTerms(Jsoup.parse(body))) { - String[] terms = BasicSentenceExtractor.getStemmedParts(sent); - - orderedHashesBody.add(orderedHasher.rollingHash(terms)); + for (var hash : orderedHashesBody) { + lexicon.incOrderedBody(hash); } + } + }); - synchronized (lexicon) { - for (var hash : orderedHashesTitle) { - lexicon.incOrderedTitle(hash); - } - for (var hash : orderedHashesBody) { - lexicon.incOrderedBody(hash); - } - } - }); + }, p -> true); - }, p -> true); - } + pool.shutDown(); + pool.awaitTermination(10, TimeUnit.DAYS); lexicon.saveCounts(countsOutputFile); } From afc4fed591ff99c99847b8d71c8ef9aec0c2fbf4 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 13 Apr 2024 17:51:02 +0200 Subject: [PATCH 37/90] (ngram) Correct size value in ngram lexicon generation, trim the terms better --- .../segmentation/NgramExtractorMain.java | 17 +++++++++---- .../marginalia/segmentation/NgramLexicon.java | 24 ++++++++++++++----- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java index 270117da..f6ba5b08 100644 --- a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java +++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java @@ -1,7 +1,6 @@ package nu.marginalia.segmentation; import it.unimi.dsi.fastutil.longs.*; -import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.util.SimpleBlockingThreadPool; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; @@ -16,8 +15,6 @@ import java.util.concurrent.TimeUnit; public class NgramExtractorMain { public static void main(String... 
args) throws IOException, InterruptedException { - dumpCounts(Path.of("/home/vlofgren/Exports/wikipedia_en_all_nopic_2024-02.zim"), - Path.of("/tmp/ngram-counts.bin")); } private static List getNgramTitleTerms(String title) { @@ -64,7 +61,6 @@ public class NgramExtractorMain { private static List cleanTerms(List terms) { // Trim the discovered terms terms.replaceAll(s -> { - // Remove trailing parentheses and their contents if (s.endsWith(")")) { int idx = s.lastIndexOf('('); @@ -73,6 +69,10 @@ public class NgramExtractorMain { } } + return s; + }); + + terms.replaceAll(s -> { // Remove leading "list of " if (s.startsWith("list of ")) { return s.substring("list of ".length()); @@ -81,6 +81,15 @@ public class NgramExtractorMain { return s; }); + terms.replaceAll(s -> { + // Remove trailing punctuation + if (s.endsWith(".") || s.endsWith(",") || s.endsWith(":") || s.endsWith(";")) { + return s.substring(0, s.length() - 1); + } + + return s; + }); + // Remove terms that are too short or too long terms.removeIf(s -> { if (!s.contains(" ")) diff --git a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java index e831e25b..9b59a84f 100644 --- a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java +++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramLexicon.java @@ -21,6 +21,7 @@ import java.util.List; public class NgramLexicon { private final Long2IntOpenCustomHashMap counts; + private int size; private static final HasherGroup orderedHasher = HasherGroup.ordered(); @Inject @@ -31,9 +32,15 @@ public class NgramLexicon { (int) size, new KeyIsAlreadyHashStrategy() ); + counts.defaultReturnValue(0); - for (int i = 0; i < size; i++) { - counts.put(dis.readLong(), dis.readInt()); + try { + for (int i = 0; i < size; i++) { + counts.put(dis.readLong(), dis.readInt()); + } + } + catch (IOException ex) { + 
ex.printStackTrace(); } } catch (IOException e) { throw new RuntimeException(e); @@ -137,8 +144,12 @@ public class NgramLexicon { public void incOrderedTitle(long hashOrdered) { int value = counts.get(hashOrdered); - if (value < 0) value = -value + 1; - else value ++; + if (value <= 0) { + size ++; + value = -value; + } + + value ++; counts.put(hashOrdered, value); } @@ -147,7 +158,7 @@ public class NgramLexicon { int value = counts.get(hashOrdered); if (value <= 0) value --; - else value ++; + else value++; counts.put(hashOrdered, value); } @@ -157,7 +168,8 @@ public class NgramLexicon { StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE))) { - dos.writeInt(counts.size()); + + dos.writeInt(size); counts.forEach((k, v) -> { try { From 5f6a3ef9d0b7629fde4d4296b7a969ae0ab8fdf8 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 13 Apr 2024 18:05:30 +0200 Subject: [PATCH 38/90] (ngram) Correct |s|^|s|-normalization to use length and not count --- .../functions/searchquery/query_parser/QueryExpansion.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java index efdaf328..d4e324fa 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java @@ -161,7 +161,7 @@ public class QueryExpansion { double score = 0.; for (var part : parts) { // |s|^|s|-normalization per M Hagen et al - double normFactor = Math.pow(part.count(), part.count()); + double normFactor = Math.pow(part.length(), part.length()); score += normFactor * part.count(); } From 0da03d4cfca4e3d7c5afd8fca76ee652e7c3d0cd Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 13 Apr 2024 
19:33:47 +0200 Subject: [PATCH 39/90] (zim) Fix title extractor --- .../java/org/openzim/ZIMTypes/ZIMReader.java | 23 ++----------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/third-party/openzim/src/main/java/org/openzim/ZIMTypes/ZIMReader.java b/third-party/openzim/src/main/java/org/openzim/ZIMTypes/ZIMReader.java index e2fcaf6e..e9b5cf47 100644 --- a/third-party/openzim/src/main/java/org/openzim/ZIMTypes/ZIMReader.java +++ b/third-party/openzim/src/main/java/org/openzim/ZIMTypes/ZIMReader.java @@ -275,9 +275,7 @@ public class ZIMReader { } - - // Gives the minimum required information needed for the given articleName - public DirectoryEntry forEachTitles(Consumer aeConsumer, Consumer reConsumer) + public DirectoryEntry forEachTitles(Consumer titleConsumer) throws IOException { int numberOfArticles = mFile.getArticleCount(); @@ -287,26 +285,9 @@ public class ZIMReader { System.err.println(numberOfArticles); long start = System.currentTimeMillis(); - Map> data = new TreeMap<>(); - - System.err.println("Indexing"); - for (long i = beg; i < end; i+=4) { var entry = getDirectoryInfoAtTitlePosition(i); - - if (((i-beg)%100_000) == 0) { - System.err.printf("%f%%\n", ((i-beg) * 100.) 
/ (end-beg)); - } - - if (entry.mimeType == targetMime && entry instanceof ArticleEntry) { - aeConsumer.accept((ArticleEntry) entry); - } - else if (entry.mimeType == 65535 && entry instanceof RedirectEntry) { - - reConsumer.accept((RedirectEntry) entry); - - } - + titleConsumer.accept(entry.title); } return null; From f3255e080d4ed872c292e5d9a9ecd34b7c8d53e6 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sat, 13 Apr 2024 19:34:16 +0200 Subject: [PATCH 40/90] (ngram) Grab titles separately when extracting ngrams from wiki data --- .../segmentation/NgramExtractorMain.java | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java index f6ba5b08..b0eb6916 100644 --- a/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java +++ b/code/libraries/term-frequency-dict/java/nu/marginalia/segmentation/NgramExtractorMain.java @@ -117,10 +117,9 @@ public class NgramExtractorMain { 32 ); - reader.forEachArticles((title, body) -> { + reader.forEachTitles((title) -> { pool.submitQuietly(() -> { LongArrayList orderedHashesTitle = new LongArrayList(); - LongArrayList orderedHashesBody = new LongArrayList(); String normalizedTitle = title.replace('_', ' '); @@ -128,6 +127,18 @@ public class NgramExtractorMain { String[] terms = BasicSentenceExtractor.getStemmedParts(sent); orderedHashesTitle.add(orderedHasher.rollingHash(terms)); } + synchronized (lexicon) { + for (var hash : orderedHashesTitle) { + lexicon.incOrderedTitle(hash); + } + } + }); + + }); + + reader.forEachArticles((title, body) -> { + pool.submitQuietly(() -> { + LongArrayList orderedHashesBody = new LongArrayList(); for (var sent : getNgramBodyTerms(Jsoup.parse(body))) { String[] terms = BasicSentenceExtractor.getStemmedParts(sent); @@ -135,9 +146,6 @@ public class 
NgramExtractorMain { } synchronized (lexicon) { - for (var hash : orderedHashesTitle) { - lexicon.incOrderedTitle(hash); - } for (var hash : orderedHashesBody) { lexicon.incOrderedBody(hash); } From 6efc0f21fe97dd3629b092da565fedf707dea73e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 15 Apr 2024 16:04:07 +0200 Subject: [PATCH 41/90] (index) Clean up data model The change set cleans up the data model for the term-level data. This used to contain a bunch of fields with document-level metadata. This data-duplication means a larger memory footprint and worse memory locality. The ranking code is also modified to not accept SearchResultKeywordScores, but rather CompiledQueryLong and CqDataInts containing only the term metadata and the frequency information needed for ranking. This is again an effort to improve memory locality. --- .../nu/marginalia/model/idx/WordFlags.java | 5 + .../api/searchquery/QueryProtobufCodec.java | 6 +- .../model/compiled/CompiledQuery.java | 4 + .../model/compiled/CompiledQueryInt.java | 44 ++++++ .../model/compiled/CompiledQueryLong.java | 8 +- .../searchquery/model/compiled/CqData.java | 11 +- .../searchquery/model/compiled/CqDataInt.java | 31 +++++ .../aggregate/CompiledQueryAggregates.java | 17 ++- .../aggregate/CqBooleanAggregate.java | 6 + .../aggregate/CqDoubleSumOperator.java | 6 + .../aggregate/CqIntMaxMinOperator.java | 6 + .../aggregate/CqLongBitmaskOperator.java | 5 + .../aggregate/CqPositionsOperator.java | 6 + .../model/results/ResultRankingContext.java | 30 ++--- .../model/results/SearchResultItem.java | 11 +- .../results/SearchResultKeywordScore.java | 39 +----- .../api/src/main/protobuf/query-api.proto | 8 +- .../nu/marginalia/index/IndexGrpcService.java | 27 ++-- .../results/IndexResultValuationContext.java | 58 ++++---- .../results/IndexResultValuatorService.java | 38 ++++-- .../ranking/results/ResultValuator.java | 32 ++--- .../ranking/results/factors/Bm25Factor.java | 113 ---------------- 
.../results/factors/Bm25FullGraphVisitor.java | 81 +++++++++++ .../results/factors/Bm25PrioGraphVisitor.java | 127 ++++++++++++++++++ .../results/factors/TermCoherenceFactor.java | 8 +- ...IndexQueryServiceIntegrationSmokeTest.java | 4 +- .../IndexResultDomainDeduplicatorTest.java | 2 +- .../ranking/results/ResultValuatorTest.java | 49 +++---- .../factors/TermCoherenceFactorTest.java | 19 ++- .../segmentation/NgramLexiconTest.java | 2 +- .../search/model/ClusteredUrlDetails.java | 2 +- 31 files changed, 520 insertions(+), 285 deletions(-) create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryInt.java create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataInt.java delete mode 100644 code/index/java/nu/marginalia/ranking/results/factors/Bm25Factor.java create mode 100644 code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java create mode 100644 code/index/java/nu/marginalia/ranking/results/factors/Bm25PrioGraphVisitor.java diff --git a/code/common/model/java/nu/marginalia/model/idx/WordFlags.java b/code/common/model/java/nu/marginalia/model/idx/WordFlags.java index dc627715..db54df77 100644 --- a/code/common/model/java/nu/marginalia/model/idx/WordFlags.java +++ b/code/common/model/java/nu/marginalia/model/idx/WordFlags.java @@ -50,6 +50,10 @@ public enum WordFlags { return (asBit() & value) > 0; } + public boolean isAbsent(long value) { + return (asBit() & value) == 0; + } + public static EnumSet decode(long encodedValue) { EnumSet ret = EnumSet.noneOf(WordFlags.class); @@ -61,4 +65,5 @@ public enum WordFlags { return ret; } + } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java index b705917e..5a43df1b 100644 --- 
a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java @@ -134,6 +134,8 @@ public class QueryProtobufCodec { return new SearchResultItem( rawItem.getCombinedId(), + rawItem.getEncodedDocMetadata(), + rawItem.getHtmlFeatures(), keywordScores, rawItem.getResultsFromDomain(), Double.NaN // Not set @@ -144,9 +146,7 @@ public class QueryProtobufCodec { return new SearchResultKeywordScore( keywordScores.getKeyword(), -1, // termId is internal to index service - keywordScores.getEncodedWordMetadata(), - keywordScores.getEncodedDocMetadata(), - keywordScores.getHtmlFeatures() + keywordScores.getEncodedWordMetadata() ); } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java index 3ae850a3..356a1d86 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java @@ -46,6 +46,10 @@ public class CompiledQuery implements Iterable { return new CompiledQueryLong(root, data.mapToLong(mapper)); } + public CompiledQueryLong mapToInt(ToIntFunction mapper) { + return new CompiledQueryLong(root, data.mapToInt(mapper)); + } + public CqExpression root() { return root; } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryInt.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryInt.java new file mode 100644 index 00000000..9e26c35c --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryInt.java @@ -0,0 +1,44 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import 
java.util.stream.IntStream; + + +/** A compiled index service query */ +public class CompiledQueryInt { + private final CqExpression root; + private final CqDataInt data; + + public CompiledQueryInt(CqExpression root, CqDataInt data) { + this.root = root; + this.data = data; + } + + + public CqExpression root() { + return root; + } + + public IntStream stream() { + return data.stream(); + } + + public IntStream indices() { + return IntStream.range(0, data.size()); + } + + public long at(int index) { + return data.get(index); + } + + public int[] copyData() { + return data.copyData(); + } + + public boolean isEmpty() { + return data.size() == 0; + } + + public int size() { + return data.size(); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java index 94fa0e8b..718aaca7 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryLong.java @@ -9,8 +9,8 @@ import java.util.stream.LongStream; /** A compiled index service query */ public class CompiledQueryLong implements Iterable { - private final CqExpression root; - private final CqDataLong data; + public final CqExpression root; + public final CqDataLong data; public CompiledQueryLong(CqExpression root, CqDataLong data) { this.root = root; @@ -47,4 +47,8 @@ public class CompiledQueryLong implements Iterable { public boolean isEmpty() { return data.size() == 0; } + + public int size() { + return data.size(); + } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java index b1565dc0..145f3f0f 100644 --- 
a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java @@ -3,7 +3,7 @@ package nu.marginalia.api.searchquery.model.compiled; import java.lang.reflect.Array; import java.util.Arrays; import java.util.function.Function; -import java.util.function.ToDoubleFunction; +import java.util.function.ToIntFunction; import java.util.function.ToLongFunction; import java.util.stream.Stream; @@ -33,6 +33,15 @@ public class CqData { return new CqDataLong(newData); } + public CqDataLong mapToInt(ToIntFunction mapper) { + long[] newData = new long[data.length]; + for (int i = 0; i < data.length; i++) { + newData[i] = mapper.applyAsInt((T) data[i]); + } + + return new CqDataLong(newData); + } + public T get(int i) { return data[i]; } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataInt.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataInt.java new file mode 100644 index 00000000..24991686 --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqDataInt.java @@ -0,0 +1,31 @@ +package nu.marginalia.api.searchquery.model.compiled; + +import java.util.Arrays; +import java.util.stream.IntStream; + +public class CqDataInt { + private final int[] data; + + public CqDataInt(int[] data) { + this.data = data; + } + + public int get(int i) { + return data[i]; + } + public int get(CqExpression.Word w) { + return data[w.idx()]; + } + + public IntStream stream() { + return Arrays.stream(data); + } + + public int size() { + return data.length; + } + + public int[] copyData() { + return Arrays.copyOf(data, data.length); + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java 
b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java index 0ab0647d..7e8ca8ec 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java @@ -17,6 +17,9 @@ public class CompiledQueryAggregates { static public boolean booleanAggregate(CompiledQuery query, Predicate predicate) { return query.root.visit(new CqBooleanAggregate(query, predicate)); } + static public boolean booleanAggregate(CompiledQueryLong query, LongPredicate predicate) { + return query.root.visit(new CqBooleanAggregate(query, predicate)); + } /** Compiled query aggregate that for a 64b bitmask that treats or-branches as logical OR, @@ -25,13 +28,20 @@ public class CompiledQueryAggregates { public static long longBitmaskAggregate(CompiledQuery query, ToLongFunction operator) { return query.root.visit(new CqLongBitmaskOperator(query, operator)); } - + public static long longBitmaskAggregate(CompiledQueryLong query, LongUnaryOperator operator) { + return query.root.visit(new CqLongBitmaskOperator(query, operator)); + } /** Apply the operator to each leaf node, then return the highest minimum value found along any path */ public static int intMaxMinAggregate(CompiledQuery query, ToIntFunction operator) { return query.root.visit(new CqIntMaxMinOperator(query, operator)); } + /** Apply the operator to each leaf node, then return the highest minimum value found along any path */ + public static int intMaxMinAggregate(CompiledQueryLong query, LongToIntFunction operator) { + return query.root.visit(new CqIntMaxMinOperator(query, operator)); + } + /** Apply the operator to each leaf node, and then return the highest sum of values possible * through each branch in the compiled query. 
* @@ -49,4 +59,9 @@ public class CompiledQueryAggregates { public static LongSet positionsAggregate(CompiledQuery query, ToLongFunction operator) { return query.root().visit(new CqPositionsOperator(query, operator)); } + + /** Using the bitwise AND operator, aggregate all possible combined values of the long generated by the provided operator */ + public static LongSet positionsAggregate(CompiledQueryLong query, LongUnaryOperator operator) { + return query.root().visit(new CqPositionsOperator(query, operator)); + } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqBooleanAggregate.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqBooleanAggregate.java index 05ebf4c7..2a87ec79 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqBooleanAggregate.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqBooleanAggregate.java @@ -1,10 +1,12 @@ package nu.marginalia.api.searchquery.model.compiled.aggregate; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CqExpression; import java.util.List; import java.util.function.IntPredicate; +import java.util.function.LongPredicate; import java.util.function.Predicate; public class CqBooleanAggregate implements CqExpression.BoolVisitor { @@ -15,6 +17,10 @@ public class CqBooleanAggregate implements CqExpression.BoolVisitor { this.predicate = idx -> objPred.test(query.at(idx)); } + public CqBooleanAggregate(CompiledQueryLong query, LongPredicate longPredicate) { + this.predicate = idx -> longPredicate.test(query.at(idx)); + } + @Override public boolean onAnd(List parts) { for (var part : parts) { diff --git 
a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqDoubleSumOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqDoubleSumOperator.java index 23d1904e..082de29e 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqDoubleSumOperator.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqDoubleSumOperator.java @@ -1,10 +1,12 @@ package nu.marginalia.api.searchquery.model.compiled.aggregate; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CqExpression; import java.util.List; import java.util.function.IntToDoubleFunction; +import java.util.function.LongToDoubleFunction; import java.util.function.ToDoubleFunction; public class CqDoubleSumOperator implements CqExpression.DoubleVisitor { @@ -15,6 +17,10 @@ public class CqDoubleSumOperator implements CqExpression.DoubleVisitor { this.operator = idx -> operator.applyAsDouble(query.at(idx)); } + public CqDoubleSumOperator(IntToDoubleFunction operator) { + this.operator = operator; + } + @Override public double onAnd(List parts) { double value = 0; diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java index b3ec86bb..621dff73 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java @@ -1,10 +1,12 @@ package nu.marginalia.api.searchquery.model.compiled.aggregate; import 
nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CqExpression; import java.util.List; import java.util.function.IntUnaryOperator; +import java.util.function.LongToIntFunction; import java.util.function.ToIntFunction; public class CqIntMaxMinOperator implements CqExpression.IntVisitor { @@ -16,6 +18,10 @@ public class CqIntMaxMinOperator implements CqExpression.IntVisitor { this.operator = idx -> operator.applyAsInt(query.at(idx)); } + public CqIntMaxMinOperator(CompiledQueryLong query, LongToIntFunction operator) { + this.operator = idx -> operator.applyAsInt(query.at(idx)); + } + @Override public int onAnd(List parts) { int value = parts.getFirst().visit(this); diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqLongBitmaskOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqLongBitmaskOperator.java index d9a4804b..b64029c1 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqLongBitmaskOperator.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqLongBitmaskOperator.java @@ -1,10 +1,12 @@ package nu.marginalia.api.searchquery.model.compiled.aggregate; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CqExpression; import java.util.List; import java.util.function.IntToLongFunction; +import java.util.function.LongUnaryOperator; import java.util.function.ToLongFunction; public class CqLongBitmaskOperator implements CqExpression.LongVisitor { @@ -14,6 +16,9 @@ public class CqLongBitmaskOperator implements CqExpression.LongVisitor { public CqLongBitmaskOperator(CompiledQuery query, 
ToLongFunction operator) { this.operator = idx-> operator.applyAsLong(query.at(idx)); } + public CqLongBitmaskOperator(CompiledQueryLong query, LongUnaryOperator operator) { + this.operator = idx-> operator.applyAsLong(query.at(idx)); + } @Override public long onAnd(List parts) { diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java index 19db2d4b..715c4cb2 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java @@ -4,10 +4,12 @@ import it.unimi.dsi.fastutil.longs.LongArraySet; import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import it.unimi.dsi.fastutil.longs.LongSet; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CqExpression; import java.util.List; import java.util.function.IntToLongFunction; +import java.util.function.LongUnaryOperator; import java.util.function.ToLongFunction; public class CqPositionsOperator implements CqExpression.ObjectVisitor { @@ -17,6 +19,10 @@ public class CqPositionsOperator implements CqExpression.ObjectVisitor this.operator = idx -> operator.applyAsLong(query.at(idx)); } + public CqPositionsOperator(CompiledQueryLong query, LongUnaryOperator operator) { + this.operator = idx -> operator.applyAsLong(query.at(idx)); + } + @Override public LongSet onAnd(List parts) { LongSet ret = new LongArraySet(); diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java 
b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java index f0ad172f..9052345a 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java @@ -1,38 +1,34 @@ package nu.marginalia.api.searchquery.model.results; -import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; import lombok.ToString; - -import java.util.Map; +import nu.marginalia.api.searchquery.model.compiled.CqDataInt; @ToString public class ResultRankingContext { private final int docCount; public final ResultRankingParameters params; - private final Object2IntOpenHashMap fullCounts = new Object2IntOpenHashMap<>(10, 0.5f); - private final Object2IntOpenHashMap priorityCounts = new Object2IntOpenHashMap<>(10, 0.5f); + /** CqDataInt associated with frequency information of the terms in the query + * in the full index. The dataset is indexed by the compiled query. */ + public final CqDataInt fullCounts; + + /** CqDataInt associated with frequency information of the terms in the query + * in the full index. The dataset is indexed by the compiled query. 
*/ + public final CqDataInt priorityCounts; public ResultRankingContext(int docCount, ResultRankingParameters params, - Map fullCounts, - Map prioCounts - ) { + CqDataInt fullCounts, + CqDataInt prioCounts) + { this.docCount = docCount; this.params = params; - this.fullCounts.putAll(fullCounts); - this.priorityCounts.putAll(prioCounts); + this.fullCounts = fullCounts; + this.priorityCounts = prioCounts; } public int termFreqDocCount() { return docCount; } - public int frequency(String keyword) { - return fullCounts.getOrDefault(keyword, 1); - } - - public int priorityFrequency(String keyword) { - return priorityCounts.getOrDefault(keyword, 1); - } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java index 8f50c9fb..7cd95b96 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java @@ -15,15 +15,24 @@ public class SearchResultItem implements Comparable { * probably not what you want, use getDocumentId() instead */ public final long combinedId; + /** Encoded document metadata */ + public final long encodedDocMetadata; + + /** Encoded html features of document */ + + public final int htmlFeatures; + /** How did the subqueries match against the document ? 
*/ public final List keywordScores; /** How many other potential results existed in the same domain */ public int resultsFromDomain; - public SearchResultItem(long combinedId) { + public SearchResultItem(long combinedId, long encodedDocMetadata, int htmlFeatures) { this.combinedId = combinedId; + this.encodedDocMetadata = encodedDocMetadata; this.keywordScores = new ArrayList<>(); + this.htmlFeatures = htmlFeatures; } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java index a0fd2156..212b2302 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java @@ -2,7 +2,6 @@ package nu.marginalia.api.searchquery.model.results; import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.WordMetadata; -import nu.marginalia.model.idx.DocumentMetadata; import java.util.Objects; @@ -10,34 +9,20 @@ public final class SearchResultKeywordScore { public final long termId; public final String keyword; private final long encodedWordMetadata; - private final long encodedDocMetadata; - - private final int htmlFeatures; public SearchResultKeywordScore(String keyword, long termId, - long encodedWordMetadata, - long encodedDocMetadata, - int htmlFeatures) { + long encodedWordMetadata) { this.termId = termId; this.keyword = keyword; this.encodedWordMetadata = encodedWordMetadata; - this.encodedDocMetadata = encodedDocMetadata; - this.htmlFeatures = htmlFeatures; } public boolean hasTermFlag(WordFlags flag) { return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit()); } - public int positionCount() { - return Long.bitCount(positions()); - } - @Deprecated // FIXME 2024-04-06 - public int subquery() { - return -1; 
- } public long positions() { return WordMetadata.decodePositions(encodedWordMetadata); } @@ -46,44 +31,28 @@ public final class SearchResultKeywordScore { return keyword.contains(":") || hasTermFlag(WordFlags.Synthetic); } - public boolean isKeywordRegular() { - return !keyword.contains(":") - && !hasTermFlag(WordFlags.Synthetic); - } - public long encodedWordMetadata() { return encodedWordMetadata; } - public long encodedDocMetadata() { - return encodedDocMetadata; - } - - public int htmlFeatures() { - return htmlFeatures; - } - @Override public boolean equals(Object obj) { if (obj == this) return true; if (obj == null || obj.getClass() != this.getClass()) return false; var that = (SearchResultKeywordScore) obj; - return Objects.equals(this.keyword, that.keyword) && - this.encodedWordMetadata == that.encodedWordMetadata && - this.encodedDocMetadata == that.encodedDocMetadata; + return Objects.equals(this.termId, that.termId); } @Override public int hashCode() { - return Objects.hash(keyword, encodedWordMetadata, encodedDocMetadata); + return Objects.hash(termId); } @Override public String toString() { return "SearchResultKeywordScore[" + "keyword=" + keyword + ", " + - "encodedWordMetadata=" + new WordMetadata(encodedWordMetadata) + ", " + - "encodedDocMetadata=" + new DocumentMetadata(encodedDocMetadata) + ']'; + "encodedWordMetadata=" + new WordMetadata(encodedWordMetadata) + ']'; } } diff --git a/code/functions/search-query/api/src/main/protobuf/query-api.proto b/code/functions/search-query/api/src/main/protobuf/query-api.proto index df25c494..3094699b 100644 --- a/code/functions/search-query/api/src/main/protobuf/query-api.proto +++ b/code/functions/search-query/api/src/main/protobuf/query-api.proto @@ -98,16 +98,16 @@ message RpcDecoratedResultItem { message RpcRawResultItem { int64 combinedId = 1; // raw ID with bit-encoded ranking information still present int32 resultsFromDomain = 2; // number of other results from the same domain - repeated 
RpcResultKeywordScore keywordScores = 3; + int64 encodedDocMetadata = 3; // bit encoded document metadata + int32 htmlFeatures = 4; // bitmask encoding features of the document + repeated RpcResultKeywordScore keywordScores = 5; } /* Information about how well a keyword matches a query */ message RpcResultKeywordScore { string keyword = 1; // the keyword int64 encodedWordMetadata = 2; // bit encoded word metadata - int64 encodedDocMetadata = 3; // bit encoded document metadata - bool hasPriorityTerms = 4; // true if this word is important to the document - int32 htmlFeatures = 5; // bit encoded document features + bool hasPriorityTerms = 3; // true if this word is important to the document } /* Query execution parameters */ diff --git a/code/index/java/nu/marginalia/index/IndexGrpcService.java b/code/index/java/nu/marginalia/index/IndexGrpcService.java index 36b611ff..fa0a8343 100644 --- a/code/index/java/nu/marginalia/index/IndexGrpcService.java +++ b/code/index/java/nu/marginalia/index/IndexGrpcService.java @@ -11,6 +11,7 @@ import lombok.SneakyThrows; import nu.marginalia.api.searchquery.*; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.CqDataInt; import nu.marginalia.api.searchquery.model.query.SearchSpecification; import nu.marginalia.api.searchquery.model.results.*; import nu.marginalia.array.buffer.LongQueryBuffer; @@ -135,14 +136,14 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { var rawItem = RpcRawResultItem.newBuilder(); rawItem.setCombinedId(rawResult.combinedId); rawItem.setResultsFromDomain(rawResult.resultsFromDomain); + rawItem.setHtmlFeatures(rawResult.htmlFeatures); + rawItem.setEncodedDocMetadata(rawResult.encodedDocMetadata); for (var score : rawResult.keywordScores) { rawItem.addKeywordScores( RpcResultKeywordScore.newBuilder() - 
.setEncodedDocMetadata(score.encodedDocMetadata()) .setEncodedWordMetadata(score.encodedWordMetadata()) .setKeyword(score.keyword) - .setHtmlFeatures(score.htmlFeatures()) ); } @@ -203,9 +204,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { return new SearchResultSet(List.of()); } - ResultRankingContext rankingContext = createRankingContext(params.rankingParams, - params.compiledQuery, - params.compiledQueryIds); + ResultRankingContext rankingContext = createRankingContext(params.rankingParams, params.compiledQueryIds); var queryExecution = new QueryExecution(rankingContext, params.fetchSize); @@ -414,22 +413,22 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { } private ResultRankingContext createRankingContext(ResultRankingParameters rankingParams, - CompiledQuery query, CompiledQueryLong compiledQueryIds) { - Map termToId = new HashMap<>(query.size()); - query.indices().forEach(id -> termToId.put(query.at(id), compiledQueryIds.at(id))); - final Map termFrequencies = new HashMap<>(termToId.size()); - final Map prioFrequencies = new HashMap<>(termToId.size()); + int[] full = new int[compiledQueryIds.size()]; + int[] prio = new int[compiledQueryIds.size()]; - termToId.forEach((key, id) -> termFrequencies.put(key, index.getTermFrequency(id))); - termToId.forEach((key, id) -> prioFrequencies.put(key, index.getTermFrequencyPrio(id))); + for (int idx = 0; idx < compiledQueryIds.size(); idx++) { + long id = compiledQueryIds.at(idx); + full[idx] = index.getTermFrequency(id); + prio[idx] = index.getTermFrequencyPrio(id); + } return new ResultRankingContext(index.getTotalDocCount(), rankingParams, - termFrequencies, - prioFrequencies); + new CqDataInt(full), + new CqDataInt(prio)); } } diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java index 3777cf4f..89b4c543 100644 --- 
a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java @@ -1,7 +1,6 @@ package nu.marginalia.index.results; -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; -import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.*; import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.SearchResultItem; @@ -70,39 +69,42 @@ public class IndexResultValuationContext { long docMetadata = statefulIndex.getDocumentMetadata(docId); int htmlFeatures = statefulIndex.getHtmlFeatures(docId); - SearchResultItem searchResult = new SearchResultItem(docId); + SearchResultItem searchResult = new SearchResultItem(docId, docMetadata, htmlFeatures); + + long[] wordMetas = new long[compiledQuery.size()]; + SearchResultKeywordScore[] scores = new SearchResultKeywordScore[compiledQuery.size()]; + + for (int i = 0; i < wordMetas.length; i++) { + final long termId = compiledQueryIds.at(i); + final String term = compiledQuery.at(i); + + wordMetas[i] = termMetadataForCombinedDocumentIds.getTermMetadata(termId, combinedId); + scores[i] = new SearchResultKeywordScore(term, termId, wordMetas[i]); + } - SearchResultKeywordScore[] scores = compiledQuery.indices().mapToObj(idx -> - new SearchResultKeywordScore( - compiledQuery.at(idx), - compiledQueryIds.at(idx), - termMetadataForCombinedDocumentIds.getTermMetadata( - compiledQueryIds.at(idx), combinedId - ), - docMetadata, - htmlFeatures) - ) - .toArray(SearchResultKeywordScore[]::new); // DANGER: IndexResultValuatorService assumes that searchResult.keywordScores has this specific order, as it needs // to be able to re-construct its own CompiledQuery for re-ranking the results. This is // a very flimsy assumption. 
searchResult.keywordScores.addAll(List.of(scores)); - CompiledQuery queryGraphScores = new CompiledQuery<>(compiledQuery.root, scores); + CompiledQueryLong wordMetasQuery = new CompiledQueryLong(compiledQuery.root, new CqDataLong(wordMetas)); - boolean allSynthetic = !CompiledQueryAggregates.booleanAggregate(queryGraphScores, score -> !score.hasTermFlag(WordFlags.Synthetic)); - int flagsCount = CompiledQueryAggregates.intMaxMinAggregate(queryGraphScores, score -> Long.bitCount(score.encodedWordMetadata() & flagsFilterMask)); - int positionsCount = CompiledQueryAggregates.intMaxMinAggregate(queryGraphScores, SearchResultKeywordScore::positionCount); + boolean allSynthetic = !CompiledQueryAggregates.booleanAggregate(wordMetasQuery, WordFlags.Synthetic::isAbsent); + int flagsCount = CompiledQueryAggregates.intMaxMinAggregate(wordMetasQuery, wordMeta -> Long.bitCount(wordMeta & flagsFilterMask)); + int positionsCount = CompiledQueryAggregates.intMaxMinAggregate(wordMetasQuery, wordMeta -> Long.bitCount(WordMetadata.decodePositions(wordMeta))); - if (!meetsQueryStrategyRequirements(queryGraphScores, queryParams.queryStrategy())) { + if (!meetsQueryStrategyRequirements(wordMetasQuery, queryParams.queryStrategy())) { return null; } if (flagsCount == 0 && !allSynthetic && positionsCount == 0) return null; - double score = searchResultValuator.calculateSearchResultValue(queryGraphScores, + double score = searchResultValuator.calculateSearchResultValue( + wordMetasQuery, + docMetadata, + htmlFeatures, 5000, // use a dummy value here as it's not present in the index rankingContext); @@ -111,7 +113,7 @@ public class IndexResultValuationContext { return searchResult; } - private boolean meetsQueryStrategyRequirements(CompiledQuery queryGraphScores, + private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores, QueryStrategy queryStrategy) { if (queryStrategy == QueryStrategy.AUTO || @@ -124,24 +126,24 @@ public class IndexResultValuationContext { docs -> 
meetsQueryStrategyRequirements(docs, queryParams.queryStrategy())); } - private boolean meetsQueryStrategyRequirements(SearchResultKeywordScore termScore, QueryStrategy queryStrategy) { + private boolean meetsQueryStrategyRequirements(long wordMeta, QueryStrategy queryStrategy) { if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SITE) { - return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.Site.asBit()); + return WordFlags.Site.isPresent(wordMeta); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SUBJECT) { - return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.Subjects.asBit()); + return WordFlags.Subjects.isPresent(wordMeta); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) { - return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.Title.asBit()); + return WordFlags.Title.isPresent(wordMeta); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_URL) { - return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.UrlPath.asBit()); + return WordFlags.UrlPath.isPresent(wordMeta); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_DOMAIN) { - return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.UrlDomain.asBit()); + return WordFlags.UrlDomain.isPresent(wordMeta); } else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_LINK) { - return WordMetadata.hasFlags(termScore.encodedWordMetadata(), WordFlags.ExternalLink.asBit()); + return WordFlags.ExternalLink.isPresent(wordMeta); } return true; } diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java b/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java index a84e5f4f..2fa44c31 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java @@ -6,16 +6,19 @@ import gnu.trove.list.TLongList; import gnu.trove.list.array.TLongArrayList; import 
it.unimi.dsi.fastutil.longs.LongSet; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.CqDataInt; +import nu.marginalia.api.searchquery.model.compiled.CqDataLong; import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.SearchResultItem; -import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.model.SearchParameters; import nu.marginalia.index.results.model.ids.CombinedDocIdList; import nu.marginalia.linkdb.docs.DocumentDbReader; import nu.marginalia.linkdb.model.DocdbUrlDetail; +import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.ranking.results.ResultValuator; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -126,22 +129,31 @@ public class IndexResultValuatorService { continue; } - // Reconstruct the SearchResultKeywordScore-compiledquery for re-valuation + // Reconstruct the compiledquery for re-valuation // // CAVEAT: This hinges on a very fragile that IndexResultValuationContext puts them in the same // order as the data for the CompiledQuery. 
- CompiledQuery resultQuery = - new CompiledQuery<>(compiledQuery.root, result.keywordScores.toArray(SearchResultKeywordScore[]::new)); + long[] wordMetas = new long[compiledQuery.size()]; + for (int i = 0; i < compiledQuery.size(); i++) { + var score = result.keywordScores.get(i); + wordMetas[i] = score.encodedWordMetadata(); + } - resultItems.add(createCombinedItem(result, docData, resultQuery, rankingContext)); + CompiledQueryLong metaQuery = new CompiledQueryLong(compiledQuery.root, new CqDataLong(wordMetas)); + + resultItems.add(createCombinedItem( + result, + docData, + metaQuery, + rankingContext)); } return resultItems; } private DecoratedSearchResultItem createCombinedItem(SearchResultItem result, DocdbUrlDetail docData, - CompiledQuery resultQuery, + CompiledQueryLong wordMetas, ResultRankingContext rankingContext) { return new DecoratedSearchResultItem( result, @@ -154,13 +166,19 @@ public class IndexResultValuatorService { docData.pubYear(), docData.dataHash(), docData.wordsTotal(), - bestPositions(resultQuery), - resultValuator.calculateSearchResultValue(resultQuery, docData.wordsTotal(), rankingContext) + bestPositions(wordMetas), + + resultValuator.calculateSearchResultValue(wordMetas, + result.encodedDocMetadata, + result.htmlFeatures, + docData.wordsTotal(), + rankingContext) ); } - private long bestPositions(CompiledQuery resultQuery) { - LongSet positionsSet = CompiledQueryAggregates.positionsAggregate(resultQuery, SearchResultKeywordScore::positions); + private long bestPositions(CompiledQueryLong wordMetas) { + LongSet positionsSet = CompiledQueryAggregates.positionsAggregate(wordMetas, WordMetadata::decodePositions); + int bestPc = 0; long bestPositions = 0; diff --git a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java index 862978c9..4d257349 100644 --- a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java +++ 
b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java @@ -1,9 +1,8 @@ package nu.marginalia.ranking.results; -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; -import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.idx.DocumentFlags; @@ -15,36 +14,32 @@ import com.google.inject.Singleton; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.ArrayList; -import java.util.List; - @Singleton public class ResultValuator { final static double scalingFactor = 500.; - private final Bm25Factor bm25Factor; private final TermCoherenceFactor termCoherenceFactor; private static final Logger logger = LoggerFactory.getLogger(ResultValuator.class); @Inject - public ResultValuator(Bm25Factor bm25Factor, - TermCoherenceFactor termCoherenceFactor) { - this.bm25Factor = bm25Factor; + public ResultValuator(TermCoherenceFactor termCoherenceFactor) { this.termCoherenceFactor = termCoherenceFactor; } - public double calculateSearchResultValue(CompiledQuery scores, + public double calculateSearchResultValue(CompiledQueryLong wordMeta, + long documentMetadata, + int features, int length, ResultRankingContext ctx) { - if (scores.size() == 0) + if (wordMeta.isEmpty()) return Double.MAX_VALUE; - if (length < 0) - length = 5000; - long documentMetadata = scores.at(0).encodedDocMetadata(); - int features = scores.at(0).htmlFeatures(); + if (length < 0) { + length = 5000; + } + var rankingParams = ctx.params; int rank = DocumentMetadata.decodeRank(documentMetadata); @@ -79,9 +74,10 @@ public class ResultValuator { + temporalBias + flagsPenalty; - double bestTcf = rankingParams.tcfWeight * 
termCoherenceFactor.calculate(scores); - double bestBM25F = rankingParams.bm25FullWeight * bm25Factor.calculateBm25(rankingParams.prioParams, scores, length, ctx); - double bestBM25P = rankingParams.bm25PrioWeight * bm25Factor.calculateBm25Prio(rankingParams.prioParams, scores, ctx); + double bestTcf = rankingParams.tcfWeight * termCoherenceFactor.calculate(wordMeta); + + double bestBM25F = rankingParams.bm25FullWeight * wordMeta.root.visit(new Bm25FullGraphVisitor(rankingParams.fullParams, wordMeta.data, length, ctx)); + double bestBM25P = rankingParams.bm25PrioWeight * wordMeta.root.visit(new Bm25PrioGraphVisitor(rankingParams.prioParams, wordMeta.data, ctx)); double overallPartPositive = Math.max(0, overallPart); double overallPartNegative = -Math.min(0, overallPart); diff --git a/code/index/java/nu/marginalia/ranking/results/factors/Bm25Factor.java b/code/index/java/nu/marginalia/ranking/results/factors/Bm25Factor.java deleted file mode 100644 index bc13671e..00000000 --- a/code/index/java/nu/marginalia/ranking/results/factors/Bm25Factor.java +++ /dev/null @@ -1,113 +0,0 @@ -package nu.marginalia.ranking.results.factors; - -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; -import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; -import nu.marginalia.api.searchquery.model.results.Bm25Parameters; -import nu.marginalia.api.searchquery.model.results.ResultRankingContext; -import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; -import nu.marginalia.model.idx.WordFlags; - -public class Bm25Factor { - private static final int AVG_LENGTH = 5000; - - /** This is an estimation of BM-25. 
- * - * @see Bm25Parameters - */ - public double calculateBm25(Bm25Parameters bm25Parameters, CompiledQuery scores, int length, ResultRankingContext ctx) { - final int docCount = ctx.termFreqDocCount(); - - return CompiledQueryAggregates.doubleSumAggregate(scores, keyword -> { - double count = keyword.positionCount(); - - int freq = ctx.frequency(keyword.keyword); - - return invFreq(docCount, freq) * f(bm25Parameters.k(), bm25Parameters.b(), count, length); - }); - } - - /** Bm25 calculation, except instead of counting positions in the document, - * the number of relevance signals for the term is counted instead. - */ - public double calculateBm25Prio(Bm25Parameters bm25Parameters, CompiledQuery scores, ResultRankingContext ctx) { - final int docCount = ctx.termFreqDocCount(); - - return CompiledQueryAggregates.doubleSumAggregate(scores, keyword -> { - double count = evaluatePriorityScore(keyword); - - int freq = ctx.priorityFrequency(keyword.keyword); - - // note we override b to zero for priority terms as they are independent of document length - return invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0); - }); - - } - - private static double evaluatePriorityScore(SearchResultKeywordScore keyword) { - int pcount = keyword.positionCount(); - - double qcount = 0.; - - if ((keyword.encodedWordMetadata() & WordFlags.ExternalLink.asBit()) != 0) { - - qcount += 2.5; - - if ((keyword.encodedWordMetadata() & WordFlags.UrlDomain.asBit()) != 0) - qcount += 2.5; - else if ((keyword.encodedWordMetadata() & WordFlags.UrlPath.asBit()) != 0) - qcount += 1.5; - - if ((keyword.encodedWordMetadata() & WordFlags.Site.asBit()) != 0) - qcount += 1.25; - if ((keyword.encodedWordMetadata() & WordFlags.SiteAdjacent.asBit()) != 0) - qcount += 1.25; - } - else { - if ((keyword.encodedWordMetadata() & WordFlags.UrlDomain.asBit()) != 0) - qcount += 3; - else if ((keyword.encodedWordMetadata() & WordFlags.UrlPath.asBit()) != 0) - qcount += 1; - - if ((keyword.encodedWordMetadata() & 
WordFlags.Site.asBit()) != 0) - qcount += 0.5; - if ((keyword.encodedWordMetadata() & WordFlags.SiteAdjacent.asBit()) != 0) - qcount += 0.5; - } - - if ((keyword.encodedWordMetadata() & WordFlags.Title.asBit()) != 0) - qcount += 1.5; - - if (pcount > 2) { - if ((keyword.encodedWordMetadata() & WordFlags.Subjects.asBit()) != 0) - qcount += 1.25; - if ((keyword.encodedWordMetadata() & WordFlags.NamesWords.asBit()) != 0) - qcount += 0.25; - if ((keyword.encodedWordMetadata() & WordFlags.TfIdfHigh.asBit()) != 0) - qcount += 0.5; - } - - return qcount; - } - - /** - * - * @param docCount Number of documents - * @param freq Number of matching documents - */ - private double invFreq(int docCount, int freq) { - return Math.log(1.0 + (docCount - freq + 0.5) / (freq + 0.5)); - } - - /** - * - * @param k determines the size of the impact of a single term - * @param b determines the magnitude of the length normalization - * @param count number of occurrences in the document - * @param length document length - */ - private double f(double k, double b, double count, int length) { - final double lengthRatio = (double) length / AVG_LENGTH; - - return (count * (k + 1)) / (count + k * (1 - b + b * lengthRatio)); - } -} diff --git a/code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java b/code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java new file mode 100644 index 00000000..9c46261d --- /dev/null +++ b/code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java @@ -0,0 +1,81 @@ +package nu.marginalia.ranking.results.factors; + +import nu.marginalia.api.searchquery.model.compiled.CqDataInt; +import nu.marginalia.api.searchquery.model.compiled.CqDataLong; +import nu.marginalia.api.searchquery.model.compiled.CqExpression; +import nu.marginalia.api.searchquery.model.results.Bm25Parameters; +import nu.marginalia.api.searchquery.model.results.ResultRankingContext; +import nu.marginalia.model.idx.WordMetadata; + 
+import java.util.List; + +public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor { + private static final long AVG_LENGTH = 5000; + + private final CqDataLong wordMetaData; + private final CqDataInt frequencies; + private final Bm25Parameters bm25Parameters; + + private final int docCount; + private final int length; + + public Bm25FullGraphVisitor(Bm25Parameters bm25Parameters, + CqDataLong wordMetaData, + int length, + ResultRankingContext ctx) { + this.length = length; + this.bm25Parameters = bm25Parameters; + this.docCount = ctx.termFreqDocCount(); + this.wordMetaData = wordMetaData; + this.frequencies = ctx.fullCounts; + } + + @Override + public double onAnd(List parts) { + double value = 0; + for (var part : parts) { + value += part.visit(this); + } + return value; + } + + @Override + public double onOr(List parts) { + double value = 0; + for (var part : parts) { + value = Math.max(value, part.visit(this)); + } + return value; + } + + @Override + public double onLeaf(int idx) { + double count = Long.bitCount(WordMetadata.decodePositions(wordMetaData.get(idx))); + + int freq = frequencies.get(idx); + + return invFreq(docCount, freq) * f(bm25Parameters.k(), bm25Parameters.b(), count, length); + } + + /** + * + * @param docCount Number of documents + * @param freq Number of matching documents + */ + private double invFreq(int docCount, int freq) { + return Math.log(1.0 + (docCount - freq + 0.5) / (freq + 0.5)); + } + + /** + * + * @param k determines the size of the impact of a single term + * @param b determines the magnitude of the length normalization + * @param count number of occurrences in the document + * @param length document length + */ + private double f(double k, double b, double count, int length) { + final double lengthRatio = (double) length / AVG_LENGTH; + + return (count * (k + 1)) / (count + k * (1 - b + b * lengthRatio)); + } +} diff --git a/code/index/java/nu/marginalia/ranking/results/factors/Bm25PrioGraphVisitor.java 
b/code/index/java/nu/marginalia/ranking/results/factors/Bm25PrioGraphVisitor.java new file mode 100644 index 00000000..1fb26f6b --- /dev/null +++ b/code/index/java/nu/marginalia/ranking/results/factors/Bm25PrioGraphVisitor.java @@ -0,0 +1,127 @@ +package nu.marginalia.ranking.results.factors; + +import nu.marginalia.api.searchquery.model.compiled.CqDataInt; +import nu.marginalia.api.searchquery.model.compiled.CqDataLong; +import nu.marginalia.api.searchquery.model.compiled.CqExpression; +import nu.marginalia.api.searchquery.model.results.Bm25Parameters; +import nu.marginalia.api.searchquery.model.results.ResultRankingContext; +import nu.marginalia.model.idx.WordFlags; +import nu.marginalia.model.idx.WordMetadata; + +import java.util.List; + +public class Bm25PrioGraphVisitor implements CqExpression.DoubleVisitor { + private static final long AVG_LENGTH = 5000; + + private final CqDataLong wordMetaData; + private final CqDataInt frequencies; + private final Bm25Parameters bm25Parameters; + + private final int docCount; + + public Bm25PrioGraphVisitor(Bm25Parameters bm25Parameters, + CqDataLong wordMetaData, + ResultRankingContext ctx) { + this.bm25Parameters = bm25Parameters; + this.docCount = ctx.termFreqDocCount(); + this.wordMetaData = wordMetaData; + this.frequencies = ctx.fullCounts; + } + + @Override + public double onAnd(List parts) { + double value = 0; + for (var part : parts) { + value += part.visit(this); + } + return value; + } + + @Override + public double onOr(List parts) { + double value = 0; + for (var part : parts) { + value = Math.max(value, part.visit(this)); + } + return value; + } + + @Override + public double onLeaf(int idx) { + double count = evaluatePriorityScore(wordMetaData.get(idx)); + + int freq = frequencies.get(idx); + + // note we override b to zero for priority terms as they are independent of document length + return invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0); + } + + private static double evaluatePriorityScore(long 
wordMeta) { + int pcount = Long.bitCount(WordMetadata.decodePositions(wordMeta)); + + double qcount = 0.; + + if ((wordMeta & WordFlags.ExternalLink.asBit()) != 0) { + + qcount += 2.5; + + if ((wordMeta & WordFlags.UrlDomain.asBit()) != 0) + qcount += 2.5; + else if ((wordMeta & WordFlags.UrlPath.asBit()) != 0) + qcount += 1.5; + + if ((wordMeta & WordFlags.Site.asBit()) != 0) + qcount += 1.25; + if ((wordMeta & WordFlags.SiteAdjacent.asBit()) != 0) + qcount += 1.25; + } + else { + if ((wordMeta & WordFlags.UrlDomain.asBit()) != 0) + qcount += 3; + else if ((wordMeta & WordFlags.UrlPath.asBit()) != 0) + qcount += 1; + + if ((wordMeta & WordFlags.Site.asBit()) != 0) + qcount += 0.5; + if ((wordMeta & WordFlags.SiteAdjacent.asBit()) != 0) + qcount += 0.5; + } + + if ((wordMeta & WordFlags.Title.asBit()) != 0) + qcount += 1.5; + + if (pcount > 2) { + if ((wordMeta & WordFlags.Subjects.asBit()) != 0) + qcount += 1.25; + if ((wordMeta & WordFlags.NamesWords.asBit()) != 0) + qcount += 0.25; + if ((wordMeta & WordFlags.TfIdfHigh.asBit()) != 0) + qcount += 0.5; + } + + return qcount; + } + + + /** + * + * @param docCount Number of documents + * @param freq Number of matching documents + */ + private double invFreq(int docCount, int freq) { + return Math.log(1.0 + (docCount - freq + 0.5) / (freq + 0.5)); + } + + /** + * + * @param k determines the size of the impact of a single term + * @param b determines the magnitude of the length normalization + * @param count number of occurrences in the document + * @param length document length + */ + private double f(double k, double b, double count, int length) { + final double lengthRatio = (double) length / AVG_LENGTH; + + return (count * (k + 1)) / (count + k * (1 - b + b * lengthRatio)); + } +} diff --git a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java b/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java index 71159c58..e617549d 100644 --- 
a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java +++ b/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java @@ -1,16 +1,16 @@ package nu.marginalia.ranking.results.factors; -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; -import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; import nu.marginalia.model.idx.WordMetadata; /** Rewards documents where terms appear frequently within the same sentences */ public class TermCoherenceFactor { - public double calculate(CompiledQuery scores) { - long mask = CompiledQueryAggregates.longBitmaskAggregate(scores, score -> score.positions() & WordMetadata.POSITIONS_MASK); + public double calculate(CompiledQueryLong wordMetadataQuery) { + long mask = CompiledQueryAggregates.longBitmaskAggregate(wordMetadataQuery, + score -> score >>> WordMetadata.POSITIONS_SHIFT); return bitsSetFactor(mask); } diff --git a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java index 301b5e19..7b0a6a24 100644 --- a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java +++ b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java @@ -215,9 +215,7 @@ public class IndexQueryServiceIntegrationSmokeTest { Set years = new HashSet<>(); for (var res : rsp.results) { - for (var score : res.rawIndexResult.getKeywordScores()) { - years.add(DocumentMetadata.decodeYear(score.encodedDocMetadata())); - } + years.add(DocumentMetadata.decodeYear(res.rawIndexResult.encodedDocMetadata)); } assertEquals(Set.of(1998), years); diff --git a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java 
b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java index 948c5857..c605a0a8 100644 --- a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java +++ b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java @@ -25,7 +25,7 @@ class IndexResultDomainDeduplicatorTest { } SearchResultItem forId(int domain, int ordinal) { - return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), List.of(), 4, Double.NaN); + return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(), 4, Double.NaN); } } \ No newline at end of file diff --git a/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java b/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java index 243ae90d..a1b66b04 100644 --- a/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java +++ b/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java @@ -1,6 +1,8 @@ package nu.marginalia.ranking.results; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.CqDataInt; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; @@ -31,30 +33,27 @@ class ResultValuatorTest { when(dict.docCount()).thenReturn(100_000); valuator = new ResultValuator( - new Bm25Factor(), new TermCoherenceFactor() ); } - CompiledQuery titleOnlyLowCountSet = CompiledQuery.just( - new SearchResultKeywordScore("bob", 1, - wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title)), - docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)), - 0) - ); - CompiledQuery highCountNoTitleSet = CompiledQuery.just( - new SearchResultKeywordScore("bob", 1, - 
wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh)), - docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)), - 0) - ); - CompiledQuery highCountSubjectSet = CompiledQuery.just( + CqDataInt frequencyData = new CqDataInt(new int[] { 10 }); + + CompiledQueryLong titleOnlyLowCountSet = CompiledQuery.just( new SearchResultKeywordScore("bob", 1, - wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects)), - docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)), - 0) - ); + wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title))) + ).mapToLong(SearchResultKeywordScore::encodedWordMetadata); + + CompiledQueryLong highCountNoTitleSet = CompiledQuery.just( + new SearchResultKeywordScore("bob", 1, + wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh))) + ).mapToLong(SearchResultKeywordScore::encodedWordMetadata);; + + CompiledQueryLong highCountSubjectSet = CompiledQuery.just( + new SearchResultKeywordScore("bob", 1, + wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects))) + ).mapToLong(SearchResultKeywordScore::encodedWordMetadata);; @Test @@ -63,12 +62,16 @@ class ResultValuatorTest { when(dict.getTermFreq("bob")).thenReturn(10); ResultRankingContext context = new ResultRankingContext(100000, ResultRankingParameters.sensibleDefaults(), - Map.of("bob", 10), Collections.emptyMap()); + frequencyData, + frequencyData); - double titleOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, 10_000, context); - double titleLongOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, 10_000, context); - double highCountNoTitle = valuator.calculateSearchResultValue(highCountNoTitleSet, 10_000, context); - double highCountSubject = valuator.calculateSearchResultValue(highCountSubjectSet, 10_000, context); + long docMeta = docMetadata(0, 2010, 5, 
EnumSet.noneOf(DocumentFlags.class)); + int features = 0; + + double titleOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, docMeta, features, 10_000, context); + double titleLongOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, docMeta, features, 10_000, context); + double highCountNoTitle = valuator.calculateSearchResultValue(highCountNoTitleSet, docMeta, features, 10_000, context); + double highCountSubject = valuator.calculateSearchResultValue(highCountSubjectSet, docMeta, features, 10_000, context); System.out.println(titleOnlyLowCount); System.out.println(titleLongOnlyLowCount); diff --git a/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java b/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java index 028896d9..d0abe443 100644 --- a/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java +++ b/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java @@ -18,14 +18,23 @@ class TermCoherenceFactorTest { @Test public void testAllBitsSet() { var allPositionsSet = createSet( - WordMetadata.POSITIONS_MASK, WordMetadata.POSITIONS_MASK + ~0L, + ~0L ); - long mask = CompiledQueryAggregates.longBitmaskAggregate(allPositionsSet, score -> score.positions() & WordMetadata.POSITIONS_MASK); + long mask = CompiledQueryAggregates.longBitmaskAggregate( + allPositionsSet, + SearchResultKeywordScore::positions + ); assertEquals(1.0, termCoherenceFactor.bitsSetFactor(mask), 0.01); - assertEquals(1.0, termCoherenceFactor.calculate(allPositionsSet)); + assertEquals(1.0, + termCoherenceFactor.calculate( + allPositionsSet.mapToLong(SearchResultKeywordScore::encodedWordMetadata) + ) + ); + } @Test @@ -38,7 +47,7 @@ class TermCoherenceFactorTest { assertEquals(0, termCoherenceFactor.bitsSetFactor(mask), 0.01); - assertEquals(0, termCoherenceFactor.calculate(allPositionsSet)); + assertEquals(0, 
termCoherenceFactor.calculate(allPositionsSet.mapToLong(SearchResultKeywordScore::encodedWordMetadata))); } @Test @SuppressWarnings("unchecked") @@ -90,7 +99,7 @@ class TermCoherenceFactorTest { for (int i = 0; i < positionMasks.length; i++) { keywords.add(new SearchResultKeywordScore("", 0, - new WordMetadata(positionMasks[i], (byte) 0).encode(), 0, 0)); + new WordMetadata(positionMasks[i] & WordMetadata.POSITIONS_MASK, (byte) 0).encode())); } return CompiledQuery.just(keywords.toArray(SearchResultKeywordScore[]::new)); diff --git a/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java index f5068d07..df24ec10 100644 --- a/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java +++ b/code/libraries/term-frequency-dict/test/nu/marginalia/segmentation/NgramLexiconTest.java @@ -15,7 +15,7 @@ class NgramLexiconTest { } void addNgram(String... 
ngram) { - lexicon.incOrdered(HasherGroup.ordered().rollingHash(ngram)); + lexicon.incOrderedTitle(HasherGroup.ordered().rollingHash(ngram)); } @Test diff --git a/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java b/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java index a67582bd..faba9eb7 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java +++ b/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java @@ -38,7 +38,7 @@ public class ClusteredUrlDetails implements Comparable { for (var keywordScore : urlDetail.resultItem.keywordScores) { if (keywordScore.isKeywordSpecial()) continue; - if (keywordScore.positionCount() == 0) + if (keywordScore.positions() == 0) continue; if (keywordScore.hasTermFlag(WordFlags.Title)) From 155be1078df9d6b63bdc344aa523034878960f1f Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 15 Apr 2024 16:44:08 +0200 Subject: [PATCH 42/90] (index) Fix priority search terms This functionality fell into disrepair some while ago. It's supposed to allow non-mandatory search terms that boost the ranking if they are present in the document. 
--- .../api/searchquery/QueryProtobufCodec.java | 1 + .../model/results/SearchResultItem.java | 10 +++++++- .../api/src/main/protobuf/query-api.proto | 2 +- .../nu/marginalia/index/IndexGrpcService.java | 1 + .../index/results/IndexMetadataService.java | 23 +++++++++++++++++++ .../results/IndexResultValuationContext.java | 21 +++++++++++++++-- .../index/results/model/QuerySearchTerms.java | 3 +++ .../TermMetadataForCombinedDocumentIds.java | 11 ++++++++- .../IndexResultDomainDeduplicatorTest.java | 2 +- 9 files changed, 68 insertions(+), 6 deletions(-) diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java index 5a43df1b..2907992d 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java @@ -138,6 +138,7 @@ public class QueryProtobufCodec { rawItem.getHtmlFeatures(), keywordScores, rawItem.getResultsFromDomain(), + rawItem.getHasPriorityTerms(), Double.NaN // Not set ); } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java index 7cd95b96..ad8b8cb1 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultItem.java @@ -28,11 +28,17 @@ public class SearchResultItem implements Comparable { /** How many other potential results existed in the same domain */ public int resultsFromDomain; - public SearchResultItem(long combinedId, long encodedDocMetadata, int htmlFeatures) { + public boolean hasPrioTerm; + + public SearchResultItem(long combinedId, + long 
encodedDocMetadata, + int htmlFeatures, + boolean hasPrioTerm) { this.combinedId = combinedId; this.encodedDocMetadata = encodedDocMetadata; this.keywordScores = new ArrayList<>(); this.htmlFeatures = htmlFeatures; + this.hasPrioTerm = hasPrioTerm; } @@ -85,4 +91,6 @@ public class SearchResultItem implements Comparable { return Long.compare(this.combinedId, o.combinedId); } + + } diff --git a/code/functions/search-query/api/src/main/protobuf/query-api.proto b/code/functions/search-query/api/src/main/protobuf/query-api.proto index 3094699b..bae06e66 100644 --- a/code/functions/search-query/api/src/main/protobuf/query-api.proto +++ b/code/functions/search-query/api/src/main/protobuf/query-api.proto @@ -101,13 +101,13 @@ message RpcRawResultItem { int64 encodedDocMetadata = 3; // bit encoded document metadata int32 htmlFeatures = 4; // bitmask encoding features of the document repeated RpcResultKeywordScore keywordScores = 5; + bool hasPriorityTerms = 6; // true if this word is important to the document } /* Information about how well a keyword matches a query */ message RpcResultKeywordScore { string keyword = 1; // the keyword int64 encodedWordMetadata = 2; // bit encoded word metadata - bool hasPriorityTerms = 3; // true if this word is important to the document } /* Query execution parameters */ diff --git a/code/index/java/nu/marginalia/index/IndexGrpcService.java b/code/index/java/nu/marginalia/index/IndexGrpcService.java index fa0a8343..4810d625 100644 --- a/code/index/java/nu/marginalia/index/IndexGrpcService.java +++ b/code/index/java/nu/marginalia/index/IndexGrpcService.java @@ -138,6 +138,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { rawItem.setResultsFromDomain(rawResult.resultsFromDomain); rawItem.setHtmlFeatures(rawResult.htmlFeatures); rawItem.setEncodedDocMetadata(rawResult.encodedDocMetadata); + rawItem.setHasPriorityTerms(rawResult.hasPrioTerm); for (var score : rawResult.keywordScores) { rawItem.addKeywordScores( diff 
--git a/code/index/java/nu/marginalia/index/results/IndexMetadataService.java b/code/index/java/nu/marginalia/index/results/IndexMetadataService.java index 977a87e7..ce23c3f2 100644 --- a/code/index/java/nu/marginalia/index/results/IndexMetadataService.java +++ b/code/index/java/nu/marginalia/index/results/IndexMetadataService.java @@ -43,6 +43,7 @@ public class IndexMetadataService { public QuerySearchTerms getSearchTerms(CompiledQuery compiledQuery, SearchQuery searchQuery) { LongArrayList termIdsList = new LongArrayList(); + LongArrayList termIdsPrio = new LongArrayList(); TObjectLongHashMap termToId = new TObjectLongHashMap<>(10, 0.75f, -1); @@ -52,8 +53,30 @@ public class IndexMetadataService { termToId.put(word, id); } + for (var term : searchQuery.searchTermsAdvice) { + if (termToId.containsKey(term)) { + continue; + } + + long id = SearchTermsUtil.getWordId(term); + termIdsList.add(id); + termToId.put(term, id); + } + + for (var term : searchQuery.searchTermsPriority) { + if (termToId.containsKey(term)) { + continue; + } + + long id = SearchTermsUtil.getWordId(term); + termIdsList.add(id); + termIdsPrio.add(id); + termToId.put(term, id); + } + return new QuerySearchTerms(termToId, new TermIdList(termIdsList), + new TermIdList(termIdsPrio), new TermCoherenceGroupList( searchQuery.searchTermCoherences.stream().map(TermCoherenceGroup::new).toList() ) diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java index 89b4c543..a9d6b4a6 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java @@ -52,7 +52,8 @@ public class IndexResultValuationContext { this.searchTerms = metadataService.getSearchTerms(params.compiledQuery, params.query); - this.termMetadataForCombinedDocumentIds = metadataService.getTermMetadataForDocuments(ids, 
searchTerms.termIdsAll); + this.termMetadataForCombinedDocumentIds = metadataService.getTermMetadataForDocuments(ids, + searchTerms.termIdsAll); } private final long flagsFilterMask = @@ -69,7 +70,10 @@ public class IndexResultValuationContext { long docMetadata = statefulIndex.getDocumentMetadata(docId); int htmlFeatures = statefulIndex.getHtmlFeatures(docId); - SearchResultItem searchResult = new SearchResultItem(docId, docMetadata, htmlFeatures); + SearchResultItem searchResult = new SearchResultItem(docId, + docMetadata, + htmlFeatures, + hasPrioTerm(combinedId)); long[] wordMetas = new long[compiledQuery.size()]; SearchResultKeywordScore[] scores = new SearchResultKeywordScore[compiledQuery.size()]; @@ -108,11 +112,24 @@ public class IndexResultValuationContext { 5000, // use a dummy value here as it's not present in the index rankingContext); + if (searchResult.hasPrioTerm) { + score = 0.75 * score; + } + searchResult.setScore(score); return searchResult; } + private boolean hasPrioTerm(long combinedId) { + for (var term : searchTerms.termIdsPrio.array()) { + if (termMetadataForCombinedDocumentIds.hasTermMeta(term, combinedId)) { + return true; + } + } + return false; + } + private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores, QueryStrategy queryStrategy) { diff --git a/code/index/java/nu/marginalia/index/results/model/QuerySearchTerms.java b/code/index/java/nu/marginalia/index/results/model/QuerySearchTerms.java index d72e0ea9..bbb7cf30 100644 --- a/code/index/java/nu/marginalia/index/results/model/QuerySearchTerms.java +++ b/code/index/java/nu/marginalia/index/results/model/QuerySearchTerms.java @@ -6,14 +6,17 @@ import nu.marginalia.index.results.model.ids.TermIdList; public class QuerySearchTerms { private final TObjectLongHashMap termToId; public final TermIdList termIdsAll; + public final TermIdList termIdsPrio; public final TermCoherenceGroupList coherences; public QuerySearchTerms(TObjectLongHashMap termToId, TermIdList 
termIdsAll, + TermIdList termIdsPrio, TermCoherenceGroupList coherences) { this.termToId = termToId; this.termIdsAll = termIdsAll; + this.termIdsPrio = termIdsPrio; this.coherences = coherences; } diff --git a/code/index/java/nu/marginalia/index/results/model/TermMetadataForCombinedDocumentIds.java b/code/index/java/nu/marginalia/index/results/model/TermMetadataForCombinedDocumentIds.java index 9068dd69..3ef2f7ab 100644 --- a/code/index/java/nu/marginalia/index/results/model/TermMetadataForCombinedDocumentIds.java +++ b/code/index/java/nu/marginalia/index/results/model/TermMetadataForCombinedDocumentIds.java @@ -18,12 +18,21 @@ public class TermMetadataForCombinedDocumentIds { public long getTermMetadata(long termId, long combinedId) { var metaByCombinedId = termdocToMeta.get(termId); if (metaByCombinedId == null) { - logger.warn("Missing meta for term {}", termId); return 0; } return metaByCombinedId.get(combinedId); } + public boolean hasTermMeta(long termId, long combinedId) { + var metaByCombinedId = termdocToMeta.get(termId); + + if (metaByCombinedId == null) { + return false; + } + + return metaByCombinedId.get(combinedId) != 0; + } + public record DocumentsWithMetadata(Long2LongOpenHashMap data) { public DocumentsWithMetadata(CombinedDocIdList combinedDocIdsAll, DocMetadataList metadata) { this(new Long2LongOpenHashMap(combinedDocIdsAll.array(), metadata.array())); diff --git a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java index c605a0a8..21f6312e 100644 --- a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java +++ b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java @@ -25,7 +25,7 @@ class IndexResultDomainDeduplicatorTest { } SearchResultItem forId(int domain, int ordinal) { - return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(), 4, Double.NaN); 
+ return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(), 4, Double.NaN, false); } } \ No newline at end of file From fce26015c9c783bc8c154d7e85cc0c27edd927fa Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 16 Apr 2024 12:10:13 +0200 Subject: [PATCH 43/90] (encyclopedia) Index the full articles Previously, in an experimental change, only the first paragraph was indexed, intended to reduce the amount of noisy tangential hits. This was not a good idea, so the change is reverted. --- .../encyclopedia/EncyclopediaMarginaliaNuSideloader.java | 1 - 1 file changed, 1 deletion(-) diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java index ca85455e..17c83250 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java @@ -125,7 +125,6 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC fullHtml.append("

    "); fullHtml.append(part); fullHtml.append("

    "); - break; // Only take the first part, this improves accuracy a lot } fullHtml.append(""); From 08416393e0fa958bc9ef146bbc32abb2ef5425af Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 16 Apr 2024 17:15:21 +0200 Subject: [PATCH 44/90] (valuation) Impose stronger constraints on locality of terms --- .../results/IndexResultValuationContext.java | 47 +++++++++++++++++-- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java index a9d6b4a6..840b6253 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java @@ -97,12 +97,13 @@ public class IndexResultValuationContext { boolean allSynthetic = !CompiledQueryAggregates.booleanAggregate(wordMetasQuery, WordFlags.Synthetic::isAbsent); int flagsCount = CompiledQueryAggregates.intMaxMinAggregate(wordMetasQuery, wordMeta -> Long.bitCount(wordMeta & flagsFilterMask)); int positionsCount = CompiledQueryAggregates.intMaxMinAggregate(wordMetasQuery, wordMeta -> Long.bitCount(WordMetadata.decodePositions(wordMeta))); + boolean noneOverlap = wordMetasQuery.root.visit(new PositionOverlapOperator(wordMetasQuery.data)) != 0; if (!meetsQueryStrategyRequirements(wordMetasQuery, queryParams.queryStrategy())) { return null; } - if (flagsCount == 0 && !allSynthetic && positionsCount == 0) + if (flagsCount == 0 && !allSynthetic && (positionsCount == 0 || noneOverlap)) return null; double score = searchResultValuator.calculateSearchResultValue( @@ -165,6 +166,46 @@ public class IndexResultValuationContext { return true; } - - +} + +class PositionOverlapOperator implements CqExpression.LongVisitor { + private final CqDataLong wordMetaData; + + PositionOverlapOperator(CqDataLong wordMetaData) { + this.wordMetaData = wordMetaData; + } + + @Override + public 
long onAnd(List parts) { + long positions = ~0; + long flags = 0; + + for (var part : parts) { + long pv = part.visit(this); + if ((pv & WordMetadata.FLAGS_MASK) != 0) { + flags |= (pv & WordMetadata.FLAGS_MASK); + } + else { + positions &= pv; + } + } + + return positions | flags; + } + + @Override + public long onOr(List parts) { + long ret = 0; + + for (var part : parts) { + ret |= part.visit(this); + } + + return ret; + } + + @Override + public long onLeaf(int idx) { + return wordMetaData.get(idx); + } } From 1748fcc5ac5b7f47bc24d8b779b3451f53491a87 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 16 Apr 2024 17:22:58 +0200 Subject: [PATCH 45/90] (valuation) Impose stronger constraints on locality of terms Clean up logic a bit --- .../index/results/IndexResultValuationContext.java | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java index 840b6253..5383cbb9 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java @@ -182,15 +182,12 @@ class PositionOverlapOperator implements CqExpression.LongVisitor { for (var part : parts) { long pv = part.visit(this); - if ((pv & WordMetadata.FLAGS_MASK) != 0) { - flags |= (pv & WordMetadata.FLAGS_MASK); - } - else { - positions &= pv; - } + + flags |= pv; + positions &= pv; } - return positions | flags; + return positions | (flags & WordMetadata.FLAGS_MASK); } @Override From adf846bfd2789c4d2966a95f1ceb8905d7171491 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 16 Apr 2024 18:07:43 +0200 Subject: [PATCH 46/90] (index) Fix term coherence evaluation The code was incorrectly using the documentId instead of the combined id, resulting in almost all result sets being incorrectly seen as zero. 
--- .../index/results/IndexResultValuationContext.java | 2 +- .../index/results/model/TermCoherenceGroupList.java | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java index 5383cbb9..3a7f157b 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java @@ -64,7 +64,7 @@ public class IndexResultValuationContext { long docId = UrlIdCodec.removeRank(combinedId); - if (!searchTerms.coherences.test(termMetadataForCombinedDocumentIds, docId)) + if (!searchTerms.coherences.test(termMetadataForCombinedDocumentIds, combinedId)) return null; long docMetadata = statefulIndex.getDocumentMetadata(docId); diff --git a/code/index/java/nu/marginalia/index/results/model/TermCoherenceGroupList.java b/code/index/java/nu/marginalia/index/results/model/TermCoherenceGroupList.java index 2b6c24f5..4b119c60 100644 --- a/code/index/java/nu/marginalia/index/results/model/TermCoherenceGroupList.java +++ b/code/index/java/nu/marginalia/index/results/model/TermCoherenceGroupList.java @@ -15,9 +15,9 @@ public record TermCoherenceGroupList(List words) { this.words = Collections.unmodifiableList(words); } - public boolean test(TermMetadataForCombinedDocumentIds documents, long docId) { + public boolean test(TermMetadataForCombinedDocumentIds documents, long combinedId) { for (var coherenceSet : words()) { - if (!coherenceSet.test(documents, docId)) { + if (!coherenceSet.test(documents, combinedId)) { return false; } } @@ -36,11 +36,11 @@ public record TermCoherenceGroupList(List words) { this(coh.stream().mapToLong(SearchTermsUtil::getWordId).toArray()); } - public boolean test(TermMetadataForCombinedDocumentIds documents, long docId) { + public boolean test(TermMetadataForCombinedDocumentIds documents, long 
combinedId) { long overlap = 0xFF_FFFF_FFFF_FFFFL; for (var word : words) { - overlap &= documents.getTermMetadata(word, docId); + overlap &= documents.getTermMetadata(word, combinedId); } return WordMetadata.decodePositions(overlap) != 0L; From df75e8f4aad93110a79cdbbaf27359f81b6451d3 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 16 Apr 2024 19:23:00 +0200 Subject: [PATCH 47/90] (index) Explicitly free LongQueryBuffers --- code/index/java/nu/marginalia/index/IndexGrpcService.java | 3 ++- .../java/nu/marginalia/array/buffer/LongQueryBuffer.java | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/code/index/java/nu/marginalia/index/IndexGrpcService.java b/code/index/java/nu/marginalia/index/IndexGrpcService.java index 4810d625..3eb2f5d7 100644 --- a/code/index/java/nu/marginalia/index/IndexGrpcService.java +++ b/code/index/java/nu/marginalia/index/IndexGrpcService.java @@ -9,7 +9,6 @@ import io.prometheus.client.Histogram; import it.unimi.dsi.fastutil.longs.LongArrayList; import lombok.SneakyThrows; import nu.marginalia.api.searchquery.*; -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CqDataInt; import nu.marginalia.api.searchquery.model.query.SearchSpecification; @@ -335,6 +334,8 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { } } + buffer.dispose(); + if (!results.isEmpty()) { enqueueResults(new CombinedDocIdList(results)); } diff --git a/code/libraries/array/java/nu/marginalia/array/buffer/LongQueryBuffer.java b/code/libraries/array/java/nu/marginalia/array/buffer/LongQueryBuffer.java index a0312d36..eaabb4a5 100644 --- a/code/libraries/array/java/nu/marginalia/array/buffer/LongQueryBuffer.java +++ b/code/libraries/array/java/nu/marginalia/array/buffer/LongQueryBuffer.java @@ -51,6 +51,11 @@ public class LongQueryBuffer { return copy; } + /** Dispose of the buffer and 
release resources */ + public void dispose() { + data.close(); + } + public boolean isEmpty() { return end == 0; } From 1bb88968c5bb7bafd2195820bf32512466075ccf Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 16 Apr 2024 19:44:51 +0200 Subject: [PATCH 48/90] (test) Fix broken test --- .../index/results/IndexResultDomainDeduplicatorTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java index 21f6312e..f4740e31 100644 --- a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java +++ b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java @@ -25,7 +25,7 @@ class IndexResultDomainDeduplicatorTest { } SearchResultItem forId(int domain, int ordinal) { - return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(), 4, Double.NaN, false); + return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(), 4, false, Double.NaN); } } \ No newline at end of file From c620e9c0260eabb772fa32804f9de00c17159d43 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 16 Apr 2024 19:43:14 +0200 Subject: [PATCH 49/90] (index) Experimental performance regression fix --- .../marginalia/index/index/StatefulIndex.java | 39 +++++++++---------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/code/index/java/nu/marginalia/index/index/StatefulIndex.java b/code/index/java/nu/marginalia/index/index/StatefulIndex.java index ae7b1353..5c54a15b 100644 --- a/code/index/java/nu/marginalia/index/index/StatefulIndex.java +++ b/code/index/java/nu/marginalia/index/index/StatefulIndex.java @@ -2,8 +2,7 @@ package nu.marginalia.index.index; import com.google.inject.Inject; import com.google.inject.Singleton; -import it.unimi.dsi.fastutil.longs.LongOpenHashSet; -import 
it.unimi.dsi.fastutil.longs.LongSet; +import it.unimi.dsi.fastutil.longs.*; import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; import nu.marginalia.index.query.filter.QueryFilterAllOf; import nu.marginalia.index.query.filter.QueryFilterAnyOf; @@ -125,28 +124,26 @@ public class StatefulIndex { // the term is missing from the index and can never be found paths.removeIf(containsAll(termPriority).negate()); - List walkers = QueryBranchWalker.create(termPriority, paths); + for (var path : paths) { + LongList elements = new LongArrayList(path); - for (var walker : walkers) { - for (var builder : List.of( - combinedIndexReader.findPriorityWord(walker.termId), - combinedIndexReader.findFullWord(walker.termId) - )) - { - queryHeads.add(builder); - - if (walker.atEnd()) - continue; // Single term search query - - // Add filter steps for the remaining combinations of terms - List filterSteps = new ArrayList<>(); - for (var step : walker.next()) { - filterSteps.add(createFilter(step, 0)); + elements.sort((a, b) -> { + for (int i = 0; i < termPriority.length; i++) { + if (termPriority[i] == a) + return -1; + if (termPriority[i] == b) + return 1; } - builder.addInclusionFilterAny(filterSteps); - } - } + return 0; + }); + var head = combinedIndexReader.findFullWord(elements.getLong(0)); + for (int i = 1; i < elements.size(); i++) { + head.addInclusionFilter(combinedIndexReader.hasWordFull(elements.getLong(i))); + } + + queryHeads.add(head); + } // Add additional conditions to the query heads for (var query : queryHeads) { From 44c1e1d6d95ed63a299c92d4c408174efc2b5c99 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 16 Apr 2024 19:59:27 +0200 Subject: [PATCH 50/90] (index) Remove dead code Since the performance fix in 3359f7223951fba16bf997093aa0a61d8ab2a3e1 had a huge positive impact without reducing result quality, it's possible to remove the QueryBranchWalker and associated code. 
--- .../index/index/CombinedIndexReader.java | 13 +-- .../index/index/IndexQueryBuilderImpl.java | 3 - .../index/index/QueryBranchWalker.java | 108 ------------------ .../marginalia/index/index/StatefulIndex.java | 45 -------- .../index/index/QueryBranchWalkerTest.java | 59 ---------- 5 files changed, 1 insertion(+), 227 deletions(-) delete mode 100644 code/index/java/nu/marginalia/index/index/QueryBranchWalker.java delete mode 100644 code/index/test/nu/marginalia/index/index/QueryBranchWalkerTest.java diff --git a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java index 27a631f5..ba6748fc 100644 --- a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java +++ b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java @@ -35,24 +35,13 @@ public class CombinedIndexReader { } public IndexQueryBuilderImpl newQueryBuilder(IndexQuery query) { - return new IndexQueryBuilderImpl(reverseIndexFullReader, reverseIndexPriorityReader, query); + return new IndexQueryBuilderImpl(reverseIndexFullReader, query); } public QueryFilterStepIf hasWordFull(long termId) { return reverseIndexFullReader.also(termId); } - public QueryFilterStepIf hasWordPrio(long termId) { - return reverseIndexPriorityReader.also(termId); - } - - - /** Creates a query builder for terms in the priority index */ - public IndexQueryBuilder findPriorityWord(long wordId) { - return newQueryBuilder(new IndexQuery(reverseIndexPriorityReader.documents(wordId))) - .withSourceTerms(wordId); - } - /** Creates a query builder for terms in the full index */ public IndexQueryBuilder findFullWord(long wordId) { return newQueryBuilder( diff --git a/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java b/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java index 0f63fdbc..92dce62a 100644 --- a/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java +++ 
b/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java @@ -11,7 +11,6 @@ import nu.marginalia.index.query.filter.QueryFilterStepIf; public class IndexQueryBuilderImpl implements IndexQueryBuilder { private final IndexQuery query; private final ReverseIndexReader reverseIndexFullReader; - private final ReverseIndexReader reverseIndexPrioReader; /* Keep track of already added include terms to avoid redundant checks. * @@ -22,12 +21,10 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder { private final TLongHashSet alreadyConsideredTerms = new TLongHashSet(); IndexQueryBuilderImpl(ReverseIndexReader reverseIndexFullReader, - ReverseIndexReader reverseIndexPrioReader, IndexQuery query) { this.query = query; this.reverseIndexFullReader = reverseIndexFullReader; - this.reverseIndexPrioReader = reverseIndexPrioReader; } public IndexQueryBuilder withSourceTerms(long... sourceTerms) { diff --git a/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java b/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java deleted file mode 100644 index ffaa5176..00000000 --- a/code/index/java/nu/marginalia/index/index/QueryBranchWalker.java +++ /dev/null @@ -1,108 +0,0 @@ -package nu.marginalia.index.index; - -import it.unimi.dsi.fastutil.longs.LongArrayList; -import it.unimi.dsi.fastutil.longs.LongArraySet; -import it.unimi.dsi.fastutil.longs.LongSet; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.ArrayList; -import java.util.LinkedList; -import java.util.List; - -/** Helper class for index query construction */ -public class QueryBranchWalker { - private static final Logger logger = LoggerFactory.getLogger(QueryBranchWalker.class); - public final long[] priorityOrder; - public final List paths; - public final long termId; - - private QueryBranchWalker(long[] priorityOrder, List paths, long termId) { - this.priorityOrder = priorityOrder; - this.paths = paths; - this.termId = termId; - } - - public boolean 
atEnd() { - return priorityOrder.length == 0; - } - - /** Group the provided paths by the lowest termId they contain per the provided priorityOrder, - * into a list of QueryBranchWalkers. This can be performed iteratively on the resultant QBW:s - * to traverse the tree via the next() method. - *

    - * The paths can be extracted through the {@link nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates CompiledQueryAggregates} - * queriesAggregate method. - */ - public static List create(long[] priorityOrder, List paths) { - if (paths.isEmpty()) - return List.of(); - - List ret = new ArrayList<>(); - List remainingPaths = new LinkedList<>(paths); - remainingPaths.removeIf(LongSet::isEmpty); - - List pathsForPrio = new ArrayList<>(); - - for (int i = 0; i < priorityOrder.length; i++) { - long termId = priorityOrder[i]; - - var it = remainingPaths.iterator(); - - while (it.hasNext()) { - var path = it.next(); - - if (path.contains(termId)) { - // Remove the current termId from the path - path.remove(termId); - - // Add it to the set of paths associated with the termId - pathsForPrio.add(path); - - // Remove it from consideration - it.remove(); - } - } - - if (!pathsForPrio.isEmpty()) { - long[] newPrios = keepRelevantPriorities(priorityOrder, pathsForPrio); - ret.add(new QueryBranchWalker(newPrios, new ArrayList<>(pathsForPrio), termId)); - pathsForPrio.clear(); - } - } - - // This happens if the priorityOrder array doesn't contain all items in the paths, - // in practice only when an index doesn't contain all the search terms, so we can just - // skip those paths - if (!remainingPaths.isEmpty()) { - logger.debug("Dropping: {}", remainingPaths); - } - - return ret; - } - - /** From the provided priorityOrder array, keep the elements that are present in any set in paths */ - private static long[] keepRelevantPriorities(long[] priorityOrder, List paths) { - LongArrayList remainingPrios = new LongArrayList(paths.size()); - - // these sets are typically very small so array set is a good choice - LongSet allElements = new LongArraySet(priorityOrder.length); - for (var path : paths) { - allElements.addAll(path); - } - - for (var p : priorityOrder) { - if (allElements.contains(p)) - remainingPrios.add(p); - } - - return 
remainingPrios.elements(); - } - - /** Convenience method that applies the create() method - * to the priority order and paths associated with this instance */ - public List next() { - return create(priorityOrder, paths); - } - -} diff --git a/code/index/java/nu/marginalia/index/index/StatefulIndex.java b/code/index/java/nu/marginalia/index/index/StatefulIndex.java index 5c54a15b..dcaf5d7a 100644 --- a/code/index/java/nu/marginalia/index/index/StatefulIndex.java +++ b/code/index/java/nu/marginalia/index/index/StatefulIndex.java @@ -4,9 +4,6 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import it.unimi.dsi.fastutil.longs.*; import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; -import nu.marginalia.index.query.filter.QueryFilterAllOf; -import nu.marginalia.index.query.filter.QueryFilterAnyOf; -import nu.marginalia.index.query.filter.QueryFilterStepIf; import nu.marginalia.index.results.model.ids.CombinedDocIdList; import nu.marginalia.index.results.model.ids.DocMetadataList; import nu.marginalia.index.model.QueryParams; @@ -168,48 +165,6 @@ public class StatefulIndex { .toList(); } - /** Recursively create a filter step based on the QBW and its children */ - private QueryFilterStepIf createFilter(QueryBranchWalker walker, int depth) { - - // Create a filter for the current termId - final QueryFilterStepIf ownFilterCondition = ownFilterCondition(walker, depth); - - var childSteps = walker.next(); - if (childSteps.isEmpty()) // no children, and so we're satisfied with just a single filter condition - return ownFilterCondition; - - // If there are children, we append the filter conditions for each child as an anyOf condition - // to the current filter condition - - List combinedFilters = new ArrayList<>(); - - for (var step : childSteps) { - // Recursion will be limited to a fairly shallow stack depth due to how the queries are constructed. 
- var childFilter = createFilter(step, depth+1); - combinedFilters.add(new QueryFilterAllOf(ownFilterCondition, childFilter)); - } - - // Flatten the filter conditions if there's only one branch - if (combinedFilters.size() == 1) - return combinedFilters.getFirst(); - else - return new QueryFilterAnyOf(combinedFilters); - } - - /** Create a filter condition based on the termId associated with the QBW */ - private QueryFilterStepIf ownFilterCondition(QueryBranchWalker walker, int depth) { - if (depth < 2) { - // At shallow depths we prioritize terms that appear in the priority index, - // to increase the odds we find "good" results before the execution timer runs out - return new QueryFilterAnyOf( - combinedIndexReader.hasWordPrio(walker.termId), - combinedIndexReader.hasWordFull(walker.termId) - ); - } else { - return combinedIndexReader.hasWordFull(walker.termId); - } - } - private Predicate containsAll(long[] permitted) { LongSet permittedTerms = new LongOpenHashSet(permitted); return permittedTerms::containsAll; diff --git a/code/index/test/nu/marginalia/index/index/QueryBranchWalkerTest.java b/code/index/test/nu/marginalia/index/index/QueryBranchWalkerTest.java deleted file mode 100644 index 8d2f45c8..00000000 --- a/code/index/test/nu/marginalia/index/index/QueryBranchWalkerTest.java +++ /dev/null @@ -1,59 +0,0 @@ -package nu.marginalia.index.index; - -import it.unimi.dsi.fastutil.longs.LongArraySet; -import it.unimi.dsi.fastutil.longs.LongSet; -import org.junit.jupiter.api.Test; - -import java.util.Arrays; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; - -import static org.junit.jupiter.api.Assertions.*; - -class QueryBranchWalkerTest { - @Test - public void testNoOverlap() { - var paths = QueryBranchWalker.create( - new long[] { 1, 2 }, - List.of(set(1), set(2)) - ); - assertEquals(2, paths.size()); - assertEquals(Set.of(1L, 2L), paths.stream().map(path -> path.termId).collect(Collectors.toSet())); - 
} - - @Test - public void testCond() { - var paths = QueryBranchWalker.create( - new long[] { 1, 2, 3, 4 }, - List.of(set(1,2,3), set(1,4,3)) - ); - assertEquals(1, paths.size()); - assertEquals(Set.of(1L), paths.stream().map(path -> path.termId).collect(Collectors.toSet())); - System.out.println(Arrays.toString(paths.getFirst().priorityOrder)); - assertArrayEquals(new long[] { 2, 3, 4 }, paths.getFirst().priorityOrder); - - var next = paths.getFirst().next(); - assertEquals(2, next.size()); - assertEquals(Set.of(2L, 3L), next.stream().map(path -> path.termId).collect(Collectors.toSet())); - Map byId = next.stream().collect(Collectors.toMap(w -> w.termId, w->w)); - assertArrayEquals(new long[] { 3L }, byId.get(2L).priorityOrder ); - assertArrayEquals(new long[] { 4L }, byId.get(3L).priorityOrder ); - } - - @Test - public void testNoOverlapFirst() { - var paths = QueryBranchWalker.create( - new long[] { 1, 2, 3 }, - List.of(set(1, 2), set(1, 3)) - ); - assertEquals(1, paths.size()); - assertArrayEquals(new long[] { 2, 3 }, paths.getFirst().priorityOrder); - assertEquals(Set.of(1L), paths.stream().map(path -> path.termId).collect(Collectors.toSet())); - } - - LongSet set(long... args) { - return new LongArraySet(args); - } -} \ No newline at end of file From e0224085b42614fe9eb1f3b84f387faec27ca4b3 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 16 Apr 2024 22:51:03 +0200 Subject: [PATCH 51/90] (index) Improve recall for small queries Partially reverse the previous commit and add a query head for the priority index when there are few query interpretations. 
--- .../marginalia/index/index/CombinedIndexReader.java | 13 ++++++++++++- .../index/index/IndexQueryBuilderImpl.java | 3 +++ .../nu/marginalia/index/index/StatefulIndex.java | 13 ++++++++++++- 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java index ba6748fc..27a631f5 100644 --- a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java +++ b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java @@ -35,13 +35,24 @@ public class CombinedIndexReader { } public IndexQueryBuilderImpl newQueryBuilder(IndexQuery query) { - return new IndexQueryBuilderImpl(reverseIndexFullReader, query); + return new IndexQueryBuilderImpl(reverseIndexFullReader, reverseIndexPriorityReader, query); } public QueryFilterStepIf hasWordFull(long termId) { return reverseIndexFullReader.also(termId); } + public QueryFilterStepIf hasWordPrio(long termId) { + return reverseIndexPriorityReader.also(termId); + } + + + /** Creates a query builder for terms in the priority index */ + public IndexQueryBuilder findPriorityWord(long wordId) { + return newQueryBuilder(new IndexQuery(reverseIndexPriorityReader.documents(wordId))) + .withSourceTerms(wordId); + } + /** Creates a query builder for terms in the full index */ public IndexQueryBuilder findFullWord(long wordId) { return newQueryBuilder( diff --git a/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java b/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java index 92dce62a..0f63fdbc 100644 --- a/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java +++ b/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java @@ -11,6 +11,7 @@ import nu.marginalia.index.query.filter.QueryFilterStepIf; public class IndexQueryBuilderImpl implements IndexQueryBuilder { private final IndexQuery query; private final ReverseIndexReader 
reverseIndexFullReader; + private final ReverseIndexReader reverseIndexPrioReader; /* Keep track of already added include terms to avoid redundant checks. * @@ -21,10 +22,12 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder { private final TLongHashSet alreadyConsideredTerms = new TLongHashSet(); IndexQueryBuilderImpl(ReverseIndexReader reverseIndexFullReader, + ReverseIndexReader reverseIndexPrioReader, IndexQuery query) { this.query = query; this.reverseIndexFullReader = reverseIndexFullReader; + this.reverseIndexPrioReader = reverseIndexPrioReader; } public IndexQueryBuilder withSourceTerms(long... sourceTerms) { diff --git a/code/index/java/nu/marginalia/index/index/StatefulIndex.java b/code/index/java/nu/marginalia/index/index/StatefulIndex.java index dcaf5d7a..74ca220f 100644 --- a/code/index/java/nu/marginalia/index/index/StatefulIndex.java +++ b/code/index/java/nu/marginalia/index/index/StatefulIndex.java @@ -4,6 +4,9 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import it.unimi.dsi.fastutil.longs.*; import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; +import nu.marginalia.index.query.filter.QueryFilterAllOf; +import nu.marginalia.index.query.filter.QueryFilterAnyOf; +import nu.marginalia.index.query.filter.QueryFilterStepIf; import nu.marginalia.index.results.model.ids.CombinedDocIdList; import nu.marginalia.index.results.model.ids.DocMetadataList; import nu.marginalia.index.model.QueryParams; @@ -138,8 +141,16 @@ public class StatefulIndex { for (int i = 1; i < elements.size(); i++) { head.addInclusionFilter(combinedIndexReader.hasWordFull(elements.getLong(i))); } - queryHeads.add(head); + + // If there are few paths, we can afford to check the priority index as well + if (paths.size() < 4) { + var prioHead = combinedIndexReader.findPriorityWord(elements.getLong(0)); + for (int i = 1; i < elements.size(); i++) { + 
prioHead.addInclusionFilter(combinedIndexReader.hasWordPrio(elements.getLong(i))); + } + queryHeads.add(prioHead); + } } // Add additional conditions to the query heads From c583a538b15f5a4185df4cc92c03105b004666ca Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 17 Apr 2024 14:03:35 +0200 Subject: [PATCH 52/90] (search) Add implicit coherence constraints based on segmentation --- .../query_parser/QueryExpansion.java | 33 ++++++++++++++----- .../searchquery/svc/QueryFactory.java | 7 ++-- .../query/svc/QueryFactoryTest.java | 11 +++++++ 3 files changed, 39 insertions(+), 12 deletions(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java index d4e324fa..0c9fa453 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java @@ -11,7 +11,6 @@ import org.apache.commons.lang3.StringUtils; import java.util.*; import java.util.regex.Pattern; -import java.util.stream.Collectors; import java.util.stream.IntStream; /** Responsible for expanding a query, that is creating alternative branches of query execution @@ -25,8 +24,7 @@ public class QueryExpansion { private final List expansionStrategies = List.of( this::joinDashes, this::splitWordNum, - this::joinTerms, - this::createSegments + this::joinTerms ); @Inject @@ -37,7 +35,7 @@ public class QueryExpansion { this.lexicon = lexicon; } - public String expandQuery(List words) { + public Expansion expandQuery(List words) { QWordGraph graph = new QWordGraph(words); @@ -45,7 +43,11 @@ public class QueryExpansion { strategy.expand(graph); } - return QWordPathsRenderer.render(graph); + List> coherences = createSegments(graph); + + var compiled = QWordPathsRenderer.render(graph); + + return new 
Expansion(compiled, coherences); } private static final Pattern dashPattern = Pattern.compile("-"); @@ -99,8 +101,12 @@ public class QueryExpansion { /** Create an alternative interpretation of the query that replaces a sequence of words * with a word n-gram. This makes it so that when possible, the order of words in the document * matches the order of the words in the query. + * + * The function modifies the graph in place, adding new variants to the graph; but also + * returns a list of the new groupings that were added. */ - public void createSegments(QWordGraph graph) { + public List> createSegments(QWordGraph graph) + { List nodes = new ArrayList<>(); for (var qw : graph) { @@ -118,25 +124,32 @@ public class QueryExpansion { allSegments.sort(Comparator.comparing(NgramLexicon.SentenceSegment::start)); if (allSegments.isEmpty()) { - return; + return List.of(); } Set bestSegmentation = findBestSegmentation(allSegments); + List> coherences = new ArrayList<>(); + for (var segment : bestSegmentation) { int start = segment.start(); int end = segment.start() + segment.length(); - var word = IntStream.range(start, end) + List components =IntStream.range(start, end) .mapToObj(nodes::get) .map(QWord::word) - .collect(Collectors.joining("_")); + .toList(); + coherences.add(components); + + String word = String.join("_", components); graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word); } + return coherences; + } private Set findBestSegmentation(List allSegments) { @@ -178,4 +191,6 @@ public class QueryExpansion { public interface ExpansionStrategy { void expand(QWordGraph graph); } + + public record Expansion(String compiledQuery, List> extraCoherences) {} } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java index 15596d5c..382f62a8 100644 --- 
a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java @@ -137,10 +137,11 @@ public class QueryFactory { limits = limits.forSingleDomain(); } + var expansion = queryExpansion.expandQuery(searchTermsInclude); + searchTermCoherences.addAll(expansion.extraCoherences()); + var searchQuery = new SearchQuery( - queryExpansion.expandQuery( - searchTermsInclude - ), + expansion.compiledQuery(), searchTermsInclude, searchTermsExclude, searchTermsAdvice, diff --git a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java index 622130b7..d07e2d80 100644 --- a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java +++ b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java @@ -178,4 +178,15 @@ public class QueryFactoryTest { System.out.println(subquery.compiledQuery); } + + @Test + public void testExpansion2() { + + long start = System.currentTimeMillis(); + var subquery = parseAndGetSpecs("need for speed").query; + System.out.println("Time: " + (System.currentTimeMillis() - start)); + System.out.println(subquery); + + } + } \ No newline at end of file From cb4b824a850398ab863d01ce6158c0bd6a3b1fe3 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 17 Apr 2024 14:04:35 +0200 Subject: [PATCH 53/90] (index) Split ngram and regular keyword bm25 calculation and add ngram score as a bonus --- .../model/results/ResultRankingContext.java | 13 ++++++++++ .../nu/marginalia/index/IndexGrpcService.java | 13 +++++++++- .../ranking/results/ResultValuator.java | 5 ++-- .../results/factors/Bm25FullGraphVisitor.java | 25 ++++++++++++++++++- 4 files changed, 52 insertions(+), 4 deletions(-) diff --git 
a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java index 9052345a..01c017f0 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java @@ -3,11 +3,17 @@ package nu.marginalia.api.searchquery.model.results; import lombok.ToString; import nu.marginalia.api.searchquery.model.compiled.CqDataInt; +import java.util.BitSet; + @ToString public class ResultRankingContext { private final int docCount; public final ResultRankingParameters params; + + public final BitSet regularMask; + public final BitSet ngramsMask; + /** CqDataInt associated with frequency information of the terms in the query * in the full index. The dataset is indexed by the compiled query. */ public final CqDataInt fullCounts; @@ -18,11 +24,18 @@ public class ResultRankingContext { public ResultRankingContext(int docCount, ResultRankingParameters params, + BitSet ngramsMask, CqDataInt fullCounts, CqDataInt prioCounts) { this.docCount = docCount; this.params = params; + + this.ngramsMask = ngramsMask; + + this.regularMask = new BitSet(ngramsMask.length()); + this.regularMask.xor(ngramsMask); + this.fullCounts = fullCounts; this.priorityCounts = prioCounts; } diff --git a/code/index/java/nu/marginalia/index/IndexGrpcService.java b/code/index/java/nu/marginalia/index/IndexGrpcService.java index 3eb2f5d7..50fb1eb8 100644 --- a/code/index/java/nu/marginalia/index/IndexGrpcService.java +++ b/code/index/java/nu/marginalia/index/IndexGrpcService.java @@ -9,6 +9,7 @@ import io.prometheus.client.Histogram; import it.unimi.dsi.fastutil.longs.LongArrayList; import lombok.SneakyThrows; import nu.marginalia.api.searchquery.*; +import 
nu.marginalia.api.searchquery.model.compiled.CompiledQuery; import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CqDataInt; import nu.marginalia.api.searchquery.model.query.SearchSpecification; @@ -204,7 +205,9 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { return new SearchResultSet(List.of()); } - ResultRankingContext rankingContext = createRankingContext(params.rankingParams, params.compiledQueryIds); + ResultRankingContext rankingContext = createRankingContext(params.rankingParams, + params.compiledQuery, + params.compiledQueryIds); var queryExecution = new QueryExecution(rankingContext, params.fetchSize); @@ -415,20 +418,28 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { } private ResultRankingContext createRankingContext(ResultRankingParameters rankingParams, + CompiledQuery compiledQuery, CompiledQueryLong compiledQueryIds) { int[] full = new int[compiledQueryIds.size()]; int[] prio = new int[compiledQueryIds.size()]; + BitSet ngramsMask = new BitSet(compiledQuery.size()); + for (int idx = 0; idx < compiledQueryIds.size(); idx++) { long id = compiledQueryIds.at(idx); full[idx] = index.getTermFrequency(id); prio[idx] = index.getTermFrequencyPrio(id); + + if (compiledQuery.at(idx).contains("_")) { + ngramsMask.set(idx); + } } return new ResultRankingContext(index.getTotalDocCount(), rankingParams, + ngramsMask, new CqDataInt(full), new CqDataInt(prio)); } diff --git a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java index 4d257349..d233651b 100644 --- a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java +++ b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java @@ -76,7 +76,8 @@ public class ResultValuator { double bestTcf = rankingParams.tcfWeight * termCoherenceFactor.calculate(wordMeta); - double bestBM25F = 
rankingParams.bm25FullWeight * wordMeta.root.visit(new Bm25FullGraphVisitor(rankingParams.fullParams, wordMeta.data, length, ctx)); + double bestBM25F = rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forRegular(rankingParams.fullParams, wordMeta.data, length, ctx)); + double bestBM25N = 0.25 * rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forNgrams(rankingParams.fullParams, wordMeta.data, length, ctx)); double bestBM25P = rankingParams.bm25PrioWeight * wordMeta.root.visit(new Bm25PrioGraphVisitor(rankingParams.prioParams, wordMeta.data, ctx)); double overallPartPositive = Math.max(0, overallPart); @@ -84,7 +85,7 @@ public class ResultValuator { // Renormalize to 0...15, where 0 is the best possible score; // this is a historical artifact of the original ranking function - return normalize(1.5 * bestTcf + bestBM25F + bestBM25P + overallPartPositive, overallPartNegative); + return normalize(1.5 * bestTcf + bestBM25F + bestBM25P + bestBM25N + overallPartPositive, overallPartNegative); } private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) { diff --git a/code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java b/code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java index 9c46261d..4105ed6b 100644 --- a/code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java +++ b/code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java @@ -7,6 +7,7 @@ import nu.marginalia.api.searchquery.model.results.Bm25Parameters; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.model.idx.WordMetadata; +import java.util.BitSet; import java.util.List; public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor { @@ -19,15 +20,33 @@ public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor { private final int docCount; private final int 
length; - public Bm25FullGraphVisitor(Bm25Parameters bm25Parameters, + private final BitSet mask; + + private Bm25FullGraphVisitor(Bm25Parameters bm25Parameters, CqDataLong wordMetaData, int length, + BitSet mask, ResultRankingContext ctx) { this.length = length; this.bm25Parameters = bm25Parameters; this.docCount = ctx.termFreqDocCount(); this.wordMetaData = wordMetaData; this.frequencies = ctx.fullCounts; + this.mask = mask; + } + + public static Bm25FullGraphVisitor forRegular(Bm25Parameters bm25Parameters, + CqDataLong wordMetaData, + int length, + ResultRankingContext ctx) { + return new Bm25FullGraphVisitor(bm25Parameters, wordMetaData, length, ctx.regularMask, ctx); + } + + public static Bm25FullGraphVisitor forNgrams(Bm25Parameters bm25Parameters, + CqDataLong wordMetaData, + int length, + ResultRankingContext ctx) { + return new Bm25FullGraphVisitor(bm25Parameters, wordMetaData, length, ctx.ngramsMask, ctx); } @Override @@ -50,6 +69,10 @@ public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor { @Override public double onLeaf(int idx) { + if (!mask.get(idx)) { + return 0; + } + double count = Long.bitCount(WordMetadata.decodePositions(wordMetaData.get(idx))); int freq = frequencies.get(idx); From 973ced7b1333cd3ed280cea5d02e7f16c4a665a4 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 17 Apr 2024 14:12:16 +0200 Subject: [PATCH 54/90] (index) Omit absent terms from coherence checks --- .../index/results/model/TermCoherenceGroupList.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/code/index/java/nu/marginalia/index/results/model/TermCoherenceGroupList.java b/code/index/java/nu/marginalia/index/results/model/TermCoherenceGroupList.java index 4b119c60..67b5fd60 100644 --- a/code/index/java/nu/marginalia/index/results/model/TermCoherenceGroupList.java +++ b/code/index/java/nu/marginalia/index/results/model/TermCoherenceGroupList.java @@ -40,7 +40,12 @@ public record TermCoherenceGroupList(List words) { long 
overlap = 0xFF_FFFF_FFFF_FFFFL; for (var word : words) { - overlap &= documents.getTermMetadata(word, combinedId); + long meta = documents.getTermMetadata(word, combinedId); + + // if the word is not present in the document, we omit it from the coherence check + if (meta != 0L) { + overlap &= meta; + } } return WordMetadata.decodePositions(overlap) != 0L; From de0e56f02707a5e05798156f4650d3da24e6efec Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 17 Apr 2024 14:20:01 +0200 Subject: [PATCH 55/90] (index) Remove position overlap check, coherences will do the work instead --- .../results/IndexResultValuationContext.java | 42 +------------------ 1 file changed, 1 insertion(+), 41 deletions(-) diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java index 3a7f157b..1ef1f4b4 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java @@ -97,13 +97,12 @@ public class IndexResultValuationContext { boolean allSynthetic = !CompiledQueryAggregates.booleanAggregate(wordMetasQuery, WordFlags.Synthetic::isAbsent); int flagsCount = CompiledQueryAggregates.intMaxMinAggregate(wordMetasQuery, wordMeta -> Long.bitCount(wordMeta & flagsFilterMask)); int positionsCount = CompiledQueryAggregates.intMaxMinAggregate(wordMetasQuery, wordMeta -> Long.bitCount(WordMetadata.decodePositions(wordMeta))); - boolean noneOverlap = wordMetasQuery.root.visit(new PositionOverlapOperator(wordMetasQuery.data)) != 0; if (!meetsQueryStrategyRequirements(wordMetasQuery, queryParams.queryStrategy())) { return null; } - if (flagsCount == 0 && !allSynthetic && (positionsCount == 0 || noneOverlap)) + if (flagsCount == 0 && !allSynthetic && positionsCount == 0) return null; double score = searchResultValuator.calculateSearchResultValue( @@ -167,42 +166,3 @@ public class 
IndexResultValuationContext { } } - -class PositionOverlapOperator implements CqExpression.LongVisitor { - private final CqDataLong wordMetaData; - - PositionOverlapOperator(CqDataLong wordMetaData) { - this.wordMetaData = wordMetaData; - } - - @Override - public long onAnd(List parts) { - long positions = ~0; - long flags = 0; - - for (var part : parts) { - long pv = part.visit(this); - - flags |= pv; - positions &= pv; - } - - return positions | (flags & WordMetadata.FLAGS_MASK); - } - - @Override - public long onOr(List parts) { - long ret = 0; - - for (var part : parts) { - ret |= part.visit(this); - } - - return ret; - } - - @Override - public long onLeaf(int idx) { - return wordMetaData.get(idx); - } -} From 2f0b648fad14637faf8a150ba701812ad306e650 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 17 Apr 2024 16:50:26 +0200 Subject: [PATCH 56/90] (index) Add jaccard index term to boost results based on term overlap --- .../ranking/results/ResultValuator.java | 11 ++++--- .../results/factors/TermCoherenceFactor.java | 32 +++++++++++++++++-- .../factors/TermCoherenceFactorTest.java | 4 +-- 3 files changed, 38 insertions(+), 9 deletions(-) diff --git a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java index d233651b..8dcebc7a 100644 --- a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java +++ b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java @@ -74,18 +74,19 @@ public class ResultValuator { + temporalBias + flagsPenalty; - double bestTcf = rankingParams.tcfWeight * termCoherenceFactor.calculate(wordMeta); + double tcfOverlap = rankingParams.tcfWeight * termCoherenceFactor.calculateOverlap(wordMeta); + double tcfJaccard = rankingParams.tcfWeight * termCoherenceFactor.calculateAvgMutualJaccard(wordMeta, ctx); - double bestBM25F = rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forRegular(rankingParams.fullParams, 
wordMeta.data, length, ctx)); - double bestBM25N = 0.25 * rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forNgrams(rankingParams.fullParams, wordMeta.data, length, ctx)); - double bestBM25P = rankingParams.bm25PrioWeight * wordMeta.root.visit(new Bm25PrioGraphVisitor(rankingParams.prioParams, wordMeta.data, ctx)); + double bM25F = rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forRegular(rankingParams.fullParams, wordMeta.data, length, ctx)); + double bM25N = 0.25 * rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forNgrams(rankingParams.fullParams, wordMeta.data, length, ctx)); + double bM25P = rankingParams.bm25PrioWeight * wordMeta.root.visit(new Bm25PrioGraphVisitor(rankingParams.prioParams, wordMeta.data, ctx)); double overallPartPositive = Math.max(0, overallPart); double overallPartNegative = -Math.min(0, overallPart); // Renormalize to 0...15, where 0 is the best possible score; // this is a historical artifact of the original ranking function - return normalize(1.5 * bestTcf + bestBM25F + bestBM25P + bestBM25N + overallPartPositive, overallPartNegative); + return normalize(1.5 * tcfOverlap + tcfJaccard + bM25F + bM25P + bM25N + overallPartPositive, overallPartNegative); } private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) { diff --git a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java b/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java index e617549d..ce562d43 100644 --- a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java +++ b/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java @@ -2,23 +2,51 @@ package nu.marginalia.ranking.results.factors; import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; +import 
nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.model.idx.WordMetadata; /** Rewards documents where terms appear frequently within the same sentences */ public class TermCoherenceFactor { - public double calculate(CompiledQueryLong wordMetadataQuery) { + public double calculateOverlap(CompiledQueryLong wordMetadataQuery) { long mask = CompiledQueryAggregates.longBitmaskAggregate(wordMetadataQuery, score -> score >>> WordMetadata.POSITIONS_SHIFT); return bitsSetFactor(mask); } + public double calculateAvgMutualJaccard(CompiledQueryLong wordMetadataQuery, ResultRankingContext ctx) { + double sum = 0; + int cnt = 0; + + for (int i = 0; i < wordMetadataQuery.size(); i++) { + if (!ctx.regularMask.get(i)) continue; + + long imask = WordMetadata.decodePositions(wordMetadataQuery.at(i)); + + for (int j = i + 1; j < wordMetadataQuery.size(); j++) { + if (!ctx.regularMask.get(j)) continue; + + long jmask = WordMetadata.decodePositions(wordMetadataQuery.at(j)); + + long quot = Long.bitCount(imask & jmask); + long rem = Long.bitCount(imask | jmask); + + if (rem != 0) { + sum += quot/(double) rem; + cnt++; + } + } + } + + return sum / cnt; + } + double bitsSetFactor(long mask) { final int bitsSetInMask = Long.bitCount(mask); - return Math.pow(bitsSetInMask/(float) WordMetadata.POSITIONS_COUNT, 0.25); + return Math.pow(bitsSetInMask/(double) WordMetadata.POSITIONS_COUNT, 0.25); } diff --git a/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java b/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java index d0abe443..5d2b47c9 100644 --- a/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java +++ b/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java @@ -30,7 +30,7 @@ class TermCoherenceFactorTest { assertEquals(1.0, termCoherenceFactor.bitsSetFactor(mask), 0.01); assertEquals(1.0, - termCoherenceFactor.calculate( + 
termCoherenceFactor.calculateOverlap( allPositionsSet.mapToLong(SearchResultKeywordScore::encodedWordMetadata) ) ); @@ -47,7 +47,7 @@ class TermCoherenceFactorTest { assertEquals(0, termCoherenceFactor.bitsSetFactor(mask), 0.01); - assertEquals(0, termCoherenceFactor.calculate(allPositionsSet.mapToLong(SearchResultKeywordScore::encodedWordMetadata))); + assertEquals(0, termCoherenceFactor.calculateOverlap(allPositionsSet.mapToLong(SearchResultKeywordScore::encodedWordMetadata))); } @Test @SuppressWarnings("unchecked") From 44b33798f30c59c3fd258bff8873e683ba86f7df Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 17 Apr 2024 17:40:16 +0200 Subject: [PATCH 57/90] (index) Clean up jaccard index term code and down-tune the parameter's importance a bit --- .../ranking/results/ResultValuator.java | 10 +++-- .../results/factors/TermCoherenceFactor.java | 40 +++++++++++++++---- 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java index 8dcebc7a..a8718e3d 100644 --- a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java +++ b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java @@ -74,8 +74,8 @@ public class ResultValuator { + temporalBias + flagsPenalty; - double tcfOverlap = rankingParams.tcfWeight * termCoherenceFactor.calculateOverlap(wordMeta); - double tcfJaccard = rankingParams.tcfWeight * termCoherenceFactor.calculateAvgMutualJaccard(wordMeta, ctx); + double tcfOverlap = 1.5 * rankingParams.tcfWeight * termCoherenceFactor.calculateOverlap(wordMeta); + double tcfJaccard = 0.5 * rankingParams.tcfWeight * termCoherenceFactor.calculateAvgMutualJaccard(wordMeta, ctx); double bM25F = rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forRegular(rankingParams.fullParams, wordMeta.data, length, ctx)); double bM25N = 0.25 * rankingParams.bm25FullWeight * 
wordMeta.root.visit(Bm25FullGraphVisitor.forNgrams(rankingParams.fullParams, wordMeta.data, length, ctx)); @@ -86,7 +86,11 @@ public class ResultValuator { // Renormalize to 0...15, where 0 is the best possible score; // this is a historical artifact of the original ranking function - return normalize(1.5 * tcfOverlap + tcfJaccard + bM25F + bM25P + bM25N + overallPartPositive, overallPartNegative); + return normalize( + tcfOverlap + tcfJaccard + + bM25F + bM25P + bM25N + + overallPartPositive, + overallPartNegative); } private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) { diff --git a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java b/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java index ce562d43..d8739a8c 100644 --- a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java +++ b/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java @@ -9,6 +9,10 @@ import nu.marginalia.model.idx.WordMetadata; */ public class TermCoherenceFactor { + /** Calculate a factor that rewards the best total position overlap + * between the terms in the query. This is high when all the terms + * found in the same sentences. + */ public double calculateOverlap(CompiledQueryLong wordMetadataQuery) { long mask = CompiledQueryAggregates.longBitmaskAggregate(wordMetadataQuery, score -> score >>> WordMetadata.POSITIONS_SHIFT); @@ -16,31 +20,53 @@ public class TermCoherenceFactor { return bitsSetFactor(mask); } + /** Calculate a factor that rewards the best average mutual Jaccard index + * between the terms in the query. This is high when the several terms are frequently + * found in the same sentences. 
+ */ public double calculateAvgMutualJaccard(CompiledQueryLong wordMetadataQuery, ResultRankingContext ctx) { double sum = 0; int cnt = 0; for (int i = 0; i < wordMetadataQuery.size(); i++) { - if (!ctx.regularMask.get(i)) continue; + + // Skip terms that are not in the regular mask + if (!ctx.regularMask.get(i)) + continue; long imask = WordMetadata.decodePositions(wordMetadataQuery.at(i)); + // Skip terms that are not in the document + if (imask == 0L) + continue; + for (int j = i + 1; j < wordMetadataQuery.size(); j++) { - if (!ctx.regularMask.get(j)) continue; + + // Skip terms that are not in the regular mask + if (!ctx.regularMask.get(j)) + continue; long jmask = WordMetadata.decodePositions(wordMetadataQuery.at(j)); + // Skip terms that are not in the document + if (jmask == 0L) + continue; + long quot = Long.bitCount(imask & jmask); long rem = Long.bitCount(imask | jmask); - if (rem != 0) { - sum += quot/(double) rem; - cnt++; - } + // rem is always > 0 because imask and jmask are not both 0 + + sum += quot/(double) rem; + cnt++; } } - return sum / cnt; + if (cnt != 0) { + return sum / cnt; + } else { + return 0; + } } double bitsSetFactor(long mask) { From a09c84e1b8f02764a96aa064b2b842013a55c153 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 17 Apr 2024 17:54:32 +0200 Subject: [PATCH 58/90] (query) Modify tokenizer to match the behavior of the sentence extractor This must match, otherwise a query like "plato's republic" won't match the indexed keywords, since they would strip the possessive. 
--- .../functions/searchquery/query_parser/QueryTokenizer.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java index b12d68a9..80f05808 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java @@ -2,6 +2,7 @@ package nu.marginalia.functions.searchquery.query_parser; import nu.marginalia.functions.searchquery.query_parser.token.QueryToken; import nu.marginalia.language.encoding.AsciiFlattener; +import nu.marginalia.language.sentence.SentenceExtractorStringUtils; import java.util.ArrayList; import java.util.List; @@ -54,7 +55,7 @@ public class QueryTokenizer { } String displayStr = query.substring(i, end); - String str = displayStr.toLowerCase(); + String str = SentenceExtractorStringUtils.toLowerCaseStripPossessive(displayStr); tokens.add(new QueryToken.LiteralTerm(str, displayStr)); From 462aa9af261c6104faf21e815fab9fa287cb53c0 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 18 Apr 2024 10:36:15 +0200 Subject: [PATCH 59/90] (query) Update ranking parameters with new variables for bm25 ngrams and tcf mutual jaccard The change also makes it so that as long as the values are defaults, they don't need to be sent over the wire and decoded. 
--- .../api/searchquery/IndexProtobufCodec.java | 12 ++++++++++-- .../model/results/ResultRankingParameters.java | 8 ++++++-- .../api/src/main/protobuf/query-api.proto | 10 ++++++---- .../functions/searchquery/svc/QueryFactory.java | 1 - .../marginalia/ranking/results/ResultValuator.java | 4 ++-- 5 files changed, 24 insertions(+), 11 deletions(-) diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java index 4d2cf7a6..bf0f4b64 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java @@ -79,6 +79,9 @@ public class IndexProtobufCodec { } public static ResultRankingParameters convertRankingParameterss(RpcResultRankingParameters params) { + if (params == null) + return ResultRankingParameters.sensibleDefaults(); + return new ResultRankingParameters( new Bm25Parameters(params.getFullK(), params.getFullB()), new Bm25Parameters(params.getPrioK(), params.getPrioB()), @@ -89,8 +92,10 @@ public class IndexProtobufCodec { params.getShortSentenceThreshold(), params.getShortSentencePenalty(), params.getBm25FullWeight(), + params.getBm25NgramWeight(), params.getBm25PrioWeight(), - params.getTcfWeight(), + params.getTcfJaccardWeight(), + params.getTcfOverlapWeight(), ResultRankingParameters.TemporalBias.valueOf(params.getTemporalBias().getBias().name()), params.getTemporalBiasWeight() ); @@ -111,9 +116,12 @@ public class IndexProtobufCodec { .setShortSentenceThreshold(rankingParams.shortSentenceThreshold) .setShortSentencePenalty(rankingParams.shortSentencePenalty) .setBm25FullWeight(rankingParams.bm25FullWeight) + .setBm25NgramWeight(rankingParams.bm25NgramWeight) .setBm25PrioWeight(rankingParams.bm25PrioWeight) - .setTcfWeight(rankingParams.tcfWeight) + 
.setTcfOverlapWeight(rankingParams.tcfOverlapWeight) + .setTcfJaccardWeight(rankingParams.tcfJaccardWeight) .setTemporalBiasWeight(rankingParams.temporalBiasWeight); + if (temporalBias != null && temporalBias.getBias() != RpcTemporalBias.Bias.NONE) { builder.setTemporalBias(temporalBias); } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java index a16ccf8b..04a5f8e2 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java @@ -32,8 +32,10 @@ public class ResultRankingParameters { public double shortSentencePenalty; public double bm25FullWeight; + public double bm25NgramWeight; public double bm25PrioWeight; - public double tcfWeight; + public double tcfJaccardWeight; + public double tcfOverlapWeight; public TemporalBias temporalBias; public double temporalBiasWeight; @@ -49,8 +51,10 @@ public class ResultRankingParameters { .shortSentenceThreshold(2) .shortSentencePenalty(5) .bm25FullWeight(1.) + .bm25NgramWeight(.25) .bm25PrioWeight(1.) - .tcfWeight(2.) + .tcfOverlapWeight(3.) + .tcfJaccardWeight(1) .temporalBias(TemporalBias.NONE) .temporalBiasWeight(1. 
/ (5.)) .build(); diff --git a/code/functions/search-query/api/src/main/protobuf/query-api.proto b/code/functions/search-query/api/src/main/protobuf/query-api.proto index bae06e66..db6d4a35 100644 --- a/code/functions/search-query/api/src/main/protobuf/query-api.proto +++ b/code/functions/search-query/api/src/main/protobuf/query-api.proto @@ -130,10 +130,12 @@ message RpcResultRankingParameters { int32 shortSentenceThreshold = 9; double shortSentencePenalty = 10; double bm25FullWeight = 11; - double bm25PrioWeight = 12; - double tcfWeight = 13; - RpcTemporalBias temporalBias = 14; - double temporalBiasWeight = 15; + double bm25NgramWeight = 12; + double bm25PrioWeight = 13; + double tcfOverlapWeight = 14; + double tcfJaccardWeight = 15; + RpcTemporalBias temporalBias = 16; + double temporalBiasWeight = 17; } /* Defines a single subquery */ diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java index 382f62a8..ab4018ef 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java @@ -159,7 +159,6 @@ public class QueryFactory { .domains(domainIds) .queryLimits(limits) .searchSetIdentifier(params.identifier()) - .rankingParams(ResultRankingParameters.sensibleDefaults()) .queryStrategy(queryStrategy); SearchSpecification specs = specsBuilder.build(); diff --git a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java index a8718e3d..16bfa4a9 100644 --- a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java +++ b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java @@ -74,8 +74,8 @@ public class ResultValuator { + temporalBias + flagsPenalty; - double tcfOverlap = 1.5 * 
rankingParams.tcfWeight * termCoherenceFactor.calculateOverlap(wordMeta); - double tcfJaccard = 0.5 * rankingParams.tcfWeight * termCoherenceFactor.calculateAvgMutualJaccard(wordMeta, ctx); + double tcfOverlap = rankingParams.tcfOverlapWeight * termCoherenceFactor.calculateOverlap(wordMeta); + double tcfJaccard = rankingParams.tcfJaccardWeight * termCoherenceFactor.calculateAvgMutualJaccard(wordMeta, ctx); double bM25F = rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forRegular(rankingParams.fullParams, wordMeta.data, length, ctx)); double bM25N = 0.25 * rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forNgrams(rankingParams.fullParams, wordMeta.data, length, ctx)); From def36719d33674aa43a89369bccc9fd6efd99af5 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 18 Apr 2024 10:37:51 +0200 Subject: [PATCH 60/90] (query) Minor code cleanup --- .../searchquery/query_parser/QueryExpansion.java | 11 +++++------ .../nu/marginalia/query/svc/QueryFactoryTest.java | 7 +++++++ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java index 0c9fa453..6dc5b6e1 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java @@ -137,19 +137,18 @@ public class QueryExpansion { int start = segment.start(); int end = segment.start() + segment.length(); - List components =IntStream.range(start, end) - .mapToObj(nodes::get) - .map(QWord::word) - .toList(); - + List components = new ArrayList<>(end - start); + for (int i = start; i < end; i++) { + components.add(nodes.get(i).word()); + } coherences.add(components); + // Create an n-gram search term for the segment 
String word = String.join("_", components); graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word); } return coherences; - } private Set findBestSegmentation(List allSegments) { diff --git a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java index d07e2d80..319b4095 100644 --- a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java +++ b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java @@ -189,4 +189,11 @@ public class QueryFactoryTest { } + @Test + public void testExpansion3() { + long start = System.currentTimeMillis(); + var subquery = parseAndGetSpecs("plato's republic").query; + System.out.println("Time: " + (System.currentTimeMillis() - start)); + System.out.println(subquery); + } } \ No newline at end of file From 6102fd99bff4c31bae1f501df16ed59e0ba58476 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 18 Apr 2024 10:44:08 +0200 Subject: [PATCH 61/90] (qs) Improve logging --- .../java/nu/marginalia/query/QueryService.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/code/services-core/query-service/java/nu/marginalia/query/QueryService.java b/code/services-core/query-service/java/nu/marginalia/query/QueryService.java index b7dcc04c..d8a9c526 100644 --- a/code/services-core/query-service/java/nu/marginalia/query/QueryService.java +++ b/code/services-core/query-service/java/nu/marginalia/query/QueryService.java @@ -7,6 +7,8 @@ import nu.marginalia.functions.searchquery.QueryGRPCService; import nu.marginalia.service.discovery.property.ServicePartition; import nu.marginalia.service.server.BaseServiceParams; import nu.marginalia.service.server.Service; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import spark.Spark; import java.io.IOException; @@ -14,6 +16,8 @@ import java.util.List; public class QueryService extends Service { + private static final Logger 
logger = LoggerFactory.getLogger(QueryService.class); + @SneakyThrows @Inject public QueryService(BaseServiceParams params, @@ -31,6 +35,9 @@ public class QueryService extends Service { Spark.exception(Exception.class, (e, request, response) -> { response.status(500); + + logger.info("Exception in query service", e); + try { e.printStackTrace(response.raw().getWriter()); } catch (IOException ex) { From e419e26f3a9c1f3aa558f56996e83a23abb82a32 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 18 Apr 2024 10:47:12 +0200 Subject: [PATCH 62/90] (proto) Improve handling of omitted parameters --- .../api/searchquery/IndexProtobufCodec.java | 4 ++++ .../api/searchquery/QueryProtobufCodec.java | 18 +++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java index bf0f4b64..8a1c5209 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java @@ -104,6 +104,10 @@ public class IndexProtobufCodec { public static RpcResultRankingParameters convertRankingParameterss(ResultRankingParameters rankingParams, RpcTemporalBias temporalBias) { + if (rankingParams == null) { + rankingParams = ResultRankingParameters.sensibleDefaults(); + } + var builder = RpcResultRankingParameters.newBuilder() .setFullB(rankingParams.fullParams.b()) .setFullK(rankingParams.fullParams.k()) diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java index 2907992d..51d0a4d6 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java +++ 
b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java @@ -39,7 +39,9 @@ public class QueryProtobufCodec { else builder.setQueryStrategy(request.getQueryStrategy()); - builder.setParameters(IndexProtobufCodec.convertRankingParameterss(query.specs.rankingParams, request.getTemporalBias())); + if (query.specs.rankingParams != null) { + builder.setParameters(IndexProtobufCodec.convertRankingParameterss(query.specs.rankingParams, request.getTemporalBias())); + } return builder.build(); } @@ -62,12 +64,14 @@ public class QueryProtobufCodec { // Query strategy may be overridden by the query, but if not, use the one from the request builder.setQueryStrategy(query.specs.queryStrategy.name()); - builder.setParameters(IndexProtobufCodec.convertRankingParameterss( - query.specs.rankingParams, - RpcTemporalBias.newBuilder().setBias( - RpcTemporalBias.Bias.NONE) - .build()) - ); + if (query.specs.rankingParams != null) { + builder.setParameters(IndexProtobufCodec.convertRankingParameterss( + query.specs.rankingParams, + RpcTemporalBias.newBuilder().setBias( + RpcTemporalBias.Bias.NONE) + .build()) + ); + } return builder.build(); } From e79ab0c70e20a5b57e366d6e6f70c995ba9f6931 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 19 Apr 2024 11:00:56 +0200 Subject: [PATCH 63/90] (qs) Basic query debug feature --- .../results/ResultRankingParameters.java | 11 +- .../searchquery/QueryGRPCService.java | 20 ++- .../searchquery/svc/QueryFactory.java | 5 +- .../query/svc/QueryFactoryTest.java | 2 +- .../marginalia/query/QueryBasicInterface.java | 103 +++++++++++++-- .../nu/marginalia/query/QueryService.java | 6 +- .../resources/templates/qdebug.hdb | 121 ++++++++++++++++++ 7 files changed, 246 insertions(+), 22 deletions(-) create mode 100644 code/services-core/query-service/resources/templates/qdebug.hdb diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java 
b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java index 04a5f8e2..e54a994d 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingParameters.java @@ -1,11 +1,12 @@ package nu.marginalia.api.searchquery.model.results; -import lombok.AllArgsConstructor; -import lombok.Builder; -import lombok.EqualsAndHashCode; -import lombok.ToString; +import lombok.*; -@Builder @AllArgsConstructor @ToString @EqualsAndHashCode +@Builder +@AllArgsConstructor +@ToString +@EqualsAndHashCode +@Getter // getter for the mustache template engine's behalf public class ResultRankingParameters { /** Tuning for BM25 when applied to full document matches */ diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/QueryGRPCService.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/QueryGRPCService.java index d2cdd27d..8ebcabc4 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/QueryGRPCService.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/QueryGRPCService.java @@ -6,7 +6,9 @@ import io.grpc.stub.StreamObserver; import io.prometheus.client.Histogram; import lombok.SneakyThrows; import nu.marginalia.api.searchquery.*; +import nu.marginalia.api.searchquery.model.query.ProcessedQuery; import nu.marginalia.api.searchquery.model.query.QueryParams; +import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.db.DomainBlacklist; import nu.marginalia.index.api.IndexClient; import nu.marginalia.functions.searchquery.svc.QueryFactory; @@ -51,7 +53,7 @@ public class QueryGRPCService extends QueryApiGrpc.QueryApiImplBase { Integer.toString(request.getQueryLimits().getResultsTotal())) .time(() -> { var params = 
QueryProtobufCodec.convertRequest(request); - var query = queryFactory.createQuery(params); + var query = queryFactory.createQuery(params, null); RpcIndexQuery indexRequest = QueryProtobufCodec.convertQuery(request, query); List bestItems = executeQueries(indexRequest, request.getQueryLimits().getResultsTotal()); @@ -81,16 +83,26 @@ public class QueryGRPCService extends QueryApiGrpc.QueryApiImplBase { return blacklist.isBlacklisted(UrlIdCodec.getDomainId(item.getRawItem().getCombinedId())); } - public List executeDirect(String originalQuery, QueryParams params, int count) { - var query = queryFactory.createQuery(params); + public DetailedDirectResult executeDirect( + String originalQuery, + QueryParams params, + ResultRankingParameters rankingParameters, + int count) { - return executeQueries( + var query = queryFactory.createQuery(params, rankingParameters); + + var items = executeQueries( QueryProtobufCodec.convertQuery(originalQuery, query), count) .stream().map(QueryProtobufCodec::convertQueryResult) .toList(); + + return new DetailedDirectResult(query, items); } + public record DetailedDirectResult(ProcessedQuery processedQuery, + List result) {} + @SneakyThrows List executeQueries(RpcIndexQuery indexRequest, int totalSize) { var results = indexClient.executeQueries(indexRequest); diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java index ab4018ef..908eb2e2 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java @@ -17,6 +17,7 @@ import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import javax.annotation.Nullable; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -37,7 +38,8 @@ public class 
QueryFactory { - public ProcessedQuery createQuery(QueryParams params) { + public ProcessedQuery createQuery(QueryParams params, + @Nullable ResultRankingParameters rankingParams) { final var query = params.humanQuery(); if (query.length() > 1000) { @@ -156,6 +158,7 @@ public class QueryFactory { .year(year) .size(size) .rank(rank) + .rankingParams(rankingParams) .domains(domainIds) .queryLimits(limits) .searchSetIdentifier(params.identifier()) diff --git a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java index 319b4095..1576fd85 100644 --- a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java +++ b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java @@ -51,7 +51,7 @@ public class QueryFactoryTest { new QueryLimits(100, 100, 100, 100), "NONE", QueryStrategy.AUTO, - ResultRankingParameters.TemporalBias.NONE)).specs; + ResultRankingParameters.TemporalBias.NONE), null).specs; } diff --git a/code/services-core/query-service/java/nu/marginalia/query/QueryBasicInterface.java b/code/services-core/query-service/java/nu/marginalia/query/QueryBasicInterface.java index dc0ae2a0..916f5176 100644 --- a/code/services-core/query-service/java/nu/marginalia/query/QueryBasicInterface.java +++ b/code/services-core/query-service/java/nu/marginalia/query/QueryBasicInterface.java @@ -1,7 +1,10 @@ package nu.marginalia.query; +import com.google.common.base.Strings; import com.google.gson.Gson; import com.google.inject.Inject; +import nu.marginalia.api.searchquery.model.results.Bm25Parameters; +import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.functions.searchquery.QueryGRPCService; import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.model.gson.GsonFactory; @@ -15,7 +18,8 @@ import java.io.IOException; import java.util.Map; public class QueryBasicInterface { - 
private final MustacheRenderer renderer; + private final MustacheRenderer basicRenderer; + private final MustacheRenderer qdebugRenderer; private final Gson gson = GsonFactory.get(); private final QueryGRPCService queryGRPCService; @@ -25,35 +29,114 @@ public class QueryBasicInterface { QueryGRPCService queryGRPCService ) throws IOException { - this.renderer = rendererFactory.renderer("search"); + this.basicRenderer = rendererFactory.renderer("search"); + this.qdebugRenderer = rendererFactory.renderer("qdebug"); this.queryGRPCService = queryGRPCService; } - public Object handle(Request request, Response response) { - String queryParam = request.queryParams("q"); - if (queryParam == null) { - return renderer.render(new Object()); + public Object handleBasic(Request request, Response response) { + String queryParams = request.queryParams("q"); + if (queryParams == null) { + return basicRenderer.render(new Object()); } int count = request.queryParams("count") == null ? 10 : Integer.parseInt(request.queryParams("count")); int domainCount = request.queryParams("domainCount") == null ? 5 : Integer.parseInt(request.queryParams("domainCount")); String set = request.queryParams("set") == null ? 
"" : request.queryParams("set"); - var params = new QueryParams(queryParam, new QueryLimits( + var params = new QueryParams(queryParams, new QueryLimits( domainCount, count, 250, 8192 ), set); - var results = queryGRPCService.executeDirect(queryParam, params, count); + var detailedDirectResult = queryGRPCService.executeDirect(queryParams, + params, + ResultRankingParameters.sensibleDefaults(), + count); + + var results = detailedDirectResult.result(); if (request.headers("Accept").contains("application/json")) { response.type("application/json"); return gson.toJson(results); } else { - return renderer.render( - Map.of("query", queryParam, + return basicRenderer.render( + Map.of("query", queryParams, "results", results) ); } } + + public Object handleAdvanced(Request request, Response response) { + String queryString = request.queryParams("q"); + if (queryString == null) { + // Show the default query form if no query is given + return qdebugRenderer.render(Map.of("rankingParams", ResultRankingParameters.sensibleDefaults()) + ); + } + + int count = request.queryParams("count") == null ? 10 : Integer.parseInt(request.queryParams("count")); + int domainCount = request.queryParams("domainCount") == null ? 5 : Integer.parseInt(request.queryParams("domainCount")); + String set = request.queryParams("set") == null ? 
"" : request.queryParams("set"); + + var queryParams = new QueryParams(queryString, new QueryLimits( + domainCount, count, 250, 8192 + ), set); + + var rankingParams = rankingParamsFromRequest(request); + + var detailedDirectResult = queryGRPCService.executeDirect(queryString, + queryParams, + rankingParams, + count); + + var results = detailedDirectResult.result(); + + return qdebugRenderer.render( + Map.of("query", queryString, + "specs", detailedDirectResult.processedQuery().specs, + "rankingParams", rankingParams, // we can't grab this from the specs as it will null the object if it's the default values + "results", results) + ); + } + + private ResultRankingParameters rankingParamsFromRequest(Request request) { + var sensibleDefaults = ResultRankingParameters.sensibleDefaults(); + + return ResultRankingParameters.builder() + .domainRankBonus(doubleFromRequest(request, "domainRankBonus", sensibleDefaults.domainRankBonus)) + .qualityPenalty(doubleFromRequest(request, "qualityPenalty", sensibleDefaults.qualityPenalty)) + .shortDocumentThreshold(intFromRequest(request, "shortDocumentThreshold", sensibleDefaults.shortDocumentThreshold)) + .shortDocumentPenalty(doubleFromRequest(request, "shortDocumentPenalty", sensibleDefaults.shortDocumentPenalty)) + .tcfJaccardWeight(doubleFromRequest(request, "tcfJaccardWeight", sensibleDefaults.tcfJaccardWeight)) + .tcfOverlapWeight(doubleFromRequest(request, "tcfOverlapWeight", sensibleDefaults.tcfOverlapWeight)) + .fullParams(new Bm25Parameters( + doubleFromRequest(request, "fullParams.k1", sensibleDefaults.fullParams.k()), + doubleFromRequest(request, "fullParams.b", sensibleDefaults.fullParams.b()) + )) + .prioParams(new Bm25Parameters( + doubleFromRequest(request, "prioParams.k1", sensibleDefaults.prioParams.k()), + doubleFromRequest(request, "prioParams.b", sensibleDefaults.prioParams.b()) + )) + .temporalBias(ResultRankingParameters.TemporalBias.valueOf(stringFromRequest(request, "temporalBias", 
sensibleDefaults.temporalBias.toString()))) + .temporalBiasWeight(doubleFromRequest(request, "temporalBiasWeight", sensibleDefaults.temporalBiasWeight)) + .shortSentenceThreshold(intFromRequest(request, "shortSentenceThreshold", sensibleDefaults.shortSentenceThreshold)) + .shortSentencePenalty(doubleFromRequest(request, "shortSentencePenalty", sensibleDefaults.shortSentencePenalty)) + .bm25FullWeight(doubleFromRequest(request, "bm25FullWeight", sensibleDefaults.bm25FullWeight)) + .bm25NgramWeight(doubleFromRequest(request, "bm25NgramWeight", sensibleDefaults.bm25NgramWeight)) + .bm25PrioWeight(doubleFromRequest(request, "bm25PrioWeight", sensibleDefaults.bm25PrioWeight)) + .build(); + } + + double doubleFromRequest(Request request, String param, double defaultValue) { + return Strings.isNullOrEmpty(request.queryParams(param)) ? defaultValue : Double.parseDouble(request.queryParams(param)); + } + + int intFromRequest(Request request, String param, int defaultValue) { + return Strings.isNullOrEmpty(request.queryParams(param)) ? defaultValue : Integer.parseInt(request.queryParams(param)); + } + + String stringFromRequest(Request request, String param, String defaultValue) { + return Strings.isNullOrEmpty(request.queryParams(param)) ? 
defaultValue : request.queryParams(param); + } } diff --git a/code/services-core/query-service/java/nu/marginalia/query/QueryService.java b/code/services-core/query-service/java/nu/marginalia/query/QueryService.java index d8a9c526..5a2cc82f 100644 --- a/code/services-core/query-service/java/nu/marginalia/query/QueryService.java +++ b/code/services-core/query-service/java/nu/marginalia/query/QueryService.java @@ -31,7 +31,11 @@ public class QueryService extends Service { List.of(queryGRPCService, domainLinksService)); - Spark.get("/public/search", queryBasicInterface::handle); + Spark.get("/public/search", queryBasicInterface::handleBasic); + + if (!Boolean.getBoolean("noQdebug")) { + Spark.get("/public/qdebug", queryBasicInterface::handleAdvanced); + } Spark.exception(Exception.class, (e, request, response) -> { response.status(500); diff --git a/code/services-core/query-service/resources/templates/qdebug.hdb b/code/services-core/query-service/resources/templates/qdebug.hdb new file mode 100644 index 00000000..f9c3917f --- /dev/null +++ b/code/services-core/query-service/resources/templates/qdebug.hdb @@ -0,0 +1,121 @@ + + + + + + + Query Service + + +
    +

    Query Debug Service

    +
    +
    +
    +
    +
    +
    + + {{#with rankingParams}} + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    +
    + + {{/with}} +
    + + + +{{#if specs.query.compiledQuery}} +
    +

    Specs

    + + + + + + +{{#each specs.query.searchTermCoherences}} + + + + +{{/each}} +
    Compiled Query{{specs.query.compiledQuery}}
    Search Terms Include{{#each specs.query.searchTermsInclude}} {{.}} {{/each}}
    Search Terms Exclude{{#each specs.query.searchTermsExclude}} {{.}} {{/each}}
    Search Terms Advice{{#each specs.query.searchTermsAdvice}} {{.}} {{/each}}
    Search Terms Priority{{#each specs.query.searchTermsPriority}} {{.}} {{/each}}
    Coherence Requirement + {{#each .}} + {{.}} + {{/each}} +
    +{{/if}} + +{{#if results}} +
    +

    Results

    +{{#each results}} +
    + {{title}} +
    {{url}}
    +

    {{description}}

    + +
    dataHash: {{dataHash}} wordsTotal: {{wordsTotal}} bestPositions: {{bestPositions}} rankingScore: {{rankingScore}} urlQuality: {{urlQuality}}
    +
    +{{/each}} +{{/if}} + +
    + + \ No newline at end of file From eb74d08f2a8f6257265956ecc95a6f4dbb931da3 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 19 Apr 2024 11:46:27 +0200 Subject: [PATCH 64/90] (qs) Additional info in query debug UI --- .../api/searchquery/IndexProtobufCodec.java | 6 ++- .../api/searchquery/QueryProtobufCodec.java | 52 ++++++++++++++++++- .../results/DecoratedSearchResultItem.java | 10 +++- .../results/ResultRankingParameters.java | 3 ++ .../results/debug/ResultRankingDetails.java | 7 +++ .../results/debug/ResultRankingInputs.java | 3 ++ .../results/debug/ResultRankingOutputs.java | 17 ++++++ .../api/src/main/protobuf/query-api.proto | 33 ++++++++++++ .../query_parser/QueryExpansion.java | 1 - .../nu/marginalia/index/IndexGrpcService.java | 50 ++++++++++++++++++ .../results/IndexResultValuationContext.java | 3 +- .../results/IndexResultValuatorService.java | 32 +++++++++--- .../ranking/results/ResultValuator.java | 40 +++++++++++++- .../ranking/results/ResultValuatorTest.java | 9 ++-- .../marginalia/query/QueryBasicInterface.java | 1 + .../resources/templates/qdebug.hdb | 1 + 16 files changed, 250 insertions(+), 18 deletions(-) create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingDetails.java create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingInputs.java create mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingOutputs.java diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java index 8a1c5209..d582d7ce 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java @@ -97,7 +97,8 @@ 
public class IndexProtobufCodec { params.getTcfJaccardWeight(), params.getTcfOverlapWeight(), ResultRankingParameters.TemporalBias.valueOf(params.getTemporalBias().getBias().name()), - params.getTemporalBiasWeight() + params.getTemporalBiasWeight(), + params.getExportDebugData() ); } @@ -124,7 +125,8 @@ public class IndexProtobufCodec { .setBm25PrioWeight(rankingParams.bm25PrioWeight) .setTcfOverlapWeight(rankingParams.tcfOverlapWeight) .setTcfJaccardWeight(rankingParams.tcfJaccardWeight) - .setTemporalBiasWeight(rankingParams.temporalBiasWeight); + .setTemporalBiasWeight(rankingParams.temporalBiasWeight) + .setExportDebugData(rankingParams.exportDebugData); if (temporalBias != null && temporalBias.getBias() != RpcTemporalBias.Bias.NONE) { builder.setTemporalBias(temporalBias); diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java index 51d0a4d6..9830e219 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java @@ -6,6 +6,9 @@ import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.SearchResultItem; import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; +import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails; +import nu.marginalia.api.searchquery.model.results.debug.ResultRankingInputs; +import nu.marginalia.api.searchquery.model.results.debug.ResultRankingOutputs; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.model.EdgeUrl; import nu.marginalia.api.searchquery.model.query.ProcessedQuery; @@ -126,7 +129,51 @@ public class QueryProtobufCodec { results.getDataHash(), 
results.getWordsTotal(), results.getBestPositions(), - results.getRankingScore() + results.getRankingScore(), + convertRankingDetails(results.getRankingDetails()) + ); + } + + private static ResultRankingDetails convertRankingDetails(RpcResultRankingDetails rankingDetails) { + if (rankingDetails == null) + return null; + var inputs = rankingDetails.getInputs(); + var outputs = rankingDetails.getOutput(); + + return new ResultRankingDetails( + convertRankingInputs(inputs), + convertRankingOutputs(outputs) + ); + + } + + private static ResultRankingOutputs convertRankingOutputs(RpcResultRankingOutputs outputs) { + return new ResultRankingOutputs( + outputs.getAverageSentenceLengthPenalty(), + outputs.getQualityPenalty(), + outputs.getRankingBonus(), + outputs.getTopologyBonus(), + outputs.getDocumentLengthPenalty(), + outputs.getTemporalBias(), + outputs.getFlagsPenalty(), + outputs.getOverallPart(), + outputs.getTcfOverlap(), + outputs.getTcfJaccard(), + outputs.getBM25F(), + outputs.getBM25N(), + outputs.getBM25P() + ); + } + + private static ResultRankingInputs convertRankingInputs(RpcResultRankingInputs inputs) { + return new ResultRankingInputs( + inputs.getRank(), + inputs.getAsl(), + inputs.getQuality(), + inputs.getSize(), + inputs.getFlagsPenalty(), + inputs.getTopology(), + inputs.getYear() ); } @@ -209,7 +256,8 @@ public class QueryProtobufCodec { rpcDecoratedResultItem.getDataHash(), rpcDecoratedResultItem.getWordsTotal(), rpcDecoratedResultItem.getBestPositions(), - rpcDecoratedResultItem.getRankingScore() + rpcDecoratedResultItem.getRankingScore(), + convertRankingDetails(rpcDecoratedResultItem.getRankingDetails()) ); } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java index df48ea64..0522e7bc 100644 --- 
a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java @@ -2,6 +2,7 @@ package nu.marginalia.api.searchquery.model.results; import lombok.Getter; import lombok.ToString; +import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails; import nu.marginalia.model.EdgeUrl; import org.jetbrains.annotations.NotNull; @@ -33,6 +34,9 @@ public class DecoratedSearchResultItem implements Comparable detailConsumer = rankingContext.params.exportDebugData ? detailsExtractor::set : null; + + double score = resultValuator.calculateSearchResultValue(wordMetas, + result.encodedDocMetadata, + result.htmlFeatures, + docData.wordsTotal(), + rankingContext, + detailConsumer); + return new DecoratedSearchResultItem( result, docData.url(), @@ -167,15 +180,22 @@ public class IndexResultValuatorService { docData.dataHash(), docData.wordsTotal(), bestPositions(wordMetas), - - resultValuator.calculateSearchResultValue(wordMetas, - result.encodedDocMetadata, - result.htmlFeatures, - docData.wordsTotal(), - rankingContext) + score, + detailsExtractor.get() ); } + private static class ResultRankingDetailsExtractor { + private ResultRankingDetails value = null; + + public ResultRankingDetails get() { + return value; + } + public void set(ResultRankingDetails value) { + this.value = value; + } + } + private long bestPositions(CompiledQueryLong wordMetas) { LongSet positionsSet = CompiledQueryAggregates.positionsAggregate(wordMetas, WordMetadata::decodePositions); diff --git a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java index 16bfa4a9..4aec3049 100644 --- a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java +++ b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java @@ -3,6 +3,9 @@ package 
nu.marginalia.ranking.results; import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; +import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails; +import nu.marginalia.api.searchquery.model.results.debug.ResultRankingInputs; +import nu.marginalia.api.searchquery.model.results.debug.ResultRankingOutputs; import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.idx.DocumentFlags; @@ -14,6 +17,9 @@ import com.google.inject.Singleton; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import javax.annotation.Nullable; +import java.util.function.Consumer; + @Singleton public class ResultValuator { final static double scalingFactor = 500.; @@ -31,7 +37,9 @@ public class ResultValuator { long documentMetadata, int features, int length, - ResultRankingContext ctx) + ResultRankingContext ctx, + @Nullable Consumer detailsConsumer + ) { if (wordMeta.isEmpty()) return Double.MAX_VALUE; @@ -84,6 +92,36 @@ public class ResultValuator { double overallPartPositive = Math.max(0, overallPart); double overallPartNegative = -Math.min(0, overallPart); + if (null != detailsConsumer) { + var details = new ResultRankingDetails( + new ResultRankingInputs( + rank, + asl, + quality, + size, + flagsPenalty, + topology, + year + ), + new ResultRankingOutputs( + averageSentenceLengthPenalty, + qualityPenalty, + rankingBonus, + topologyBonus, + documentLengthPenalty, + temporalBias, + flagsPenalty, + overallPart, + tcfOverlap, + tcfJaccard, + bM25F, + bM25N, + bM25P) + ); + + detailsConsumer.accept(details); + } + // Renormalize to 0...15, where 0 is the best possible score; // this is a historical artifact of the original ranking function return normalize( diff --git a/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java 
b/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java index a1b66b04..de88e699 100644 --- a/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java +++ b/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java @@ -62,16 +62,17 @@ class ResultValuatorTest { when(dict.getTermFreq("bob")).thenReturn(10); ResultRankingContext context = new ResultRankingContext(100000, ResultRankingParameters.sensibleDefaults(), + new BitSet(), frequencyData, frequencyData); long docMeta = docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)); int features = 0; - double titleOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, docMeta, features, 10_000, context); - double titleLongOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, docMeta, features, 10_000, context); - double highCountNoTitle = valuator.calculateSearchResultValue(highCountNoTitleSet, docMeta, features, 10_000, context); - double highCountSubject = valuator.calculateSearchResultValue(highCountSubjectSet, docMeta, features, 10_000, context); + double titleOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, docMeta, features, 10_000, context, null); + double titleLongOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, docMeta, features, 10_000, context, null); + double highCountNoTitle = valuator.calculateSearchResultValue(highCountNoTitleSet, docMeta, features, 10_000, context, null); + double highCountSubject = valuator.calculateSearchResultValue(highCountSubjectSet, docMeta, features, 10_000, context, null); System.out.println(titleOnlyLowCount); System.out.println(titleLongOnlyLowCount); diff --git a/code/services-core/query-service/java/nu/marginalia/query/QueryBasicInterface.java b/code/services-core/query-service/java/nu/marginalia/query/QueryBasicInterface.java index 916f5176..152f6a78 100644 --- a/code/services-core/query-service/java/nu/marginalia/query/QueryBasicInterface.java 
+++ b/code/services-core/query-service/java/nu/marginalia/query/QueryBasicInterface.java @@ -125,6 +125,7 @@ public class QueryBasicInterface { .bm25FullWeight(doubleFromRequest(request, "bm25FullWeight", sensibleDefaults.bm25FullWeight)) .bm25NgramWeight(doubleFromRequest(request, "bm25NgramWeight", sensibleDefaults.bm25NgramWeight)) .bm25PrioWeight(doubleFromRequest(request, "bm25PrioWeight", sensibleDefaults.bm25PrioWeight)) + .exportDebugData(true) .build(); } diff --git a/code/services-core/query-service/resources/templates/qdebug.hdb b/code/services-core/query-service/resources/templates/qdebug.hdb index f9c3917f..ca072f75 100644 --- a/code/services-core/query-service/resources/templates/qdebug.hdb +++ b/code/services-core/query-service/resources/templates/qdebug.hdb @@ -112,6 +112,7 @@

    {{description}}

    dataHash: {{dataHash}} wordsTotal: {{wordsTotal}} bestPositions: {{bestPositions}} rankingScore: {{rankingScore}} urlQuality: {{urlQuality}}
    +
    {{rankingDetails}}
    {{/each}} {{/if}} From b80a83339beeaec461ddb756684c18bdbf9f3263 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 19 Apr 2024 12:18:53 +0200 Subject: [PATCH 65/90] (qs) Additional info in query debug UI --- .../api/searchquery/IndexProtobufCodec.java | 44 +++++++++++++++++++ .../api/searchquery/QueryProtobufCodec.java | 4 +- .../results/debug/ResultRankingInputs.java | 4 +- .../api/src/main/protobuf/query-api.proto | 6 +-- .../nu/marginalia/index/IndexGrpcService.java | 43 +----------------- .../index/results/IndexMetadataService.java | 14 +++--- .../ranking/results/ResultValuator.java | 4 +- .../resources/templates/qdebug.hdb | 26 ++++++++++- 8 files changed, 88 insertions(+), 57 deletions(-) diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java index d582d7ce..af783a83 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java @@ -3,6 +3,9 @@ package nu.marginalia.api.searchquery; import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.Bm25Parameters; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; +import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails; +import nu.marginalia.api.searchquery.model.results.debug.ResultRankingInputs; +import nu.marginalia.api.searchquery.model.results.debug.ResultRankingOutputs; import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.SpecificationLimit; import nu.marginalia.index.query.limit.SpecificationLimitType; @@ -139,4 +142,45 @@ public class IndexProtobufCodec { return builder.build(); } + + public static RpcResultRankingDetails convertRankingDetails(ResultRankingDetails rankingDetails) { + if 
(rankingDetails == null) { + return null; + } + + return RpcResultRankingDetails.newBuilder() + .setInputs(convertRankingInputs(rankingDetails.inputs())) + .setOutput(convertRankingOutput(rankingDetails.outputs())) + .build(); + } + + private static RpcResultRankingOutputs convertRankingOutput(ResultRankingOutputs outputs) { + return RpcResultRankingOutputs.newBuilder() + .setAverageSentenceLengthPenalty(outputs.averageSentenceLengthPenalty()) + .setQualityPenalty(outputs.qualityPenalty()) + .setRankingBonus(outputs.rankingBonus()) + .setTopologyBonus(outputs.topologyBonus()) + .setDocumentLengthPenalty(outputs.documentLengthPenalty()) + .setTemporalBias(outputs.temporalBias()) + .setFlagsPenalty(outputs.flagsPenalty()) + .setOverallPart(outputs.overallPart()) + .setTcfOverlap(outputs.tcfOverlap()) + .setTcfJaccard(outputs.tcfJaccard()) + .setBM25F(outputs.bM25F()) + .setBM25N(outputs.bM25N()) + .setBM25P(outputs.bM25P()) + .build(); + } + + private static RpcResultRankingInputs convertRankingInputs(ResultRankingInputs inputs) { + return RpcResultRankingInputs.newBuilder() + .setRank(inputs.rank()) + .setAsl(inputs.asl()) + .setQuality(inputs.quality()) + .setSize(inputs.size()) + .setTopology(inputs.topology()) + .setYear(inputs.year()) + .addAllFlags(inputs.flags()) + .build(); + } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java index 9830e219..58a20a8a 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java @@ -171,9 +171,9 @@ public class QueryProtobufCodec { inputs.getAsl(), inputs.getQuality(), inputs.getSize(), - inputs.getFlagsPenalty(), inputs.getTopology(), - inputs.getYear() + inputs.getYear(), + inputs.getFlagsList() ); } diff --git 
a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingInputs.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingInputs.java index d9aa139f..86169416 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingInputs.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingInputs.java @@ -1,3 +1,5 @@ package nu.marginalia.api.searchquery.model.results.debug; -public record ResultRankingInputs(int rank, int asl, int quality, int size, int flagsPenalty, int topology, int year) {} +import java.util.List; + +public record ResultRankingInputs(int rank, int asl, int quality, int size, int topology, int year, List flags) {} diff --git a/code/functions/search-query/api/src/main/protobuf/query-api.proto b/code/functions/search-query/api/src/main/protobuf/query-api.proto index f6890239..eb4e48ba 100644 --- a/code/functions/search-query/api/src/main/protobuf/query-api.proto +++ b/code/functions/search-query/api/src/main/protobuf/query-api.proto @@ -150,9 +150,9 @@ message RpcResultRankingInputs { int32 asl = 2; int32 quality = 3; int32 size = 4; - int32 flagsPenalty = 5; - int32 topology = 6; - int32 year = 7; + int32 topology = 5; + int32 year = 6; + repeated string flags = 7; } message RpcResultRankingOutputs { diff --git a/code/index/java/nu/marginalia/index/IndexGrpcService.java b/code/index/java/nu/marginalia/index/IndexGrpcService.java index e37d2c0f..1e456d31 100644 --- a/code/index/java/nu/marginalia/index/IndexGrpcService.java +++ b/code/index/java/nu/marginalia/index/IndexGrpcService.java @@ -163,9 +163,8 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { .setBestPositions(result.bestPositions) .setRawItem(rawItem); - var rankingDetails = convertRankingDetails(result.rankingDetails); + var rankingDetails = 
IndexProtobufCodec.convertRankingDetails(result.rankingDetails); if (rankingDetails != null) { - logger.info(queryMarker, "Ranking details: {}", rankingDetails); decoratedBuilder.setRankingDetails(rankingDetails); } @@ -183,46 +182,6 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { } } - private RpcResultRankingDetails convertRankingDetails(ResultRankingDetails rankingDetails) { - if (rankingDetails == null) { - return null; - } - - return RpcResultRankingDetails.newBuilder() - .setInputs(convertRankingInputs(rankingDetails.inputs())) - .setOutput(convertRankingOutput(rankingDetails.outputs())) - .build(); - } - - private RpcResultRankingOutputs convertRankingOutput(ResultRankingOutputs outputs) { - return RpcResultRankingOutputs.newBuilder() - .setAverageSentenceLengthPenalty(outputs.averageSentenceLengthPenalty()) - .setQualityPenalty(outputs.qualityPenalty()) - .setRankingBonus(outputs.rankingBonus()) - .setTopologyBonus(outputs.topologyBonus()) - .setDocumentLengthPenalty(outputs.documentLengthPenalty()) - .setTemporalBias(outputs.temporalBias()) - .setFlagsPenalty(outputs.flagsPenalty()) - .setOverallPart(outputs.overallPart()) - .setTcfOverlap(outputs.tcfOverlap()) - .setTcfJaccard(outputs.tcfJaccard()) - .setBM25F(outputs.bM25F()) - .setBM25N(outputs.bM25N()) - .setBM25P(outputs.bM25P()) - .build(); - } - - private RpcResultRankingInputs convertRankingInputs(ResultRankingInputs inputs) { - return RpcResultRankingInputs.newBuilder() - .setRank(inputs.rank()) - .setAsl(inputs.asl()) - .setQuality(inputs.quality()) - .setSize(inputs.size()) - .setFlagsPenalty(inputs.flagsPenalty()) - .setTopology(inputs.topology()) - .setYear(inputs.year()) - .build(); - } // exists for test access @SneakyThrows diff --git a/code/index/java/nu/marginalia/index/results/IndexMetadataService.java b/code/index/java/nu/marginalia/index/results/IndexMetadataService.java index ce23c3f2..a43f9436 100644 --- 
a/code/index/java/nu/marginalia/index/results/IndexMetadataService.java +++ b/code/index/java/nu/marginalia/index/results/IndexMetadataService.java @@ -65,13 +65,15 @@ public class IndexMetadataService { for (var term : searchQuery.searchTermsPriority) { if (termToId.containsKey(term)) { - continue; + long id = SearchTermsUtil.getWordId(term); + termIdsPrio.add(id); + } + else { + long id = SearchTermsUtil.getWordId(term); + termIdsList.add(id); + termIdsPrio.add(id); + termToId.put(term, id); } - - long id = SearchTermsUtil.getWordId(term); - termIdsList.add(id); - termIdsPrio.add(id); - termToId.put(term, id); } return new QuerySearchTerms(termToId, diff --git a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java index 4aec3049..1a89b80b 100644 --- a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java +++ b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java @@ -99,9 +99,9 @@ public class ResultValuator { asl, quality, size, - flagsPenalty, topology, - year + year, + DocumentFlags.decode(documentMetadata).stream().map(Enum::name).toList() ), new ResultRankingOutputs( averageSentenceLengthPenalty, diff --git a/code/services-core/query-service/resources/templates/qdebug.hdb b/code/services-core/query-service/resources/templates/qdebug.hdb index ca072f75..4081317f 100644 --- a/code/services-core/query-service/resources/templates/qdebug.hdb +++ b/code/services-core/query-service/resources/templates/qdebug.hdb @@ -112,7 +112,31 @@

    {{description}}

    dataHash: {{dataHash}} wordsTotal: {{wordsTotal}} bestPositions: {{bestPositions}} rankingScore: {{rankingScore}} urlQuality: {{urlQuality}}
    -
    {{rankingDetails}}
    + {{#with rankingDetails.inputs}} +
    Rank: {{rank}}
    +
    ASL: {{asl}}
    +
    Quality: {{quality}}
    +
    Size: {{size}}
    +
    Topology: {{topology}}
    +
    Year: {{year}}
    +
    Flags: {{#each flags}} {{.}} {{/each}}
    + {{/with}} + {{#with rankingDetails.outputs}} +
    Average Sentence Length Penalty: {{averageSentenceLengthPenalty}}
    +
    Quality Penalty: {{qualityPenalty}}
    +
    Ranking Bonus: {{rankingBonus}}
    +
    Topology Bonus: {{topologyBonus}}
    +
    Document Length Penalty: {{documentLengthPenalty}}
    +
    Temporal Bias: {{temporalBias}}
    +
    Flags Penalty: {{flagsPenalty}}
    +
    Overall Part: {{overallPart}}
    +
    TCF Overlap: {{tcfOverlap}}
    +
    TCF Jaccard: {{tcfJaccard}}
    +
    BM25 Full: {{bM25F}}
    +
    BM25 Ngram: {{bM25N}}
    +
    BM25 Prio: {{bM25P}}
    + {{/with}} + {{/each}} {{/if}} From 0dcca0cb8305bceb9cff9f80cfaec39fd1cb5f03 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 19 Apr 2024 12:19:26 +0200 Subject: [PATCH 66/90] (index) Fix TCF bug where the ngram terms would be considered instead of the regular ones due to a logical derp --- .../api/searchquery/model/results/ResultRankingContext.java | 6 +++++- .../ranking/results/factors/TermCoherenceFactor.java | 5 ++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java index 01c017f0..aca77bd5 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java @@ -34,7 +34,11 @@ public class ResultRankingContext { this.ngramsMask = ngramsMask; this.regularMask = new BitSet(ngramsMask.length()); - this.regularMask.xor(ngramsMask); + for (int i = 0; i < ngramsMask.length(); i++) { + if (!ngramsMask.get(i)) { + regularMask.set(i); + } + } this.fullCounts = fullCounts; this.priorityCounts = prioCounts; diff --git a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java b/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java index d8739a8c..f535403c 100644 --- a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java +++ b/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java @@ -14,6 +14,9 @@ public class TermCoherenceFactor { * found in the same sentences. 
*/ public double calculateOverlap(CompiledQueryLong wordMetadataQuery) { + if (wordMetadataQuery.size() <= 2) + return 0; + long mask = CompiledQueryAggregates.longBitmaskAggregate(wordMetadataQuery, score -> score >>> WordMetadata.POSITIONS_SHIFT); @@ -62,7 +65,7 @@ public class TermCoherenceFactor { } } - if (cnt != 0) { + if (cnt > 0) { return sum / cnt; } else { return 0; From a748fc5448bd27de2ebcb773628ae54f592bdfc5 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 19 Apr 2024 12:41:26 +0200 Subject: [PATCH 67/90] (index, bugfix) Pass url quality to query service --- code/index/java/nu/marginalia/index/IndexGrpcService.java | 1 + 1 file changed, 1 insertion(+) diff --git a/code/index/java/nu/marginalia/index/IndexGrpcService.java b/code/index/java/nu/marginalia/index/IndexGrpcService.java index 1e456d31..94aedb83 100644 --- a/code/index/java/nu/marginalia/index/IndexGrpcService.java +++ b/code/index/java/nu/marginalia/index/IndexGrpcService.java @@ -159,6 +159,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { .setRankingScore(result.rankingScore) .setTitle(result.title) .setUrl(result.url.toString()) + .setUrlQuality(result.urlQuality) .setWordsTotal(result.wordsTotal) .setBestPositions(result.bestPositions) .setRawItem(rawItem); From f4a2fea4518c7c94a1119e09d0320a36007307ff Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 19 Apr 2024 12:41:48 +0200 Subject: [PATCH 68/90] (ranking, bugfix) Use bm25NgramWeight and not full weight for bM25N --- .../java/nu/marginalia/ranking/results/ResultValuator.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java index 1a89b80b..a389c8c9 100644 --- a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java +++ b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java @@ -86,7 +86,7 @@ public class ResultValuator { 
double tcfJaccard = rankingParams.tcfJaccardWeight * termCoherenceFactor.calculateAvgMutualJaccard(wordMeta, ctx); double bM25F = rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forRegular(rankingParams.fullParams, wordMeta.data, length, ctx)); - double bM25N = 0.25 * rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forNgrams(rankingParams.fullParams, wordMeta.data, length, ctx)); + double bM25N = rankingParams.bm25NgramWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forNgrams(rankingParams.fullParams, wordMeta.data, length, ctx)); double bM25P = rankingParams.bm25PrioWeight * wordMeta.root.visit(new Bm25PrioGraphVisitor(rankingParams.prioParams, wordMeta.data, ctx)); double overallPartPositive = Math.max(0, overallPart); From f623b37577de79fff453e65d4a17976606ba711a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 19 Apr 2024 13:58:28 +0200 Subject: [PATCH 69/90] (ranking) Suppress NaN:s in ranking output --- .../nu/marginalia/ranking/results/ResultValuator.java | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java index a389c8c9..4f149c4d 100644 --- a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java +++ b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java @@ -124,11 +124,20 @@ public class ResultValuator { // Renormalize to 0...15, where 0 is the best possible score; // this is a historical artifact of the original ranking function - return normalize( + double ret = normalize( tcfOverlap + tcfJaccard + bM25F + bM25P + bM25N + overallPartPositive, overallPartNegative); + + if (Double.isNaN(ret)) { + if (getClass().desiredAssertionStatus()) { + throw new IllegalStateException("NaN in result value calculation"); + } + + return Double.MAX_VALUE; + } + return ret; } private double calculateQualityPenalty(int size, int 
quality, ResultRankingParameters rankingParams) { From 4489b21528dad290e359864e233e63e11ecf9eb9 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 19 Apr 2024 14:13:12 +0200 Subject: [PATCH 70/90] (ranking) Cleanup --- .../api/searchquery/model/results/SearchResultSet.java | 10 ---------- .../functions/searchquery/QueryGRPCService.java | 7 +++++-- .../nu/marginalia/ranking/results/ResultValuator.java | 4 +++- 3 files changed, 8 insertions(+), 13 deletions(-) diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultSet.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultSet.java index c7e76fd9..09468162 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultSet.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultSet.java @@ -19,14 +19,4 @@ public class SearchResultSet { return results.size(); } - public static SearchResultSet combine(SearchResultSet l, SearchResultSet r) { - List combinedItems = new ArrayList<>(l.size() + r.size()); - combinedItems.addAll(l.results); - combinedItems.addAll(r.results); - - // TODO: Do we combine these correctly? 
- combinedItems.sort(Comparator.comparing(item -> item.rankingScore)); - - return new SearchResultSet(combinedItems); - } } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/QueryGRPCService.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/QueryGRPCService.java index 8ebcabc4..98f7fb6f 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/QueryGRPCService.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/QueryGRPCService.java @@ -53,7 +53,7 @@ public class QueryGRPCService extends QueryApiGrpc.QueryApiImplBase { Integer.toString(request.getQueryLimits().getResultsTotal())) .time(() -> { var params = QueryProtobufCodec.convertRequest(request); - var query = queryFactory.createQuery(params, null); + var query = queryFactory.createQuery(params, ResultRankingParameters.sensibleDefaults()); RpcIndexQuery indexRequest = QueryProtobufCodec.convertQuery(request, query); List bestItems = executeQueries(indexRequest, request.getQueryLimits().getResultsTotal()); @@ -109,7 +109,10 @@ public class QueryGRPCService extends QueryApiGrpc.QueryApiImplBase { results.sort(comparator); results.removeIf(this::isBlacklisted); - return results.subList(0, Math.min(totalSize, results.size())); + if (results.size() > totalSize) { + results = results.subList(0, totalSize); + } + return results; } } diff --git a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java index 4f149c4d..1e026b40 100644 --- a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java +++ b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java @@ -137,7 +137,9 @@ public class ResultValuator { return Double.MAX_VALUE; } - return ret; + else { + return ret; + } } private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) { From 
5165cf6d1506217ba710011334dba7be37bf93cc Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 19 Apr 2024 14:31:57 +0200 Subject: [PATCH 71/90] (ranking) Set regularMask correctly --- .../searchquery/model/results/ResultRankingContext.java | 9 ++------- .../index/java/nu/marginalia/index/IndexGrpcService.java | 5 +++++ .../marginalia/ranking/results/ResultValuatorTest.java | 1 + 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java index aca77bd5..405ab5dc 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/ResultRankingContext.java @@ -25,6 +25,7 @@ public class ResultRankingContext { public ResultRankingContext(int docCount, ResultRankingParameters params, BitSet ngramsMask, + BitSet regularMask, CqDataInt fullCounts, CqDataInt prioCounts) { @@ -32,13 +33,7 @@ public class ResultRankingContext { this.params = params; this.ngramsMask = ngramsMask; - - this.regularMask = new BitSet(ngramsMask.length()); - for (int i = 0; i < ngramsMask.length(); i++) { - if (!ngramsMask.get(i)) { - regularMask.set(i); - } - } + this.regularMask = regularMask; this.fullCounts = fullCounts; this.priorityCounts = prioCounts; diff --git a/code/index/java/nu/marginalia/index/IndexGrpcService.java b/code/index/java/nu/marginalia/index/IndexGrpcService.java index 94aedb83..806b79d4 100644 --- a/code/index/java/nu/marginalia/index/IndexGrpcService.java +++ b/code/index/java/nu/marginalia/index/IndexGrpcService.java @@ -436,6 +436,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { int[] prio = new int[compiledQueryIds.size()]; BitSet ngramsMask = new BitSet(compiledQuery.size()); + BitSet 
regularMask = new BitSet(compiledQuery.size()); for (int idx = 0; idx < compiledQueryIds.size(); idx++) { long id = compiledQueryIds.at(idx); @@ -445,11 +446,15 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { if (compiledQuery.at(idx).contains("_")) { ngramsMask.set(idx); } + else { + regularMask.set(idx); + } } return new ResultRankingContext(index.getTotalDocCount(), rankingParams, ngramsMask, + regularMask, new CqDataInt(full), new CqDataInt(prio)); } diff --git a/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java b/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java index de88e699..41906904 100644 --- a/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java +++ b/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java @@ -63,6 +63,7 @@ class ResultValuatorTest { ResultRankingContext context = new ResultRankingContext(100000, ResultRankingParameters.sensibleDefaults(), new BitSet(), + new BitSet(), frequencyData, frequencyData); From 64baa41e649d5096535b42505250471ac39e06aa Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 19 Apr 2024 19:42:30 +0200 Subject: [PATCH 72/90] (query) Always generate an ngram alternative, suppresses generation of multiple identical query branches --- .../query_parser/QueryExpansion.java | 20 ++++++++++++++++++- .../searchquery/query_parser/model/QWord.java | 20 +++++++++++++++++++ .../query_parser/model/QWordGraph.java | 16 +++++++-------- 3 files changed, 47 insertions(+), 9 deletions(-) diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java index b8806ca3..45c9e67f 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java +++ 
b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java @@ -11,6 +11,7 @@ import org.apache.commons.lang3.StringUtils; import java.util.*; import java.util.regex.Pattern; +import java.util.stream.Collectors; /** Responsible for expanding a query, that is creating alternative branches of query execution * to increase the number of results @@ -23,7 +24,8 @@ public class QueryExpansion { private final List expansionStrategies = List.of( this::joinDashes, this::splitWordNum, - this::joinTerms + this::joinTerms, + this::ngramAll ); @Inject @@ -63,6 +65,22 @@ public class QueryExpansion { } + public void ngramAll(QWordGraph graph) { + List parts = new ArrayList<>(); + + for (var qw : graph) { + if (qw.isBeg() || qw.isEnd()) + continue; + + parts.add(qw); + } + + if (parts.size() > 1) { + graph.addVariantForSpan(parts.getFirst(), parts.getLast(), + parts.stream().map(QWord::word).collect(Collectors.joining("_"))); + } + } + // Turn 'MP3' into 'MP-3' public void splitWordNum(QWordGraph graph) { for (var qw : graph) { diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWord.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWord.java index eac2e68b..b3da9086 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWord.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWord.java @@ -2,6 +2,8 @@ package nu.marginalia.functions.searchquery.query_parser.model; import ca.rmen.porterstemmer.PorterStemmer; +import java.util.Objects; + public record QWord( int ord, boolean variant, @@ -48,4 +50,22 @@ public record QWord( public String toString() { return STR."q{\{word}}"; } + + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + + QWord qWord = (QWord) o; + 
return variant == qWord.variant && Objects.equals(word, qWord.word) && Objects.equals(stemmed, qWord.stemmed) && Objects.equals(isOriginal(), qWord.isOriginal()); + } + + @Override + public int hashCode() { + int result = Boolean.hashCode(variant); + result = 31 * result + Objects.hashCode(stemmed); + result = 31 * result + Objects.hashCode(word); + result = 31 * result + Objects.hashCode(isOriginal()); + return result; + } } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java index a8b1a768..a5ab80d1 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java @@ -19,8 +19,8 @@ public class QWordGraph implements Iterable { public record QWordGraphLink(QWord from, QWord to) {} private final List links = new ArrayList<>(); - private final Map> fromTo = new HashMap<>(); - private final Map> toFrom = new HashMap<>(); + private final Map> fromTo = new HashMap<>(); + private final Map> toFrom = new HashMap<>(); private int wordId = 0; @@ -79,8 +79,8 @@ public class QWordGraph implements Iterable { public void addLink(QWord from, QWord to) { links.add(new QWordGraphLink(from, to)); - fromTo.computeIfAbsent(from, k -> new ArrayList<>()).add(to); - toFrom.computeIfAbsent(to, k -> new ArrayList<>()).add(from); + fromTo.computeIfAbsent(from.ord(), k -> new ArrayList<>()).add(to); + toFrom.computeIfAbsent(to.ord(), k -> new ArrayList<>()).add(from); } public List links() { @@ -103,20 +103,20 @@ public class QWordGraph implements Iterable { } public List getNext(QWord word) { - return fromTo.getOrDefault(word, List.of()); + return fromTo.getOrDefault(word.ord(), List.of()); } public List getNextOriginal(QWord word) { - return 
fromTo.getOrDefault(word, List.of()) + return fromTo.getOrDefault(word.ord(), List.of()) .stream() .filter(QWord::isOriginal) .toList(); } public List getPrev(QWord word) { - return toFrom.getOrDefault(word, List.of()); + return toFrom.getOrDefault(word.ord(), List.of()); } public List getPrevOriginal(QWord word) { - return toFrom.getOrDefault(word, List.of()) + return toFrom.getOrDefault(word.ord(), List.of()) .stream() .filter(QWord::isOriginal) .toList(); From 934167323d40562aec8347b037e1a9f141e9ef4d Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Fri, 19 Apr 2024 20:36:01 +0200 Subject: [PATCH 73/90] (converter) Stopgap fix for some cases of lost crawl data due to HTTP 304. The root cause needs further investigation. --- .../nu/marginalia/converting/processor/DomainProcessor.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java index ac10bcb9..7ec0bf29 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -193,6 +193,8 @@ public class DomainProcessor { continue; if (doc.url == null) continue; + if (doc.documentBody.isBlank()) + continue; if (!processedUrls.add(doc.url)) continue; From f46733a47afb24314886f6a536ce12e1e60edf59 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 21 Apr 2024 12:29:25 +0200 Subject: [PATCH 74/90] (ranking) TermCoherenceFactory should be run for size=2 queries --- .../marginalia/ranking/results/factors/TermCoherenceFactor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java b/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java index f535403c..3bda0580 
100644 --- a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java +++ b/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java @@ -14,7 +14,7 @@ public class TermCoherenceFactor { * found in the same sentences. */ public double calculateOverlap(CompiledQueryLong wordMetadataQuery) { - if (wordMetadataQuery.size() <= 2) + if (wordMetadataQuery.size() < 2) return 0; long mask = CompiledQueryAggregates.longBitmaskAggregate(wordMetadataQuery, From ad2ac8eee3b2cb9704d0bbec50cbc976a146fd24 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Sun, 21 Apr 2024 12:30:14 +0200 Subject: [PATCH 75/90] (query) Mark flaky test, correct assert on test --- .../searchquery/query_parser/model/QWordGraphTest.java | 2 +- .../test/nu/marginalia/query/svc/QueryFactoryTest.java | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java index f985cd13..df4da566 100644 --- a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java +++ b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java @@ -100,7 +100,7 @@ class QWordGraphTest { assertEquals("a b ( c | d )", graph.compileToQuery()); } - @Test + @Test // this test is a bit flaky, the order of the variants is not guaranteed void testCompile5() { // Construct a graph like diff --git a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java index 1576fd85..4310e89d 100644 --- a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java +++ b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java @@ -134,7 +134,7 @@ 
public class QueryFactoryTest { { // tde isn't a stopword, so we should get the normal behavior var specs = parseAndGetSpecs("\"tde shining\""); - assertEquals("tde shining", specs.query.compiledQuery); + assertEquals("( shining tde | tde_shining )", specs.query.compiledQuery); assertEquals(List.of("tde_shining"), specs.query.searchTermsAdvice); assertEquals(List.of(List.of("tde", "shining")), specs.query.searchTermCoherences); } @@ -192,7 +192,7 @@ public class QueryFactoryTest { @Test public void testExpansion3() { long start = System.currentTimeMillis(); - var subquery = parseAndGetSpecs("plato's republic").query; + var subquery = parseAndGetSpecs("buy rimonabant buy acomplia"); System.out.println("Time: " + (System.currentTimeMillis() - start)); System.out.println(subquery); } From 8a891c215930df3ae4cb3d5813505dc938f7ec83 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 22 Apr 2024 12:34:28 +0200 Subject: [PATCH 76/90] (crawler/converter) Remove legacy junk from parquet migration --- .../nu/marginalia/extractor/AtagExporter.java | 2 +- .../nu/marginalia/extractor/FeedExporter.java | 2 +- .../extractor/TermFrequencyExporter.java | 2 +- .../crawling/body/ContentTypeLogic.java | 4 +- .../crawling/io/CrawledDomainReader.java | 33 +---- .../crawling/io/CrawledDomainWriter.java | 66 ---------- .../crawling/io/CrawlerOutputFile.java | 27 ----- ...ibleLegacySerializableCrawlDataStream.java | 113 ------------------ ...FastLegacySerializableCrawlDataStream.java | 74 ------------ .../ParquetSerializableCrawlDataStream.java | 31 ++--- .../crawling/model/CrawledDocument.java | 6 - .../crawling/model/CrawledDomain.java | 9 -- .../crawling/model/SerializableCrawlData.java | 1 - ...rawledDocumentParquetRecordFileReader.java | 1 - ...rawledDocumentParquetRecordFileWriter.java | 2 - .../crawling-model/java/plan/CrawlPlan.java | 105 ---------------- .../java/plan/CrawlPlanLoader.java | 25 ---- ...edDocumentParquetRecordFileWriterTest.java | 34 +++++- 
.../marginalia/converting/ConverterMain.java | 76 ++++++++++-- .../converting/model/CrawlPlan.java | 15 +++ .../marginalia/converting/model/WorkDir.java | 13 ++ .../java/nu/marginalia/crawl/CrawlerMain.java | 2 +- .../crawl/retreival/CrawlerRetreiver.java | 1 + .../retreival/fetcher/HttpFetcherImpl.java | 1 - .../retreival/revisit/CrawlerRevisitor.java | 28 +++-- .../crawling/CrawlPlanLoaderTest.java | 51 -------- .../retreival/CrawlerRetreiverTest.java | 16 +-- code/tools/crawl-data-unfcker/build.gradle | 57 --------- .../nu/marginalia/tools/CrawlDataUnfcker.java | 75 ------------ code/tools/crawl-data-unfcker/readme.md | 3 - .../tools/ExperimentRunnerMain.java | 2 +- settings.gradle | 1 - 32 files changed, 175 insertions(+), 703 deletions(-) delete mode 100644 code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawledDomainWriter.java delete mode 100644 code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/CompatibleLegacySerializableCrawlDataStream.java delete mode 100644 code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/FastLegacySerializableCrawlDataStream.java delete mode 100644 code/process-models/crawling-model/java/plan/CrawlPlan.java delete mode 100644 code/process-models/crawling-model/java/plan/CrawlPlanLoader.java create mode 100644 code/processes/converting-process/java/nu/marginalia/converting/model/CrawlPlan.java create mode 100644 code/processes/converting-process/java/nu/marginalia/converting/model/WorkDir.java delete mode 100644 code/processes/crawling-process/test/nu/marginalia/crawling/CrawlPlanLoaderTest.java delete mode 100644 code/tools/crawl-data-unfcker/build.gradle delete mode 100644 code/tools/crawl-data-unfcker/java/nu/marginalia/tools/CrawlDataUnfcker.java delete mode 100644 code/tools/crawl-data-unfcker/readme.md diff --git a/code/features-convert/data-extractors/java/nu/marginalia/extractor/AtagExporter.java 
b/code/features-convert/data-extractors/java/nu/marginalia/extractor/AtagExporter.java index 3db0a284..acc3a417 100644 --- a/code/features-convert/data-extractors/java/nu/marginalia/extractor/AtagExporter.java +++ b/code/features-convert/data-extractors/java/nu/marginalia/extractor/AtagExporter.java @@ -61,7 +61,7 @@ public class AtagExporter implements ExporterIf { } Path crawlDataPath = inputDir.resolve(item.relPath()); - try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.FAST, crawlDataPath)) { + try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) { exportLinks(tagWriter, stream); } catch (Exception ex) { diff --git a/code/features-convert/data-extractors/java/nu/marginalia/extractor/FeedExporter.java b/code/features-convert/data-extractors/java/nu/marginalia/extractor/FeedExporter.java index 28a29906..fa925b39 100644 --- a/code/features-convert/data-extractors/java/nu/marginalia/extractor/FeedExporter.java +++ b/code/features-convert/data-extractors/java/nu/marginalia/extractor/FeedExporter.java @@ -58,7 +58,7 @@ public class FeedExporter implements ExporterIf { } Path crawlDataPath = inputDir.resolve(item.relPath()); - try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.COMPATIBLE, crawlDataPath)) { + try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) { exportFeeds(tagWriter, stream); } catch (Exception ex) { diff --git a/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java b/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java index 1e1a2cd5..18fb3261 100644 --- a/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java +++ b/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java @@ -99,7 +99,7 @@ public class TermFrequencyExporter implements ExporterIf { private void 
processFile(Path crawlDataPath, TLongIntHashMap counts, AtomicInteger docCount, SentenceExtractor se) { TLongHashSet words = new TLongHashSet(10_000); - try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.FAST, crawlDataPath)) { + try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) { while (stream.hasNext()) { if (Thread.interrupted()) return; diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/body/ContentTypeLogic.java b/code/process-models/crawling-model/java/nu/marginalia/crawling/body/ContentTypeLogic.java index d884dbe5..25d4c8ec 100644 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/body/ContentTypeLogic.java +++ b/code/process-models/crawling-model/java/nu/marginalia/crawling/body/ContentTypeLogic.java @@ -10,7 +10,7 @@ import java.util.regex.Pattern; public class ContentTypeLogic { - private static final Predicate probableHtmlPattern = Pattern.compile("^.*\\.(htm|html|php|txt)$").asMatchPredicate(); + private static final Predicate probableHtmlPattern = Pattern.compile("^.*\\.(htm|html|php|txt|md)$").asMatchPredicate(); private static final Predicate probableBinaryPattern = Pattern.compile("^.*\\.[a-z]+$").asMatchPredicate(); private static final Set blockedContentTypes = Set.of("text/css", "text/javascript"); private static final List acceptedContentTypePrefixes = List.of( @@ -29,6 +29,7 @@ public class ContentTypeLogic { this.allowAllContentTypes = allowAllContentTypes; } + /** Returns true if the URL is likely to be a binary file, based on the URL path. 
*/ public boolean isUrlLikeBinary(EdgeUrl url) { String pathLowerCase = url.path.toLowerCase(); @@ -41,6 +42,7 @@ public class ContentTypeLogic { public boolean isAllowableContentType(ContentType contentType) { return isAllowableContentType(contentType.contentType()); } + public boolean isAllowableContentType(String contentType) { if (allowAllContentTypes) return true; diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawledDomainReader.java b/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawledDomainReader.java index dfd6415c..3f8123b2 100644 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawledDomainReader.java +++ b/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawledDomainReader.java @@ -1,41 +1,18 @@ package nu.marginalia.crawling.io; -import com.google.gson.Gson; -import nu.marginalia.crawling.io.format.CompatibleLegacySerializableCrawlDataStream; -import nu.marginalia.crawling.io.format.FastLegacySerializableCrawlDataStream; import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream; -import nu.marginalia.model.gson.GsonFactory; import java.io.*; import java.nio.file.Files; import java.nio.file.Path; public class CrawledDomainReader { - private static final Gson gson = GsonFactory.get(); - public CrawledDomainReader() { - } - - public enum CompatibilityLevel { - /** Data order emulates the ordering of the new format. This is slower */ - COMPATIBLE, - /** Data order is not compatible with the new format, but the data itself is */ - FAST, - /** Alias for FAST */ - ANY - } /** An iterator-like access to domain data This must be closed otherwise it will leak off-heap memory! 
*/ - public static SerializableCrawlDataStream createDataStream(CompatibilityLevel compatibilityLevel, - Path fullPath) throws IOException + public static SerializableCrawlDataStream createDataStream(Path fullPath) throws IOException { String fileName = fullPath.getFileName().toString(); - if (fileName.endsWith(".zstd")) { - if (compatibilityLevel == CompatibilityLevel.COMPATIBLE) - return new CompatibleLegacySerializableCrawlDataStream(gson, fullPath.toFile()); - else // if (compatibilityLevel == CompatibilityLevel.FAST or ANY) - return new FastLegacySerializableCrawlDataStream(gson, fullPath.toFile()); - } - else if (fileName.endsWith(".parquet")) { + if (fileName.endsWith(".parquet")) { return new ParquetSerializableCrawlDataStream(fullPath); } else { @@ -44,14 +21,14 @@ public class CrawledDomainReader { } /** An iterator-like access to domain data. This must be closed otherwise it will leak off-heap memory! */ - public static SerializableCrawlDataStream createDataStream(CompatibilityLevel level, Path basePath, String domain, String id) throws IOException { + public static SerializableCrawlDataStream createDataStream(Path basePath, String domain, String id) throws IOException { Path parquetPath = CrawlerOutputFile.getParquetPath(basePath, id, domain); if (Files.exists(parquetPath)) { - return createDataStream(level, parquetPath); + return createDataStream(parquetPath); } else { - return createDataStream(level, CrawlerOutputFile.getLegacyOutputFile(basePath, id, domain)); + throw new FileNotFoundException("No such file: " + parquetPath); } } diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawledDomainWriter.java b/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawledDomainWriter.java deleted file mode 100644 index f21715ee..00000000 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawledDomainWriter.java +++ /dev/null @@ -1,66 +0,0 @@ -package nu.marginalia.crawling.io; - -import 
com.github.luben.zstd.RecyclingBufferPool; -import com.github.luben.zstd.ZstdOutputStream; -import com.google.gson.Gson; -import lombok.SneakyThrows; -import nu.marginalia.crawling.model.SerializableCrawlData; -import nu.marginalia.model.gson.GsonFactory; - -import java.io.BufferedOutputStream; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.io.Writer; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.StandardCopyOption; -import java.nio.file.StandardOpenOption; - -public class CrawledDomainWriter implements AutoCloseable { - private final Path outputDir; - private final Gson gson = GsonFactory.get(); - private final Writer writer; - private final Path tmpFile; - private final Path actualFile; - - public CrawledDomainWriter(Path outputDir, String domain, String id) throws IOException { - this.outputDir = outputDir; - - if (!Files.isDirectory(outputDir)) { - throw new IllegalArgumentException("Output dir " + outputDir + " does not exist"); - } - - - // Do the actual writing to a temporary file first, then move it to the actual file when close() is invoked - // this lets us read the old file and compare its contents while writing the new file. It also guards against - // half-written files if the process is killed. 
- - tmpFile = getOutputFile(id, domain + "_tmp"); - actualFile = getOutputFile(id, domain); - writer = new OutputStreamWriter(new ZstdOutputStream(new BufferedOutputStream(Files.newOutputStream(tmpFile, - StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING)), - RecyclingBufferPool.INSTANCE)); - } - - public Path getOutputFile() { - return actualFile; - } - - @SneakyThrows - public void accept(SerializableCrawlData data) { - writer.write(data.getSerialIdentifier()); - writer.write('\n'); - gson.toJson(data, writer); - writer.write('\n'); - } - - private Path getOutputFile(String id, String name) throws IOException { - return CrawlerOutputFile.createLegacyOutputPath(outputDir, id, name); - } - - @Override - public void close() throws IOException { - Files.move(tmpFile, actualFile, StandardCopyOption.REPLACE_EXISTING); - writer.close(); - } -} diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawlerOutputFile.java b/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawlerOutputFile.java index 25673f13..05c4797e 100644 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawlerOutputFile.java +++ b/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawlerOutputFile.java @@ -8,33 +8,6 @@ import java.nio.file.Path; public class CrawlerOutputFile { - /** Return the Path to a file for the given id and name */ - public static Path getLegacyOutputFile(Path base, String id, String name) { - id = padId(id); - - String first = id.substring(0, 2); - String second = id.substring(2, 4); - - Path destDir = base.resolve(first).resolve(second); - return destDir.resolve(STR."\{id}-\{filesystemSafeName(name)}.zstd"); - } - - /** Return the Path to a file for the given id and name, creating the prerequisite - * directory structure as necessary. 
*/ - public static Path createLegacyOutputPath(Path base, String id, String name) throws IOException { - id = padId(id); - - String first = id.substring(0, 2); - String second = id.substring(2, 4); - - Path destDir = base.resolve(first).resolve(second); - if (!Files.exists(destDir)) { - Files.createDirectories(destDir); - } - return destDir.resolve(STR."\{id}-\{filesystemSafeName(name)}.zstd"); - } - - private static String filesystemSafeName(String name) { StringBuilder nameSaneBuilder = new StringBuilder(); diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/CompatibleLegacySerializableCrawlDataStream.java b/code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/CompatibleLegacySerializableCrawlDataStream.java deleted file mode 100644 index 76ecf7e7..00000000 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/CompatibleLegacySerializableCrawlDataStream.java +++ /dev/null @@ -1,113 +0,0 @@ -package nu.marginalia.crawling.io.format; - -import com.github.luben.zstd.RecyclingBufferPool; -import com.github.luben.zstd.ZstdInputStream; -import com.google.gson.Gson; -import nu.marginalia.crawling.io.SerializableCrawlDataStream; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.crawling.model.SerializableCrawlData; - -import java.io.*; -import java.nio.file.Path; - -import static java.util.Objects.*; - -/** This class is used to read the old format of crawl data, which was zstd-compressed JSON - * with type delimiters between records. It does its best to preserve the semantics of the - * new format. This is slow. 
- */ -public class CompatibleLegacySerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { - private final Gson gson; - private final BufferedReader bufferedReader; - - private CrawledDomain domain; - private SerializableCrawlData next; - - private final Path path; - private int sizeHint; - - public CompatibleLegacySerializableCrawlDataStream(Gson gson, File file) throws IOException { - this.gson = gson; - path = file.toPath(); - domain = findDomain(file); - - bufferedReader = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE))); - } - - @Override - public int sizeHint() { - return sizeHint; - } - - /** Scan through the file and find the domain record */ - private CrawledDomain findDomain(File file) throws IOException { - try (var br = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE)))) { - for (;;sizeHint++) { - String identifierLine = - requireNonNull(br.readLine(), "No identifier line found"); - String dataLine = - requireNonNull(br.readLine(), "No data line found"); - - if (identifierLine.equals(CrawledDomain.SERIAL_IDENTIFIER)) { - return gson.fromJson(dataLine, CrawledDomain.class); - } - } - } - } - - @Override - public Path path() { - return path; - } - - @Override - public SerializableCrawlData next() throws IOException { - if (hasNext()) { - if (domain != null) { - var ret = domain; - domain = null; - return ret; - } - else { - var ret = next; - next = null; - return ret; - } - } - throw new IllegalStateException("No more data"); - } - - @Override - public boolean hasNext() throws IOException { - if (domain != null || next != null) { - return true; - } - - String identifier = bufferedReader.readLine(); - if (identifier == null) { - bufferedReader.close(); - return false; - } - String data = bufferedReader.readLine(); - if (data == null) { - bufferedReader.close(); - return false; - } - - if 
(identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) { - next = null; - return false; // last record is expected to be the domain, so we're done - } else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) { - next = gson.fromJson(data, CrawledDocument.class); - } else { - throw new IllegalStateException("Unknown identifier: " + identifier); - } - return true; - } - - @Override - public void close() throws Exception { - bufferedReader.close(); - } -} diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/FastLegacySerializableCrawlDataStream.java b/code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/FastLegacySerializableCrawlDataStream.java deleted file mode 100644 index 09871cf4..00000000 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/FastLegacySerializableCrawlDataStream.java +++ /dev/null @@ -1,74 +0,0 @@ -package nu.marginalia.crawling.io.format; - -import com.github.luben.zstd.RecyclingBufferPool; -import com.github.luben.zstd.ZstdInputStream; -import com.google.gson.Gson; -import nu.marginalia.crawling.io.SerializableCrawlDataStream; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.crawling.model.SerializableCrawlData; - -import java.io.*; -import java.nio.file.Path; - -/** This class is used to read the old format of crawl data, which was zstd-compressed JSON - * with type delimiters between records. It does not preserve the semantics of the new format, - * but it is faster. 
- */ -public class FastLegacySerializableCrawlDataStream implements AutoCloseable, SerializableCrawlDataStream { - private final Gson gson; - private final BufferedReader bufferedReader; - private SerializableCrawlData next = null; - - private final Path path; - public FastLegacySerializableCrawlDataStream(Gson gson, File file) throws IOException { - this.gson = gson; - bufferedReader = new BufferedReader(new InputStreamReader(new ZstdInputStream(new FileInputStream(file), RecyclingBufferPool.INSTANCE))); - path = file.toPath(); - } - - @Override - public Path path() { - return path; - } - @Override - public SerializableCrawlData next() throws IOException { - if (hasNext()) { - var ret = next; - next = null; - return ret; - } - throw new IllegalStateException("No more data"); - } - - @Override - public boolean hasNext() throws IOException { - if (next != null) - return true; - - String identifier = bufferedReader.readLine(); - if (identifier == null) { - bufferedReader.close(); - return false; - } - String data = bufferedReader.readLine(); - if (data == null) { - bufferedReader.close(); - return false; - } - - if (identifier.equals(CrawledDomain.SERIAL_IDENTIFIER)) { - next = gson.fromJson(data, CrawledDomain.class); - } else if (identifier.equals(CrawledDocument.SERIAL_IDENTIFIER)) { - next = gson.fromJson(data, CrawledDocument.class); - } else { - throw new IllegalStateException("Unknown identifier: " + identifier); - } - return true; - } - - @Override - public void close() throws Exception { - bufferedReader.close(); - } -} diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java b/code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java index a5fa2d0d..e676e351 100644 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java +++ 
b/code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java @@ -29,7 +29,6 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial public ParquetSerializableCrawlDataStream(Path file) throws IOException { path = file; - backingIterator = CrawledDocumentParquetRecordFileReader.stream(file).iterator(); } @@ -79,6 +78,10 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial String statusReason = ""; String redirectDomain = null; + + // The advisory content types are used to signal various states of the crawl + // that are not actual crawled documents. + if (parquetRecord.contentType.equals("x-marginalia/advisory;state=redirect")) { EdgeUrl crawledUrl = new EdgeUrl(parquetRecord.url); redirectDomain = crawledUrl.getDomain().toString(); @@ -103,8 +106,6 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial )); } - private CrawledDocumentParquetRecord previousRecord = null; - private void createDocumentRecord(CrawledDocumentParquetRecord nextRecord) { String bodyString = ""; CrawlerDocumentStatus status = CrawlerDocumentStatus.OK; @@ -115,7 +116,8 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial else if (nextRecord.contentType.startsWith("x-marginalia/advisory;state=robots-txt-skipped")) { status = CrawlerDocumentStatus.ROBOTS_TXT; } - else if (nextRecord.contentType.startsWith("x-marginalia/advisory")) { // other advisory stuff we don't want + else if (nextRecord.contentType.startsWith("x-marginalia/advisory")) { + // we don't care about the other advisory content types here return; } else if (nextRecord.body != null) { @@ -135,21 +137,6 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial String etag = nextRecord.etagHeader; String lastModified = nextRecord.lastModifiedHeader; - // If we have a previous record, and it was a 304, and this one is a 200, we'll use 
the ETag and Last-Modified - // from the previous record, as it's not guaranteed the reference copy will have the same headers due to a bug - // in the crawler. The bug is fixed, but we still need to support old crawls. - // - // This was added in 2024-01-18, so we can remove it in a few months. - - if (previousRecord != null - && previousRecord.url.equals(nextRecord.url) - && previousRecord.httpStatus == 304 - && nextRecord.httpStatus == 200) - { - etag = previousRecord.etagHeader; - lastModified = previousRecord.lastModifiedHeader; - } - nextQ.add(new CrawledDocument("", nextRecord.url, nextRecord.contentType, @@ -166,13 +153,9 @@ public class ParquetSerializableCrawlDataStream implements AutoCloseable, Serial nextRecord.cookies, lastModified, etag)); - - previousRecord = nextRecord; } - public void close() throws IOException { - previousRecord = null; - } + public void close() throws IOException {} @Override public SerializableCrawlData next() throws IOException { diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDocument.java b/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDocument.java index bb344dfb..c809682a 100644 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDocument.java +++ b/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDocument.java @@ -87,12 +87,6 @@ public class CrawledDocument implements SerializableCrawlData { return getHeader("Last-Modified"); } - public static final String SERIAL_IDENTIFIER = "// DOCUMENT"; - @Override - public String getSerialIdentifier() { - return SERIAL_IDENTIFIER; - } - @Override public String getDomain() { if (url == null) diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDomain.java b/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDomain.java index 3add3b8d..adb59bda 100644 --- 
a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDomain.java +++ b/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDomain.java @@ -27,13 +27,4 @@ public class CrawledDomain implements SerializableCrawlData { return doc.size(); } - public boolean hasCookies() { - return cookies != null && !cookies.isEmpty(); - } - - public static final String SERIAL_IDENTIFIER = "// DOMAIN"; - @Override - public String getSerialIdentifier() { - return SERIAL_IDENTIFIER; - } } diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/SerializableCrawlData.java b/code/process-models/crawling-model/java/nu/marginalia/crawling/model/SerializableCrawlData.java index 48b3f65d..01ecaf8d 100644 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/SerializableCrawlData.java +++ b/code/process-models/crawling-model/java/nu/marginalia/crawling/model/SerializableCrawlData.java @@ -1,6 +1,5 @@ package nu.marginalia.crawling.model; public interface SerializableCrawlData { - String getSerialIdentifier(); String getDomain(); } diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java b/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java index 31d644ec..362eb561 100644 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java +++ b/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java @@ -35,7 +35,6 @@ public class CrawledDocumentParquetRecordFileReader { public Integer finish(Integer target) { return target; } }), List.of("statusCode")) - .mapToInt(Integer::valueOf) .count(); } } diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java 
b/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java index 02f33efc..539ff28d 100644 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java +++ b/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java @@ -134,8 +134,6 @@ public class CrawledDocumentParquetRecordFileWriter implements AutoCloseable { return; } - - byte[] bodyBytes; String contentType; diff --git a/code/process-models/crawling-model/java/plan/CrawlPlan.java b/code/process-models/crawling-model/java/plan/CrawlPlan.java deleted file mode 100644 index 02164b60..00000000 --- a/code/process-models/crawling-model/java/plan/CrawlPlan.java +++ /dev/null @@ -1,105 +0,0 @@ -package plan; - -import lombok.AllArgsConstructor; -import lombok.NoArgsConstructor; -import lombok.ToString; -import nu.marginalia.crawling.io.CrawledDomainReader; -import nu.marginalia.crawling.io.SerializableCrawlDataStream; -import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.process.log.WorkLog; -import org.apache.logging.log4j.util.Strings; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.function.Predicate; -import java.util.Optional; - -@AllArgsConstructor @NoArgsConstructor @ToString -public class CrawlPlan { - private final Logger logger = LoggerFactory.getLogger(getClass()); - public String jobSpec; - public WorkDir crawl; - public WorkDir process; - - private final static String rootDirRewrite = System.getProperty("crawl.rootDirRewrite"); - - public Path getJobSpec() { - return Path.of(rewrite(jobSpec)); - } - - @AllArgsConstructor @NoArgsConstructor @ToString - public static class WorkDir { - public String dir; - public String logName; - - public Path getDir() { - return Path.of(rewrite(dir)); - } - public 
Path getLogFile() { - return Path.of(rewrite(dir)).resolve(logName); - } - } - - private static String rewrite(String dir) { - if (rootDirRewrite == null) { - return dir; - } - String[] parts = rootDirRewrite.split(":"); - - return dir.replaceFirst(parts[0], parts[1]); - } - - public Path getCrawledFilePath(String fileName) { - int sp = fileName.lastIndexOf('/'); - - // Normalize the filename - if (sp >= 0 && sp + 1< fileName.length()) - fileName = fileName.substring(sp + 1); - if (fileName.length() < 4) - fileName = Strings.repeat("0", 4 - fileName.length()) + fileName; - - String sp1 = fileName.substring(0, 2); - String sp2 = fileName.substring(2, 4); - return crawl.getDir().resolve(sp1).resolve(sp2).resolve(fileName); - } - - public int countCrawledDomains() { - int count = 0; - for (var ignored : WorkLog.iterable(crawl.getLogFile())) { - count++; - } - return count; - } - - @Deprecated - public Iterable domainsIterable() { - // This is no longer supported - throw new UnsupportedOperationException(); - } - - public Iterable crawlDataIterable(Predicate idPredicate) { - return WorkLog.iterableMap(crawl.getLogFile(), - entry -> { - if (!idPredicate.test(entry.id())) { - return Optional.empty(); - } - - var path = getCrawledFilePath(entry.path()); - - if (!Files.exists(path)) { - logger.warn("File not found: {}", path); - return Optional.empty(); - } - - try { - return Optional.of(CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.COMPATIBLE, path)); - } - catch (IOException ex) { - return Optional.empty(); - } - }); - } -} diff --git a/code/process-models/crawling-model/java/plan/CrawlPlanLoader.java b/code/process-models/crawling-model/java/plan/CrawlPlanLoader.java deleted file mode 100644 index cc7aae3f..00000000 --- a/code/process-models/crawling-model/java/plan/CrawlPlanLoader.java +++ /dev/null @@ -1,25 +0,0 @@ -package plan; - -import org.yaml.snakeyaml.Yaml; - -import java.io.FileReader; -import java.io.IOException; -import 
java.nio.file.Path; - -public class CrawlPlanLoader { - private final Yaml yaml; - - public CrawlPlanLoader() { - yaml = new Yaml(); - } - - public CrawlPlan load(Path yamlFile) throws IOException { - try (var reader = new FileReader(yamlFile.toFile())) { - return yaml.loadAs(reader, CrawlPlan.class); - } - catch (IOException ex) { - throw new IOException("Failed to load crawl plan " + yamlFile, ex); - } - } - -} diff --git a/code/process-models/crawling-model/test/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java b/code/process-models/crawling-model/test/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java index 17a8ad73..a0352f29 100644 --- a/code/process-models/crawling-model/test/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java +++ b/code/process-models/crawling-model/test/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java @@ -31,6 +31,7 @@ class CrawledDocumentParquetRecordFileWriterTest { @Test void testWriteRead() throws IOException { + // Create a record var original = new CrawledDocumentParquetRecord("www.marginalia.nu", "https://www.marginalia.nu/", "127.0.0.1", @@ -41,22 +42,26 @@ class CrawledDocumentParquetRecordFileWriterTest { "hello world".getBytes(), null, null); + // Write the record to a file try (var writer = new CrawledDocumentParquetRecordFileWriter(tempFile)) { writer.write(original); } + // Read the file back var items = new ArrayList(); - try (var stream = new ParquetSerializableCrawlDataStream(tempFile)) { while (stream.hasNext()) { items.add(stream.next()); } } + // Verify the contents, we should have a domain and a document assertEquals(2, items.size()); + // Verify the domain var firstItem = items.get(0); assertInstanceOf(CrawledDomain.class, firstItem); + var domain = (CrawledDomain) firstItem; assertEquals("www.marginalia.nu", domain.domain); assertNull(domain.redirectDomain); @@ -65,6 +70,7 @@ class 
CrawledDocumentParquetRecordFileWriterTest { assertEquals(new ArrayList<>(), domain.doc); assertEquals(new ArrayList<>(), domain.cookies); + // Verify the document var secondItem = items.get(1); assertInstanceOf(CrawledDocument.class, secondItem); @@ -75,5 +81,31 @@ class CrawledDocumentParquetRecordFileWriterTest { assertEquals(200, document.httpStatus); } + // This is an inspection hatch test that reads a file from the odduck.neocities.org domain that didn't load properly, + // leaving as-is in case we need to look into other files in the future + @Test + public void testOdduck() { + Path testPath = Path.of("/home/vlofgren/Exports/22efad51-oddduck.neocities.org.parquet"); + + // Skip if the file doesn't exist + if (!Files.exists(testPath)) { + return; + } + + // Read the file + try (var stream = new ParquetSerializableCrawlDataStream(testPath)) { + while (stream.hasNext()) { + var item = stream.next(); + if (item instanceof CrawledDocument doc) { + System.out.println(doc.url); + System.out.println(doc.contentType); + System.out.println(doc.httpStatus); + System.out.println(doc.documentBody.length()); + } + } + } catch (IOException e) { + e.printStackTrace(); + } + } } \ No newline at end of file diff --git a/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java index a570e72d..c74dd5fd 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java @@ -11,6 +11,10 @@ import nu.marginalia.converting.sideload.SideloadSourceFactory; import nu.marginalia.converting.writer.ConverterBatchWritableIf; import nu.marginalia.converting.writer.ConverterBatchWriter; import nu.marginalia.converting.writer.ConverterWriter; +import nu.marginalia.crawling.io.CrawledDomainReader; +import nu.marginalia.crawling.io.SerializableCrawlDataStream; +import 
nu.marginalia.process.log.WorkLog; +import nu.marginalia.process.log.WorkLogEntry; import nu.marginalia.service.ProcessMainClass; import nu.marginalia.storage.FileStorageService; import nu.marginalia.mq.MessageQueueFactory; @@ -23,11 +27,15 @@ import nu.marginalia.service.module.DatabaseModule; import nu.marginalia.util.SimpleBlockingThreadPool; import nu.marginalia.worklog.BatchingWorkLog; import nu.marginalia.worklog.BatchingWorkLogImpl; -import plan.CrawlPlan; +import org.apache.logging.log4j.util.Strings; +import nu.marginalia.converting.model.CrawlPlan; import nu.marginalia.converting.processor.DomainProcessor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import nu.marginalia.converting.model.WorkDir; +import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Path; import java.sql.SQLException; import java.util.Collection; @@ -36,6 +44,7 @@ import java.util.Optional; import java.util.UUID; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Function; import static nu.marginalia.mqapi.ProcessInboxNames.CONVERTER_INBOX; @@ -118,7 +127,8 @@ public class ConverterMain extends ProcessMainClass { } } - public void convert(CrawlPlan plan) throws Exception { + public void convert(int totalDomains, WorkDir crawlDir, WorkDir processedDir) throws Exception { + final int defaultPoolSize = Boolean.getBoolean("system.conserveMemory") ? 
Math.clamp(Runtime.getRuntime().availableProcessors() / 2, 1, 4) // <-- conserve memory @@ -126,12 +136,11 @@ public class ConverterMain extends ProcessMainClass { final int maxPoolSize = Integer.getInteger("converter.poolSize", defaultPoolSize); - try (BatchingWorkLog batchingWorkLog = new BatchingWorkLogImpl(plan.process.getLogFile()); - ConverterWriter converterWriter = new ConverterWriter(batchingWorkLog, plan.process.getDir())) + try (BatchingWorkLog batchingWorkLog = new BatchingWorkLogImpl(processedDir.getLogFile()); + ConverterWriter converterWriter = new ConverterWriter(batchingWorkLog, processedDir.getDir())) { var pool = new SimpleBlockingThreadPool("ConverterThread", maxPoolSize, 2); - int totalDomains = plan.countCrawledDomains(); AtomicInteger processedDomains = new AtomicInteger(0); logger.info("Processing {} domains", totalDomains); @@ -139,7 +148,8 @@ public class ConverterMain extends ProcessMainClass { processedDomains.set(batchingWorkLog.size()); heartbeat.setProgress(processedDomains.get() / (double) totalDomains); - for (var domain : plan.crawlDataIterable(id -> !batchingWorkLog.isItemProcessed(id))) + for (var domain : WorkLog.iterableMap(crawlDir.getLogFile(), + new CrawlDataLocator(crawlDir.getDir(), batchingWorkLog))) { pool.submit(() -> { try { @@ -165,6 +175,52 @@ public class ConverterMain extends ProcessMainClass { } } + private static class CrawlDataLocator implements Function> { + + private final Path crawlRootDir; + private final BatchingWorkLog batchingWorkLog; + + CrawlDataLocator(Path crawlRootDir, BatchingWorkLog workLog) { + this.crawlRootDir = crawlRootDir; + this.batchingWorkLog = workLog; + } + + @Override + public Optional apply(WorkLogEntry entry) { + if (batchingWorkLog.isItemProcessed(entry.id())) { + return Optional.empty(); + } + + var path = getCrawledFilePath(crawlRootDir, entry.path()); + + if (!Files.exists(path)) { + logger.warn("File not found: {}", path); + return Optional.empty(); + } + + try { + return 
Optional.of(CrawledDomainReader.createDataStream(path)); + } + catch (IOException ex) { + return Optional.empty(); + } + } + + private Path getCrawledFilePath(Path crawlDir, String fileName) { + int sp = fileName.lastIndexOf('/'); + + // Normalize the filename + if (sp >= 0 && sp + 1< fileName.length()) + fileName = fileName.substring(sp + 1); + if (fileName.length() < 4) + fileName = Strings.repeat("0", 4 - fileName.length()) + fileName; + + String sp1 = fileName.substring(0, 2); + String sp2 = fileName.substring(2, 4); + return crawlDir.resolve(sp1).resolve(sp2).resolve(fileName); + } + } + private abstract static class ConvertRequest { private final MqMessage message; private final MqSingleShotInbox inbox; @@ -196,6 +252,7 @@ public class ConverterMain extends ProcessMainClass { this.sideloadSources = List.of(sideloadSource); this.workDir = workDir; } + SideloadAction(Collection sideloadSources, Path workDir, MqMessage message, MqSingleShotInbox inbox) { @@ -227,7 +284,7 @@ public class ConverterMain extends ProcessMainClass { @Override public void execute(ConverterMain converterMain) throws Exception { try { - converterMain.convert(plan); + converterMain.convert(plan.countCrawledDomains(), plan.crawl(), plan.process()); ok(); } catch (Exception ex) { @@ -256,8 +313,9 @@ public class ConverterMain extends ProcessMainClass { var processData = fileStorageService.getStorage(request.processedDataStorage); var plan = new CrawlPlan(null, - new CrawlPlan.WorkDir(crawlData.path(), "crawler.log"), - new CrawlPlan.WorkDir(processData.path(), "processor.log")); + new WorkDir(crawlData.path(), "crawler.log"), + new WorkDir(processData.path(), "processor.log") + ); yield new ConvertCrawlDataAction(plan, msg, inbox); } diff --git a/code/processes/converting-process/java/nu/marginalia/converting/model/CrawlPlan.java b/code/processes/converting-process/java/nu/marginalia/converting/model/CrawlPlan.java new file mode 100644 index 00000000..3b929039 --- /dev/null +++ 
b/code/processes/converting-process/java/nu/marginalia/converting/model/CrawlPlan.java @@ -0,0 +1,15 @@ +package nu.marginalia.converting.model; + +import nu.marginalia.process.log.WorkLog; + +public record CrawlPlan(String jobSpec, WorkDir crawl, WorkDir process) { + + public int countCrawledDomains() { + int count = 0; + for (var ignored : WorkLog.iterable(crawl.getLogFile())) { + count++; + } + return count; + } + +} diff --git a/code/processes/converting-process/java/nu/marginalia/converting/model/WorkDir.java b/code/processes/converting-process/java/nu/marginalia/converting/model/WorkDir.java new file mode 100644 index 00000000..2444aa2d --- /dev/null +++ b/code/processes/converting-process/java/nu/marginalia/converting/model/WorkDir.java @@ -0,0 +1,13 @@ +package nu.marginalia.converting.model; + +import java.nio.file.Path; + +public record WorkDir(String dir, String logName) { + public Path getDir() { + return Path.of(dir); + } + + public Path getLogFile() { + return Path.of(dir).resolve(logName); + } +} diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java index 4461a85a..be152d38 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java @@ -284,7 +284,7 @@ public class CrawlerMain extends ProcessMainClass { private CrawlDataReference getReference() { try { - return new CrawlDataReference(CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, outputDir, domain, id)); + return new CrawlDataReference(CrawledDomainReader.createDataStream(outputDir, domain, id)); } catch (IOException e) { logger.debug("Failed to read previous crawl data for {}", specification.domain); return new CrawlDataReference(); diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java 
b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index b1abf3e1..efae36aa 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -272,6 +272,7 @@ public class CrawlerRetreiver implements AutoCloseable { } } + // Parse the document and enqueue links try { if (fetchedDoc instanceof HttpFetchResult.ResultOk ok) { var docOpt = ok.parseDocument(); diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index 94494402..77dc6463 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -143,7 +143,6 @@ public class HttpFetcherImpl implements HttpFetcher { public HttpFetchResult fetchContent(EdgeUrl url, WarcRecorder warcRecorder, ContentTags contentTags) - throws RateLimitException { // We don't want to waste time and resources on URLs that are not HTML, so if the file ending diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java index a21a06df..4c091302 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java @@ -1,5 +1,6 @@ package nu.marginalia.crawl.retreival.revisit; +import com.google.common.base.Strings; import crawlercommons.robots.SimpleRobotRules; import nu.marginalia.crawl.retreival.CrawlDataReference; import nu.marginalia.crawl.retreival.CrawlDelayTimer; @@ -48,23 +49,32 
@@ public class CrawlerRevisitor { continue; var url = urlMaybe.get(); - // If we've previously 404:d on this URL, we'll refrain from trying to fetch it again + // If we've previously 404:d on this URL, we'll refrain from trying to fetch it again, + // since it's likely to 404 again. It will be forgotten by the next crawl though, so + // we'll eventually try again. + if (doc.httpStatus == 404) { crawlFrontier.addVisited(url); continue; } + // If the reference document is empty or the HTTP status is not 200, we'll skip it since it's + // unlikely to produce anything meaningful for us. if (doc.httpStatus != 200) continue; + if (Strings.isNullOrEmpty(doc.documentBody)) + continue; + + if (!crawlFrontier.filterLink(url)) + continue; + + if (!crawlFrontier.addVisited(url)) + continue; if (!robotsRules.isAllowed(url.toString())) { warcRecorder.flagAsRobotsTxtError(url); continue; } - if (!crawlFrontier.filterLink(url)) - continue; - if (!crawlFrontier.addVisited(url)) - continue; if (recrawled > 5 @@ -79,10 +89,7 @@ public class CrawlerRevisitor { crawlFrontier.addVisited(url); // Hoover up any links from the document - if (doc.httpStatus == 200 && doc.documentBody != null) { - var parsedDoc = Jsoup.parse(doc.documentBody); - crawlFrontier.enqueueLinksFromDocument(url, parsedDoc); - } + crawlFrontier.enqueueLinksFromDocument(url, Jsoup.parse(doc.documentBody)); // Add a WARC record so we don't repeat this warcRecorder.writeReferenceCopy(url, @@ -97,7 +104,8 @@ public class CrawlerRevisitor { // providing etag and last-modified headers, so we can recycle the // document if it hasn't changed without actually downloading it - var reference = new DocumentWithReference(doc, oldCrawlData); + DocumentWithReference reference = new DocumentWithReference(doc, oldCrawlData); + var result = crawlerRetreiver.fetchWriteAndSleep(url, delayTimer, reference); if (reference.isSame(result)) { diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/CrawlPlanLoaderTest.java 
b/code/processes/crawling-process/test/nu/marginalia/crawling/CrawlPlanLoaderTest.java deleted file mode 100644 index 086529d6..00000000 --- a/code/processes/crawling-process/test/nu/marginalia/crawling/CrawlPlanLoaderTest.java +++ /dev/null @@ -1,51 +0,0 @@ -package nu.marginalia.crawling; - -import plan.CrawlPlanLoader; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -class CrawlPlanLoaderTest { - - Path tempFile; - - @BeforeEach - public void setUp() throws IOException { - tempFile = Files.createTempFile(getClass().getSimpleName(), ".yaml"); - } - @AfterEach - public void tearDown() throws IOException { - Files.delete(tempFile); - } - - @Test - void load() throws IOException { - Files.writeString(tempFile, """ - jobSpec: "job.spec" - crawl: - dir: "/foo" - logName: "foo.log" - process: - dir: "/bar" - logName: "bar.log" - """); - var loader = new CrawlPlanLoader(); - var ret = loader.load(tempFile); - - assertEquals(Path.of("job.spec"), ret.getJobSpec()); - - assertEquals(Path.of("/foo"), ret.crawl.getDir()); - assertEquals(Path.of("/foo/foo.log"), ret.crawl.getLogFile()); - - assertEquals(Path.of("/bar"), ret.process.getDir()); - assertEquals(Path.of("/bar/bar.log"), ret.process.getLogFile()); - - System.out.println(ret); - } -} \ No newline at end of file diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index 811200cc..aa1f00e7 100644 --- a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java +++ b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -183,7 +183,7 @@ class CrawlerRetreiverTest { 
convertToParquet(tempFileWarc1, tempFileParquet1); - try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet1)) { + try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) { while (stream.hasNext()) { if (stream.next() instanceof CrawledDocument doc) { data.add(doc); @@ -236,7 +236,7 @@ class CrawlerRetreiverTest { convertToParquet(tempFileWarc1, tempFileParquet1); - try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet1)) { + try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) { while (stream.hasNext()) { if (stream.next() instanceof CrawledDocument doc) { data.add(doc); @@ -284,7 +284,7 @@ class CrawlerRetreiverTest { doCrawl(tempFileWarc1, specs); convertToParquet(tempFileWarc1, tempFileParquet1); - try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet1)) { + try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) { while (stream.hasNext()) { if (stream.next() instanceof CrawledDocument doc) { data.add(doc); @@ -331,7 +331,7 @@ class CrawlerRetreiverTest { doCrawl(tempFileWarc1, specs); convertToParquet(tempFileWarc1, tempFileParquet1); doCrawlWithReferenceStream(specs, - CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet1) + CrawledDomainReader.createDataStream(tempFileParquet1) ); convertToParquet(tempFileWarc2, tempFileParquet2); @@ -352,7 +352,7 @@ class CrawlerRetreiverTest { }); } - try (var ds = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet2)) { + try (var ds = CrawledDomainReader.createDataStream(tempFileParquet2)) { while (ds.hasNext()) { var doc = ds.next(); if (doc instanceof CrawledDomain dr) { @@ -395,7 +395,7 @@ class CrawlerRetreiverTest { convertToParquet(tempFileWarc1, tempFileParquet1); - try (var stream = 
CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet1)) { + try (var stream = CrawledDomainReader.createDataStream(tempFileParquet1)) { while (stream.hasNext()) { var doc = stream.next(); data.computeIfAbsent(doc.getClass(), c -> new ArrayList<>()).add(doc); @@ -404,7 +404,7 @@ class CrawlerRetreiverTest { throw new RuntimeException(e); } - var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet1); + var stream = CrawledDomainReader.createDataStream(tempFileParquet1); System.out.println("---"); @@ -444,7 +444,7 @@ class CrawlerRetreiverTest { }); } - try (var ds = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.ANY, tempFileParquet2)) { + try (var ds = CrawledDomainReader.createDataStream(tempFileParquet2)) { while (ds.hasNext()) { var doc = ds.next(); if (doc instanceof CrawledDomain dr) { diff --git a/code/tools/crawl-data-unfcker/build.gradle b/code/tools/crawl-data-unfcker/build.gradle deleted file mode 100644 index 755fba5e..00000000 --- a/code/tools/crawl-data-unfcker/build.gradle +++ /dev/null @@ -1,57 +0,0 @@ -plugins { - id 'java' - - id 'application' - - id 'jvm-test-suite' -} - -java { - toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) - } -} - -application { - mainClass = 'nu.marginalia.tools.CrawlDataUnfcker' - applicationName = 'crawl-data-unfcker' -} - -tasks.distZip.enabled = false - -apply from: "$rootProject.projectDir/srcsets.gradle" - -dependencies { - implementation project(':third-party:rdrpostagger') - implementation project(':third-party:porterstemmer') - implementation project(':third-party:monkey-patch-opennlp') - implementation project(':code:common:model') - implementation project(':code:common:config') - implementation project(':code:common:process') - implementation project(':code:common:service') - implementation project(':code:libraries:language-processing') - implementation 
project(':code:libraries:term-frequency-dict') - implementation project(':code:libraries:big-string') - implementation project(':code:processes:converting-process') - implementation project(':code:process-models:crawling-model') - - implementation project(':code:features-convert:adblock') - implementation project(':code:features-convert:topic-detection') - implementation project(':code:features-convert:keyword-extraction') - - implementation libs.bundles.slf4j - implementation libs.notnull - - implementation libs.guice - implementation libs.jsoup - implementation libs.trove - implementation libs.fastutil - - implementation libs.bundles.nlp - implementation libs.commons.lang3 - - testImplementation libs.bundles.slf4j.test - testImplementation libs.bundles.junit - testImplementation libs.mockito -} - diff --git a/code/tools/crawl-data-unfcker/java/nu/marginalia/tools/CrawlDataUnfcker.java b/code/tools/crawl-data-unfcker/java/nu/marginalia/tools/CrawlDataUnfcker.java deleted file mode 100644 index 0101de12..00000000 --- a/code/tools/crawl-data-unfcker/java/nu/marginalia/tools/CrawlDataUnfcker.java +++ /dev/null @@ -1,75 +0,0 @@ -package nu.marginalia.tools; - -import nu.marginalia.crawling.io.CrawlerOutputFile; -import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.process.log.WorkLog; -import nu.marginalia.crawling.io.CrawledDomainReader; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Optional; - -public class CrawlDataUnfcker { - public static void main(String... 
args) { - if (args.length != 2) { - System.out.println("Usage: crawl-data-unfcker input output"); - return; - } - - Path input = Path.of(args[0]); - Path output = Path.of(args[1]); - - if (!Files.isDirectory(input)) { - System.err.println("Input directory is not valid"); - return; - } - if (!Files.isDirectory(output)) { - System.err.println("Output directory is not valid"); - return; - } - - try (var wl = new WorkLog(output.resolve("crawler.log"))) { - for (var inputItem : WorkLog.iterable(input.resolve("crawler.log"))) { - Path inputPath = input.resolve(inputItem.relPath()); - - var domainMaybe = readDomain(inputPath).map(CrawledDomain::getDomain); - if (domainMaybe.isEmpty()) - continue; - var domain = domainMaybe.get(); - - // Generate conformant ID - String newId = Integer.toHexString(domain.hashCode()); - - var outputPath = CrawlerOutputFile.createLegacyOutputPath(output, newId, domain); - var outputFileName = outputPath.toFile().getName(); - - System.out.println(inputPath + " -> " + outputPath); - Files.move(inputPath, outputPath); - - wl.setJobToFinished(domain, outputFileName, inputItem.cnt()); - } - } catch (IOException e) { - throw new RuntimeException(e); - } - } - - static Optional readDomain(Path file) { - if (!Files.exists(file)) { - System.out.println("Missing file " + file); - return Optional.empty(); - } - - try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.FAST, file)) { - while (stream.hasNext()) { - if (stream.next() instanceof CrawledDomain domain) { - return Optional.of(domain); - } - } - } - catch (Exception ex) { - ex.printStackTrace(); - } - return Optional.empty(); - } -} diff --git a/code/tools/crawl-data-unfcker/readme.md b/code/tools/crawl-data-unfcker/readme.md deleted file mode 100644 index 9c870953..00000000 --- a/code/tools/crawl-data-unfcker/readme.md +++ /dev/null @@ -1,3 +0,0 @@ -# Crawl Data Unfcker - -This is a migration tool that patches the generated ID of crawl data. 
\ No newline at end of file diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/ExperimentRunnerMain.java b/code/tools/experiment-runner/java/nu/marginalia/tools/ExperimentRunnerMain.java index d58bf778..668a25a9 100644 --- a/code/tools/experiment-runner/java/nu/marginalia/tools/ExperimentRunnerMain.java +++ b/code/tools/experiment-runner/java/nu/marginalia/tools/ExperimentRunnerMain.java @@ -48,7 +48,7 @@ public class ExperimentRunnerMain { Path basePath = Path.of(args[0]); for (var item : WorkLog.iterable(basePath.resolve("crawler.log"))) { Path crawlDataPath = basePath.resolve(item.relPath()); - try (var stream = CrawledDomainReader.createDataStream(CrawledDomainReader.CompatibilityLevel.FAST, crawlDataPath)) { + try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) { experiment.process(stream); } catch (Exception ex) { diff --git a/settings.gradle b/settings.gradle index 6571020c..13622d9c 100644 --- a/settings.gradle +++ b/settings.gradle @@ -94,7 +94,6 @@ include 'code:process-models:processed-data' include 'code:tools:experiment-runner' include 'code:tools:screenshot-capture-tool' include 'code:tools:load-test' -include 'code:tools:crawl-data-unfcker' include 'third-party:porterstemmer' include 'third-party:symspell' From 6b88db10ad857be756b8099e4a3b45fa12536aa2 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 22 Apr 2024 14:14:24 +0200 Subject: [PATCH 77/90] (crawler) Ensure all appropriate headers are recorded on the request --- .../warc/WarcProtocolReconstructor.java | 24 ++++++++++++++----- .../retreival/fetcher/warc/WarcRecorder.java | 6 ++++- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java index 6f977e44..b75589ee 100644 --- 
a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcProtocolReconstructor.java @@ -8,9 +8,7 @@ import org.apache.commons.lang3.StringUtils; import java.net.URI; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; -import java.util.Arrays; -import java.util.Map; -import java.util.StringJoiner; +import java.util.*; import java.util.stream.Collectors; /** We don't have access to the raw HTTP request and response, so we need to reconstruct them @@ -18,12 +16,15 @@ import java.util.stream.Collectors; */ public class WarcProtocolReconstructor { - static String getHttpRequestString(Request request, URI uri) { + static String getHttpRequestString(String method, + Map> mainHeaders, + Map> extraHeaders, + URI uri) { StringBuilder requestStringBuilder = new StringBuilder(); final String encodedURL = encodeURLKeepSlashes(uri.getPath()); - requestStringBuilder.append(request.method()).append(" ").append(encodedURL); + requestStringBuilder.append(method).append(" ").append(encodedURL); if (uri.getQuery() != null) { requestStringBuilder.append("?").append(URLEncoder.encode(uri.getQuery(), StandardCharsets.UTF_8)); @@ -31,12 +32,23 @@ public class WarcProtocolReconstructor { requestStringBuilder.append(" HTTP/1.1\r\n"); requestStringBuilder.append("Host: ").append(uri.getHost()).append("\r\n"); - request.headers().toMultimap().forEach((k, values) -> { + Set addedHeaders = new HashSet<>(); + + mainHeaders.forEach((k, values) -> { for (var value : values) { + addedHeaders.add(k); requestStringBuilder.append(capitalizeHeader(k)).append(": ").append(value).append("\r\n"); } }); + extraHeaders.forEach((k, values) -> { + if (!addedHeaders.contains(k)) { + for (var value : values) { + requestStringBuilder.append(capitalizeHeader(k)).append(": ").append(value).append("\r\n"); + } + } + }); + return 
requestStringBuilder.toString(); } diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java index 23ab4766..180811cf 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java @@ -146,7 +146,11 @@ public class WarcRecorder implements AutoCloseable { WarcDigestBuilder requestDigestBuilder = new WarcDigestBuilder(); byte[] httpRequestString = WarcProtocolReconstructor - .getHttpRequestString(response.request(), requestUri) + .getHttpRequestString( + response.request().method(), + response.request().headers().toMultimap(), + request.headers().toMultimap(), + requestUri) .getBytes(); requestDigestBuilder.update(httpRequestString); From c9f029c214c3e1f84cd5fbaed25a6523318d0279 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 22 Apr 2024 14:31:05 +0200 Subject: [PATCH 78/90] (crawler) Strip W/-prefix from the etag when supplied as If-None-Match --- .../crawl/retreival/fetcher/ContentTags.java | 35 +++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/ContentTags.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/ContentTags.java index e1df86c8..13da0975 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/ContentTags.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/ContentTags.java @@ -18,7 +18,38 @@ public record ContentTags(String etag, String lastMod) { /** Paints the tags onto the request builder. 
*/ public void paint(Request.Builder getBuilder) { - if (etag != null) getBuilder.addHeader("If-None-Match", etag); - if (lastMod != null) getBuilder.addHeader("If-Modified-Since", lastMod); + + System.out.println(ifNoneMatch() + " " + ifModifiedSince()); + + if (etag != null) { + getBuilder.addHeader("If-None-Match", ifNoneMatch()); + } + + if (lastMod != null) { + getBuilder.addHeader("If-Modified-Since", ifModifiedSince()); + } + } + + private String ifNoneMatch() { + // Remove the W/ prefix if it exists + + //'W/' (case-sensitive) indicates that a weak validator is used. Weak etags are + // easy to generate, but are far less useful for comparisons. Strong validators + // are ideal for comparisons but can be very difficult to generate efficiently. + // Weak ETag values of two representations of the same resources might be semantically + // equivalent, but not byte-for-byte identical. This means weak etags prevent caching + // when byte range requests are used, but strong etags mean range requests can + // still be cached. 
+ // - https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/ETag + + if (null != etag && etag.startsWith("W/")) { + return etag.substring(2); + } else { + return etag; + } + } + + private String ifModifiedSince() { + return lastMod; } } From 6dd87b037862a2bd221f35c3113317c39e882b97 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 22 Apr 2024 15:36:43 +0200 Subject: [PATCH 79/90] (crawler) Use the probe-result to reduce the likelihood of crawling both http and https This should drastically reduce the number of fetched documents on many domains --- .../crawl/retreival/CrawlerRetreiver.java | 20 ++++++------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index efae36aa..98409c01 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -72,11 +72,6 @@ public class CrawlerRetreiver implements AutoCloseable { crawlFrontier.addFirst(root); } - else { - // We know nothing about this domain, so we'll start with the index, trying both HTTP and HTTPS - crawlFrontier.addToQueue(new EdgeUrl("http", new EdgeDomain(domain), null, "/", null)); - crawlFrontier.addToQueue(new EdgeUrl("https", new EdgeDomain(domain), null, "/", null)); - } } // For testing @@ -89,7 +84,10 @@ public class CrawlerRetreiver implements AutoCloseable { } public int fetch(DomainLinks domainLinks, CrawlDataReference oldCrawlData) { - final DomainProber.ProbeResult probeResult = domainProber.probeDomain(fetcher, domain, crawlFrontier.peek()); + final DomainProber.ProbeResult probeResult = domainProber.probeDomain( + fetcher, + domain, + new EdgeUrl("http", new EdgeDomain(domain), null, "/", null)); try { return crawlDomain(oldCrawlData, probeResult, 
domainLinks); @@ -108,7 +106,6 @@ public class CrawlerRetreiver implements AutoCloseable { private int crawlDomain(CrawlDataReference oldCrawlData, DomainProber.ProbeResult probeResult, DomainLinks domainLinks) throws IOException, InterruptedException { String ip = findIp(domain); - EdgeUrl rootUrl; warcRecorder.writeWarcinfoHeader(ip, new EdgeDomain(domain), probeResult); @@ -123,13 +120,13 @@ public class CrawlerRetreiver implements AutoCloseable { assert !crawlFrontier.isEmpty(); - final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(crawlFrontier.peek().domain, warcRecorder); + final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(rootUrl.domain, warcRecorder); final CrawlDelayTimer delayTimer = new CrawlDelayTimer(robotsRules.getCrawlDelay()); sniffRootDocument(rootUrl); // Play back the old crawl data (if present) and fetch the documents comparing etags and last-modified - int recrawled = recrawl(oldCrawlData, robotsRules, delayTimer); + int recrawled = crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer); if (recrawled > 0) { // If we have reference data, we will always grow the crawl depth a bit @@ -199,11 +196,6 @@ public class CrawlerRetreiver implements AutoCloseable { return fetchedCount; } - /** Using the old crawl data, fetch the documents comparing etags and last-modified */ - private int recrawl(CrawlDataReference oldCrawlData, SimpleRobotRules robotsRules, CrawlDelayTimer delayTimer) throws InterruptedException { - return crawlerRevisitor.recrawl(oldCrawlData, robotsRules, delayTimer); - } - private void sniffRootDocument(EdgeUrl rootUrl) { try { logger.debug("Configuring link filter"); From a86b59689736cf804f368db468b5251f06a3f36e Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 22 Apr 2024 15:37:35 +0200 Subject: [PATCH 80/90] (crawler) Code quality --- .../java/nu/marginalia/crawl/CrawlerMain.java | 10 ++++++---- .../crawl/retreival/revisit/CrawlerRevisitor.java | 2 -- 
.../java/nu/marginalia/crawl/warc/WarcArchiverIf.java | 1 + 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java index be152d38..1b04c0f9 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java @@ -69,11 +69,11 @@ public class CrawlerMain extends ProcessMainClass { private final Map processingIds = new ConcurrentHashMap<>(); - final AbortMonitor abortMonitor = AbortMonitor.getInstance(); + private final AbortMonitor abortMonitor = AbortMonitor.getInstance(); + private final AtomicInteger tasksDone = new AtomicInteger(0); + private final HttpFetcherImpl fetcher; - volatile int totalTasks; - final AtomicInteger tasksDone = new AtomicInteger(0); - private HttpFetcherImpl fetcher; + private volatile int totalTasks; @Inject public CrawlerMain(UserAgent userAgent, @@ -263,6 +263,8 @@ public class CrawlerMain extends ProcessMainClass { CrawledDocumentParquetRecordFileWriter .convertWarc(domain, userAgent, newWarcFile, parquetFile); + // Optionally archive the WARC file if full retention is enabled, + // otherwise delete it: warcArchiver.consumeWarc(newWarcFile, domain); workLog.setJobToFinished(domain, parquetFile.toString(), size); diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java index 4c091302..55dbb3c2 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java @@ -86,8 +86,6 @@ public class CrawlerRevisitor { // fashion to make sure we eventually catch changes over time // and ensure we discover 
new links - crawlFrontier.addVisited(url); - // Hoover up any links from the document crawlFrontier.enqueueLinksFromDocument(url, Jsoup.parse(doc.documentBody)); diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/warc/WarcArchiverIf.java b/code/processes/crawling-process/java/nu/marginalia/crawl/warc/WarcArchiverIf.java index 80e64d7a..cc9eb8e8 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/warc/WarcArchiverIf.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/warc/WarcArchiverIf.java @@ -3,6 +3,7 @@ package nu.marginalia.crawl.warc; import java.io.IOException; import java.nio.file.Path; +/** Interface for archiving warc files. */ public interface WarcArchiverIf extends AutoCloseable { /** Process the warc file. After processing, the warc file is deleted. * Processing may be a no-op, depending on the implementation. From f430a084e878d635ee10c339aa39300eb7191235 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 22 Apr 2024 15:51:37 +0200 Subject: [PATCH 81/90] (crawler) Remove accidental log spam --- .../java/nu/marginalia/crawl/retreival/fetcher/ContentTags.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/ContentTags.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/ContentTags.java index 13da0975..a3b3a2bc 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/ContentTags.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/ContentTags.java @@ -19,8 +19,6 @@ public record ContentTags(String etag, String lastMod) { /** Paints the tags onto the request builder. 
*/ public void paint(Request.Builder getBuilder) { - System.out.println(ifNoneMatch() + " " + ifModifiedSince()); - if (etag != null) { getBuilder.addHeader("If-None-Match", ifNoneMatch()); } From e1c93133968e667063b4d6054fd567f80fdebb67 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Mon, 22 Apr 2024 17:26:31 +0200 Subject: [PATCH 82/90] (crawler) Emulate if-modified-since for domains that don't support the header This will help reduce the strain on some server software, in particular Discourse. --- .../retreival/fetcher/HttpFetcherImpl.java | 10 ++++ .../fetcher/SoftIfModifiedSinceProber.java | 49 +++++++++++++++++++ 2 files changed, 59 insertions(+) create mode 100644 code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/SoftIfModifiedSinceProber.java diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java index 77dc6463..da7ddd3e 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java @@ -42,6 +42,7 @@ public class HttpFetcherImpl implements HttpFetcher { private static final ContentTypeLogic contentTypeLogic = new ContentTypeLogic(); private final ContentTypeProber contentTypeProber; + private final SoftIfModifiedSinceProber softIfModifiedSinceProber; @Override public void setAllowAllContentTypes(boolean allowAllContentTypes) { @@ -93,6 +94,7 @@ public class HttpFetcherImpl implements HttpFetcher { this.userAgentString = userAgent.uaString(); this.userAgentIdentifier = userAgent.uaIdentifier(); this.contentTypeProber = new ContentTypeProber(userAgentString, client); + this.softIfModifiedSinceProber = new SoftIfModifiedSinceProber(userAgentString, client); } public HttpFetcherImpl(String userAgent) { @@ -100,6 +102,7 @@ public class 
HttpFetcherImpl implements HttpFetcher { this.userAgentString = userAgent; this.userAgentIdentifier = userAgent; this.contentTypeProber = new ContentTypeProber(userAgent, client); + this.softIfModifiedSinceProber = new SoftIfModifiedSinceProber(userAgent, client); } /** @@ -166,6 +169,13 @@ public class HttpFetcherImpl implements HttpFetcher { return new HttpFetchResult.ResultNone(); } } + else { + // Possibly do a soft probe to see if the URL has been modified since the last time we crawled it + // if we have reason to suspect ETags are not supported by the server. + if (softIfModifiedSinceProber.probeModificationTime(url, contentTags)) { + return new HttpFetchResult.Result304Raw(); + } + } var getBuilder = new Request.Builder().get(); diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/SoftIfModifiedSinceProber.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/SoftIfModifiedSinceProber.java new file mode 100644 index 00000000..238e8944 --- /dev/null +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/SoftIfModifiedSinceProber.java @@ -0,0 +1,49 @@ +package nu.marginalia.crawl.retreival.fetcher; + +import com.google.common.base.Strings; +import nu.marginalia.model.EdgeUrl; +import okhttp3.OkHttpClient; +import okhttp3.Request; + +import java.io.IOException; +import java.util.Objects; + +public class SoftIfModifiedSinceProber { + + private final String userAgentString; + private final OkHttpClient client; + + public SoftIfModifiedSinceProber(String userAgentString, OkHttpClient httpClient) { + this.userAgentString = userAgentString; + this.client = httpClient; + } + + /** Implement a soft probe of the last modified time of the given URL with a HEAD request. + * This is used to detect if the URL has been modified since the last time we crawled it. 
+ */ + public boolean probeModificationTime(EdgeUrl url, ContentTags tags) throws IOException { + var headBuilder = new Request.Builder().head() + .addHeader("User-agent", userAgentString) + .addHeader("Accept-Encoding", "gzip") + .url(url.toString()); + + // This logic is only applicable if we only have a last-modified time, but no ETag. + if (Strings.isNullOrEmpty(tags.lastMod())) + return false; + if (!Strings.isNullOrEmpty(tags.etag())) + return false; + + var head = headBuilder.build(); + var call = client.newCall(head); + + try (var rsp = call.execute()) { + if (rsp.code() != 200) { + return false; + } + + var contentTypeHeader = rsp.header("Last-Modified"); + return Objects.equals(contentTypeHeader, tags.lastMod()); + } + } + +} From 32fe864a33d52e4177cfa3fcab58b03b6aba8366 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 24 Apr 2024 13:54:04 +0200 Subject: [PATCH 83/90] (build) Java 22 and its consequences has been a disaster for Marginalia Search Roll back to JDK 21 for now, and make Java version configurable in the root build.gradle The project has run into no less than three distinct show-stopping bugs in JDK22, across multiple vendors, and gradle still doesn't fully support it, meaning you need multiple JDK versions installed. 
--- build.gradle | 6 ++++-- code/common/config/build.gradle | 2 +- code/common/db/build.gradle | 2 +- code/common/linkdb/build.gradle | 2 +- code/common/model/build.gradle | 2 +- code/common/process/build.gradle | 2 +- code/common/renderer/build.gradle | 2 +- code/common/service/build.gradle | 2 +- code/execution/api/build.gradle | 2 +- code/execution/build.gradle | 2 +- code/features-convert/adblock/build.gradle | 2 +- code/features-convert/anchor-keywords/build.gradle | 2 +- code/features-convert/data-extractors/build.gradle | 2 +- code/features-convert/keyword-extraction/build.gradle | 2 +- code/features-convert/pubdate/build.gradle | 2 +- code/features-convert/reddit-json/build.gradle | 2 +- code/features-convert/stackexchange-xml/build.gradle | 2 +- code/features-convert/summary-extraction/build.gradle | 2 +- code/features-convert/topic-detection/build.gradle | 2 +- code/features-crawl/content-type/build.gradle | 2 +- code/features-crawl/crawl-blocklist/build.gradle | 2 +- code/features-crawl/link-parser/build.gradle | 2 +- code/features-search/feedlot-client/build.gradle | 2 +- code/features-search/random-websites/build.gradle | 2 +- code/features-search/screenshots/build.gradle | 2 +- code/functions/domain-info/api/build.gradle | 2 +- code/functions/domain-info/build.gradle | 2 +- code/functions/link-graph/aggregate/build.gradle | 2 +- code/functions/link-graph/api/build.gradle | 2 +- code/functions/link-graph/partition/build.gradle | 2 +- code/functions/math/api/build.gradle | 2 +- code/functions/math/build.gradle | 2 +- code/functions/search-query/api/build.gradle | 2 +- code/functions/search-query/build.gradle | 2 +- code/index/api/build.gradle | 2 +- code/index/build.gradle | 2 +- code/index/index-forward/build.gradle | 2 +- code/index/index-journal/build.gradle | 2 +- code/index/index-reverse/build.gradle | 2 +- code/index/query/build.gradle | 2 +- code/libraries/array/build.gradle | 4 ++-- code/libraries/big-string/build.gradle | 2 +- 
code/libraries/blocking-thread-pool/build.gradle | 2 +- code/libraries/braille-block-punch-cards/build.gradle | 2 +- code/libraries/btree/build.gradle | 2 +- code/libraries/easy-lsh/build.gradle | 2 +- code/libraries/geo-ip/build.gradle | 2 +- code/libraries/guarded-regex/build.gradle | 2 +- code/libraries/language-processing/build.gradle | 2 +- code/libraries/message-queue/build.gradle | 2 +- code/libraries/next-prime/build.gradle | 2 +- code/libraries/random-write-funnel/build.gradle | 2 +- code/libraries/term-frequency-dict/build.gradle | 2 +- code/libraries/test-helpers/build.gradle | 2 +- code/process-models/crawl-spec/build.gradle | 2 +- code/process-models/crawling-model/build.gradle | 2 +- code/process-models/processed-data/build.gradle | 2 +- code/process-models/work-log/build.gradle | 2 +- code/process-mqapi/build.gradle | 2 +- code/processes/converting-process/build.gradle | 2 +- code/processes/crawling-process/build.gradle | 2 +- code/processes/index-constructor-process/build.gradle | 2 +- code/processes/loading-process/build.gradle | 2 +- code/processes/test-data/build.gradle | 2 +- code/processes/website-adjacencies-calculator/build.gradle | 2 +- code/services-application/api-service/build.gradle | 2 +- code/services-application/dating-service/build.gradle | 2 +- code/services-application/explorer-service/build.gradle | 2 +- code/services-application/search-service/build.gradle | 2 +- code/services-core/assistant-service/build.gradle | 2 +- code/services-core/control-service/build.gradle | 2 +- code/services-core/executor-service/build.gradle | 2 +- code/services-core/index-service/build.gradle | 2 +- code/services-core/query-service/build.gradle | 2 +- code/tools/experiment-runner/build.gradle | 2 +- code/tools/load-test/build.gradle | 2 +- code/tools/screenshot-capture-tool/build.gradle | 2 +- run/readme.md | 2 +- third-party/commons-codec/build.gradle | 2 +- third-party/count-min-sketch/build.gradle | 2 +- 
third-party/encyclopedia-marginalia-nu/build.gradle | 2 +- third-party/monkey-patch-opennlp/build.gradle | 2 +- third-party/openzim/build.gradle | 2 +- third-party/parquet-floor/build.gradle | 2 +- third-party/porterstemmer/build.gradle | 2 +- third-party/rdrpostagger/build.gradle | 2 +- third-party/symspell/build.gradle | 2 +- 87 files changed, 91 insertions(+), 89 deletions(-) diff --git a/build.gradle b/build.gradle index 2b35ee4e..dad52fa3 100644 --- a/build.gradle +++ b/build.gradle @@ -42,8 +42,10 @@ subprojects.forEach {it -> } } + ext { - dockerImageBase='container-registry.oracle.com/graalvm/jdk:22@sha256:22d2ca0d4fb378f50306ec2fda3178cce4523c4fe64e869108571c3c6e7026c8\n' + jvmVersion=21 + dockerImageBase='container-registry.oracle.com/graalvm/jdk:21@sha256:1fd33d4d4eba3a9e1a41a728e39ea217178d257694eea1214fec68d2ed4d3d9b' dockerImageTag='latest' dockerImageRegistry='marginalia' } @@ -66,7 +68,7 @@ idea { } java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/common/config/build.gradle b/code/common/config/build.gradle index 74fdf702..6dc92737 100644 --- a/code/common/config/build.gradle +++ b/code/common/config/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/common/db/build.gradle b/code/common/db/build.gradle index 7e85c3de..48bbb85a 100644 --- a/code/common/db/build.gradle +++ b/code/common/db/build.gradle @@ -17,7 +17,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/common/linkdb/build.gradle b/code/common/linkdb/build.gradle index 14c7f056..6f544589 100644 --- a/code/common/linkdb/build.gradle +++ b/code/common/linkdb/build.gradle @@ -6,7 +6,7 @@ plugins { java 
{ toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/common/model/build.gradle b/code/common/model/build.gradle index 1e6c7566..26e23395 100644 --- a/code/common/model/build.gradle +++ b/code/common/model/build.gradle @@ -6,7 +6,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/common/process/build.gradle b/code/common/process/build.gradle index 908bfae1..d242281b 100644 --- a/code/common/process/build.gradle +++ b/code/common/process/build.gradle @@ -6,7 +6,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/common/renderer/build.gradle b/code/common/renderer/build.gradle index fa79e153..354363e4 100644 --- a/code/common/renderer/build.gradle +++ b/code/common/renderer/build.gradle @@ -8,7 +8,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } apply from: "$rootProject.projectDir/srcsets.gradle" diff --git a/code/common/service/build.gradle b/code/common/service/build.gradle index 57342fa1..24cdb3a6 100644 --- a/code/common/service/build.gradle +++ b/code/common/service/build.gradle @@ -5,7 +5,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/execution/api/build.gradle b/code/execution/api/build.gradle index 02e8100e..c828b014 100644 --- a/code/execution/api/build.gradle +++ b/code/execution/api/build.gradle @@ -8,7 +8,7 @@ jar.archiveBaseName = 'execution-api' java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + 
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/execution/build.gradle b/code/execution/build.gradle index 3824a8c1..cfd29a8d 100644 --- a/code/execution/build.gradle +++ b/code/execution/build.gradle @@ -5,7 +5,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/features-convert/adblock/build.gradle b/code/features-convert/adblock/build.gradle index 11a7c5ea..612c4fbb 100644 --- a/code/features-convert/adblock/build.gradle +++ b/code/features-convert/adblock/build.gradle @@ -9,7 +9,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/features-convert/anchor-keywords/build.gradle b/code/features-convert/anchor-keywords/build.gradle index ae92b066..3fe67739 100644 --- a/code/features-convert/anchor-keywords/build.gradle +++ b/code/features-convert/anchor-keywords/build.gradle @@ -6,7 +6,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/features-convert/data-extractors/build.gradle b/code/features-convert/data-extractors/build.gradle index 69ae1388..329f592e 100644 --- a/code/features-convert/data-extractors/build.gradle +++ b/code/features-convert/data-extractors/build.gradle @@ -9,7 +9,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/features-convert/keyword-extraction/build.gradle b/code/features-convert/keyword-extraction/build.gradle index c63fc263..3d4b3507 100644 --- a/code/features-convert/keyword-extraction/build.gradle +++ b/code/features-convert/keyword-extraction/build.gradle @@ -6,7 +6,7 @@ plugins { 
java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/features-convert/pubdate/build.gradle b/code/features-convert/pubdate/build.gradle index 1a33a4a7..43712d44 100644 --- a/code/features-convert/pubdate/build.gradle +++ b/code/features-convert/pubdate/build.gradle @@ -9,7 +9,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/features-convert/reddit-json/build.gradle b/code/features-convert/reddit-json/build.gradle index afbc6961..2303e985 100644 --- a/code/features-convert/reddit-json/build.gradle +++ b/code/features-convert/reddit-json/build.gradle @@ -6,7 +6,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/features-convert/stackexchange-xml/build.gradle b/code/features-convert/stackexchange-xml/build.gradle index bda05817..7f6d72c3 100644 --- a/code/features-convert/stackexchange-xml/build.gradle +++ b/code/features-convert/stackexchange-xml/build.gradle @@ -6,7 +6,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/features-convert/summary-extraction/build.gradle b/code/features-convert/summary-extraction/build.gradle index 189b317b..c0b62102 100644 --- a/code/features-convert/summary-extraction/build.gradle +++ b/code/features-convert/summary-extraction/build.gradle @@ -6,7 +6,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/features-convert/topic-detection/build.gradle b/code/features-convert/topic-detection/build.gradle index 
622d422b..e6c0a19e 100644 --- a/code/features-convert/topic-detection/build.gradle +++ b/code/features-convert/topic-detection/build.gradle @@ -9,7 +9,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/features-crawl/content-type/build.gradle b/code/features-crawl/content-type/build.gradle index 16ecddd1..a8c8ded6 100644 --- a/code/features-crawl/content-type/build.gradle +++ b/code/features-crawl/content-type/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/features-crawl/crawl-blocklist/build.gradle b/code/features-crawl/crawl-blocklist/build.gradle index 98741b80..8ebc19fe 100644 --- a/code/features-crawl/crawl-blocklist/build.gradle +++ b/code/features-crawl/crawl-blocklist/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/features-crawl/link-parser/build.gradle b/code/features-crawl/link-parser/build.gradle index f69a255b..751f5a3b 100644 --- a/code/features-crawl/link-parser/build.gradle +++ b/code/features-crawl/link-parser/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/features-search/feedlot-client/build.gradle b/code/features-search/feedlot-client/build.gradle index ef42210b..d7a430e9 100644 --- a/code/features-search/feedlot-client/build.gradle +++ b/code/features-search/feedlot-client/build.gradle @@ -5,7 +5,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } 
diff --git a/code/features-search/random-websites/build.gradle b/code/features-search/random-websites/build.gradle index fb0dd3ed..34d9744b 100644 --- a/code/features-search/random-websites/build.gradle +++ b/code/features-search/random-websites/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/features-search/screenshots/build.gradle b/code/features-search/screenshots/build.gradle index 54eb6542..f7fb1b1a 100644 --- a/code/features-search/screenshots/build.gradle +++ b/code/features-search/screenshots/build.gradle @@ -6,7 +6,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/functions/domain-info/api/build.gradle b/code/functions/domain-info/api/build.gradle index 3ac3428e..5ea26173 100644 --- a/code/functions/domain-info/api/build.gradle +++ b/code/functions/domain-info/api/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/functions/domain-info/build.gradle b/code/functions/domain-info/build.gradle index c968b0ed..d41c2875 100644 --- a/code/functions/domain-info/build.gradle +++ b/code/functions/domain-info/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/functions/link-graph/aggregate/build.gradle b/code/functions/link-graph/aggregate/build.gradle index 213790b9..1fe842f0 100644 --- a/code/functions/link-graph/aggregate/build.gradle +++ b/code/functions/link-graph/aggregate/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + 
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/functions/link-graph/api/build.gradle b/code/functions/link-graph/api/build.gradle index a16163b2..1573f1a8 100644 --- a/code/functions/link-graph/api/build.gradle +++ b/code/functions/link-graph/api/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/functions/link-graph/partition/build.gradle b/code/functions/link-graph/partition/build.gradle index 766ed56c..34592680 100644 --- a/code/functions/link-graph/partition/build.gradle +++ b/code/functions/link-graph/partition/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/functions/math/api/build.gradle b/code/functions/math/api/build.gradle index 90c536b0..760b2dee 100644 --- a/code/functions/math/api/build.gradle +++ b/code/functions/math/api/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/functions/math/build.gradle b/code/functions/math/build.gradle index 814f57bc..93c518b8 100644 --- a/code/functions/math/build.gradle +++ b/code/functions/math/build.gradle @@ -5,7 +5,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/functions/search-query/api/build.gradle b/code/functions/search-query/api/build.gradle index 1a8d55d2..424ba97d 100644 --- a/code/functions/search-query/api/build.gradle +++ b/code/functions/search-query/api/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + 
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/functions/search-query/build.gradle b/code/functions/search-query/build.gradle index 7b792b48..7e7d46b1 100644 --- a/code/functions/search-query/build.gradle +++ b/code/functions/search-query/build.gradle @@ -5,7 +5,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/index/api/build.gradle b/code/index/api/build.gradle index d07a24eb..7074052e 100644 --- a/code/index/api/build.gradle +++ b/code/index/api/build.gradle @@ -5,7 +5,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/index/build.gradle b/code/index/build.gradle index 37275b0a..4fc07aa8 100644 --- a/code/index/build.gradle +++ b/code/index/build.gradle @@ -5,7 +5,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/index/index-forward/build.gradle b/code/index/index-forward/build.gradle index 96526205..cf453e73 100644 --- a/code/index/index-forward/build.gradle +++ b/code/index/index-forward/build.gradle @@ -6,7 +6,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/index/index-journal/build.gradle b/code/index/index-journal/build.gradle index 988ce618..5380c0be 100644 --- a/code/index/index-journal/build.gradle +++ b/code/index/index-journal/build.gradle @@ -6,7 +6,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/index/index-reverse/build.gradle 
b/code/index/index-reverse/build.gradle index 36367546..bd46b3a0 100644 --- a/code/index/index-reverse/build.gradle +++ b/code/index/index-reverse/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/index/query/build.gradle b/code/index/query/build.gradle index 615d9fb7..bd819f3a 100644 --- a/code/index/query/build.gradle +++ b/code/index/query/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/libraries/array/build.gradle b/code/libraries/array/build.gradle index d7858a21..2e1b8051 100644 --- a/code/libraries/array/build.gradle +++ b/code/libraries/array/build.gradle @@ -5,7 +5,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } @@ -30,7 +30,7 @@ jmh { } tasks.withType(me.champeau.jmh.WithJavaToolchain).configureEach { javaLauncher.set(javaToolchains.launcherFor { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) }) } tasks.withType(me.champeau.jmh.JmhBytecodeGeneratorTask).configureEach { diff --git a/code/libraries/big-string/build.gradle b/code/libraries/big-string/build.gradle index c6d4c00f..4b5e0df1 100644 --- a/code/libraries/big-string/build.gradle +++ b/code/libraries/big-string/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/libraries/blocking-thread-pool/build.gradle b/code/libraries/blocking-thread-pool/build.gradle index 0a513f92..425d2c12 100644 --- a/code/libraries/blocking-thread-pool/build.gradle +++ 
b/code/libraries/blocking-thread-pool/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/libraries/braille-block-punch-cards/build.gradle b/code/libraries/braille-block-punch-cards/build.gradle index d6b8c6e6..5acf065c 100644 --- a/code/libraries/braille-block-punch-cards/build.gradle +++ b/code/libraries/braille-block-punch-cards/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/libraries/btree/build.gradle b/code/libraries/btree/build.gradle index 37060dd5..bdfb803d 100644 --- a/code/libraries/btree/build.gradle +++ b/code/libraries/btree/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/libraries/easy-lsh/build.gradle b/code/libraries/easy-lsh/build.gradle index 6c66bdde..0279e426 100644 --- a/code/libraries/easy-lsh/build.gradle +++ b/code/libraries/easy-lsh/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/libraries/geo-ip/build.gradle b/code/libraries/geo-ip/build.gradle index 4fd467aa..b64b911a 100644 --- a/code/libraries/geo-ip/build.gradle +++ b/code/libraries/geo-ip/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/libraries/guarded-regex/build.gradle b/code/libraries/guarded-regex/build.gradle index f310116e..8cfa6060 100644 --- a/code/libraries/guarded-regex/build.gradle +++ 
b/code/libraries/guarded-regex/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/libraries/language-processing/build.gradle b/code/libraries/language-processing/build.gradle index cc745397..57f053fe 100644 --- a/code/libraries/language-processing/build.gradle +++ b/code/libraries/language-processing/build.gradle @@ -5,7 +5,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/libraries/message-queue/build.gradle b/code/libraries/message-queue/build.gradle index d2618d95..240dd6c1 100644 --- a/code/libraries/message-queue/build.gradle +++ b/code/libraries/message-queue/build.gradle @@ -5,7 +5,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/libraries/next-prime/build.gradle b/code/libraries/next-prime/build.gradle index 0a513f92..425d2c12 100644 --- a/code/libraries/next-prime/build.gradle +++ b/code/libraries/next-prime/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/libraries/random-write-funnel/build.gradle b/code/libraries/random-write-funnel/build.gradle index a7acb1fa..39479864 100644 --- a/code/libraries/random-write-funnel/build.gradle +++ b/code/libraries/random-write-funnel/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/libraries/term-frequency-dict/build.gradle b/code/libraries/term-frequency-dict/build.gradle index 3a9a4d8d..ac06134b 100644 --- 
a/code/libraries/term-frequency-dict/build.gradle +++ b/code/libraries/term-frequency-dict/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/libraries/test-helpers/build.gradle b/code/libraries/test-helpers/build.gradle index 875e636d..d8dac98f 100644 --- a/code/libraries/test-helpers/build.gradle +++ b/code/libraries/test-helpers/build.gradle @@ -6,7 +6,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/process-models/crawl-spec/build.gradle b/code/process-models/crawl-spec/build.gradle index 70850445..2737e54a 100644 --- a/code/process-models/crawl-spec/build.gradle +++ b/code/process-models/crawl-spec/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } apply from: "$rootProject.projectDir/srcsets.gradle" diff --git a/code/process-models/crawling-model/build.gradle b/code/process-models/crawling-model/build.gradle index 5926e03d..bb789778 100644 --- a/code/process-models/crawling-model/build.gradle +++ b/code/process-models/crawling-model/build.gradle @@ -8,7 +8,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/process-models/processed-data/build.gradle b/code/process-models/processed-data/build.gradle index 9668d0b8..04ee95de 100644 --- a/code/process-models/processed-data/build.gradle +++ b/code/process-models/processed-data/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } apply from: 
"$rootProject.projectDir/srcsets.gradle" diff --git a/code/process-models/work-log/build.gradle b/code/process-models/work-log/build.gradle index c56174ca..76fe01f9 100644 --- a/code/process-models/work-log/build.gradle +++ b/code/process-models/work-log/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } apply from: "$rootProject.projectDir/srcsets.gradle" diff --git a/code/process-mqapi/build.gradle b/code/process-mqapi/build.gradle index 514ca034..339c52c8 100644 --- a/code/process-mqapi/build.gradle +++ b/code/process-mqapi/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index 20532994..28d68edd 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/processes/crawling-process/build.gradle b/code/processes/crawling-process/build.gradle index 6ed789c6..f309bf92 100644 --- a/code/processes/crawling-process/build.gradle +++ b/code/processes/crawling-process/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/processes/index-constructor-process/build.gradle b/code/processes/index-constructor-process/build.gradle index ccec9b30..c22e158e 100644 --- a/code/processes/index-constructor-process/build.gradle +++ b/code/processes/index-constructor-process/build.gradle @@ -7,7 +7,7 @@ plugins { java { 
toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/processes/loading-process/build.gradle b/code/processes/loading-process/build.gradle index c396c52f..86c79095 100644 --- a/code/processes/loading-process/build.gradle +++ b/code/processes/loading-process/build.gradle @@ -6,7 +6,7 @@ plugins { } java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/processes/test-data/build.gradle b/code/processes/test-data/build.gradle index 4c2fef49..9cda7ad9 100644 --- a/code/processes/test-data/build.gradle +++ b/code/processes/test-data/build.gradle @@ -5,7 +5,7 @@ plugins { } java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/processes/website-adjacencies-calculator/build.gradle b/code/processes/website-adjacencies-calculator/build.gradle index 63e56286..6d990d14 100644 --- a/code/processes/website-adjacencies-calculator/build.gradle +++ b/code/processes/website-adjacencies-calculator/build.gradle @@ -6,7 +6,7 @@ plugins { } java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/services-application/api-service/build.gradle b/code/services-application/api-service/build.gradle index cb851a67..cbc26c3c 100644 --- a/code/services-application/api-service/build.gradle +++ b/code/services-application/api-service/build.gradle @@ -8,7 +8,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/services-application/dating-service/build.gradle b/code/services-application/dating-service/build.gradle index b574c1f8..95f2c4fb 100644 
--- a/code/services-application/dating-service/build.gradle +++ b/code/services-application/dating-service/build.gradle @@ -18,7 +18,7 @@ apply from: "$rootProject.projectDir/docker.gradle" java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } apply from: "$rootProject.projectDir/srcsets.gradle" diff --git a/code/services-application/explorer-service/build.gradle b/code/services-application/explorer-service/build.gradle index cbea1f2c..c02ff9de 100644 --- a/code/services-application/explorer-service/build.gradle +++ b/code/services-application/explorer-service/build.gradle @@ -18,7 +18,7 @@ apply from: "$rootProject.projectDir/docker.gradle" java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } apply from: "$rootProject.projectDir/srcsets.gradle" diff --git a/code/services-application/search-service/build.gradle b/code/services-application/search-service/build.gradle index 54622609..5fae4e58 100644 --- a/code/services-application/search-service/build.gradle +++ b/code/services-application/search-service/build.gradle @@ -18,7 +18,7 @@ tasks.distZip.enabled = false java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } sass { diff --git a/code/services-core/assistant-service/build.gradle b/code/services-core/assistant-service/build.gradle index d1550bcb..283e7804 100644 --- a/code/services-core/assistant-service/build.gradle +++ b/code/services-core/assistant-service/build.gradle @@ -15,7 +15,7 @@ tasks.distZip.enabled = false java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/services-core/control-service/build.gradle b/code/services-core/control-service/build.gradle index 
56c2be91..ae793b89 100644 --- a/code/services-core/control-service/build.gradle +++ b/code/services-core/control-service/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/services-core/executor-service/build.gradle b/code/services-core/executor-service/build.gradle index 77b41a9e..96adbc30 100644 --- a/code/services-core/executor-service/build.gradle +++ b/code/services-core/executor-service/build.gradle @@ -15,7 +15,7 @@ tasks.distZip.enabled = false java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/services-core/index-service/build.gradle b/code/services-core/index-service/build.gradle index 8a07c91a..23b942d1 100644 --- a/code/services-core/index-service/build.gradle +++ b/code/services-core/index-service/build.gradle @@ -18,7 +18,7 @@ apply from: "$rootProject.projectDir/docker.gradle" java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } apply from: "$rootProject.projectDir/srcsets.gradle" diff --git a/code/services-core/query-service/build.gradle b/code/services-core/query-service/build.gradle index 11f159bc..8feeb5c5 100644 --- a/code/services-core/query-service/build.gradle +++ b/code/services-core/query-service/build.gradle @@ -18,7 +18,7 @@ apply from: "$rootProject.projectDir/docker.gradle" java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/tools/experiment-runner/build.gradle b/code/tools/experiment-runner/build.gradle index d2cbc29b..58eca872 100644 --- a/code/tools/experiment-runner/build.gradle +++ b/code/tools/experiment-runner/build.gradle @@ -8,7 +8,7 @@ plugins { java { toolchain 
{ - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/tools/load-test/build.gradle b/code/tools/load-test/build.gradle index ca14347e..5f85c685 100644 --- a/code/tools/load-test/build.gradle +++ b/code/tools/load-test/build.gradle @@ -7,7 +7,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/code/tools/screenshot-capture-tool/build.gradle b/code/tools/screenshot-capture-tool/build.gradle index e2579be7..82be9729 100644 --- a/code/tools/screenshot-capture-tool/build.gradle +++ b/code/tools/screenshot-capture-tool/build.gradle @@ -8,7 +8,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/run/readme.md b/run/readme.md index 5d87e93f..0a890feb 100644 --- a/run/readme.md +++ b/run/readme.md @@ -11,7 +11,7 @@ documentation. **Docker** - It is a bit of a pain to install, but if you follow [this guide](https://docs.docker.com/engine/install/ubuntu/#install-using-the-repository) you're on the right track for ubuntu-like systems. -**JDK 22** - The code uses Java 22 preview features. +**JDK 21** - The code uses Java 21 preview features. The civilized way of installing this is to use [SDKMAN](https://sdkman.io/); graalce is a good distribution choice but it doesn't matter too much. 
diff --git a/third-party/commons-codec/build.gradle b/third-party/commons-codec/build.gradle index d5974fb9..c6e072bd 100644 --- a/third-party/commons-codec/build.gradle +++ b/third-party/commons-codec/build.gradle @@ -5,7 +5,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/third-party/count-min-sketch/build.gradle b/third-party/count-min-sketch/build.gradle index 52becdd0..fc721edc 100644 --- a/third-party/count-min-sketch/build.gradle +++ b/third-party/count-min-sketch/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/third-party/encyclopedia-marginalia-nu/build.gradle b/third-party/encyclopedia-marginalia-nu/build.gradle index f104b712..2f3b4e94 100644 --- a/third-party/encyclopedia-marginalia-nu/build.gradle +++ b/third-party/encyclopedia-marginalia-nu/build.gradle @@ -5,7 +5,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/third-party/monkey-patch-opennlp/build.gradle b/third-party/monkey-patch-opennlp/build.gradle index a8aa4366..5bfe52bc 100644 --- a/third-party/monkey-patch-opennlp/build.gradle +++ b/third-party/monkey-patch-opennlp/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/third-party/openzim/build.gradle b/third-party/openzim/build.gradle index 12a35aa1..9d5e2cdd 100644 --- a/third-party/openzim/build.gradle +++ b/third-party/openzim/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + 
languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/third-party/parquet-floor/build.gradle b/third-party/parquet-floor/build.gradle index 0e9ed00e..7b0de520 100644 --- a/third-party/parquet-floor/build.gradle +++ b/third-party/parquet-floor/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/third-party/porterstemmer/build.gradle b/third-party/porterstemmer/build.gradle index 52becdd0..fc721edc 100644 --- a/third-party/porterstemmer/build.gradle +++ b/third-party/porterstemmer/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/third-party/rdrpostagger/build.gradle b/third-party/rdrpostagger/build.gradle index 1b076c14..38186ce5 100644 --- a/third-party/rdrpostagger/build.gradle +++ b/third-party/rdrpostagger/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } diff --git a/third-party/symspell/build.gradle b/third-party/symspell/build.gradle index 52becdd0..fc721edc 100644 --- a/third-party/symspell/build.gradle +++ b/third-party/symspell/build.gradle @@ -4,7 +4,7 @@ plugins { java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } From 91a98a88076d6c70aee0af9ecc53957c0d02185a Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 24 Apr 2024 14:10:03 +0200 Subject: [PATCH 84/90] (crawler) Reduce log noise from timeouts in SoftIfModifiedSinceProber --- .../crawl/retreival/fetcher/SoftIfModifiedSinceProber.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git 
a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/SoftIfModifiedSinceProber.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/SoftIfModifiedSinceProber.java index 238e8944..7b6071e9 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/SoftIfModifiedSinceProber.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/SoftIfModifiedSinceProber.java @@ -6,6 +6,7 @@ import okhttp3.OkHttpClient; import okhttp3.Request; import java.io.IOException; +import java.net.SocketTimeoutException; import java.util.Objects; public class SoftIfModifiedSinceProber { @@ -44,6 +45,9 @@ public class SoftIfModifiedSinceProber { var contentTypeHeader = rsp.header("Last-Modified"); return Objects.equals(contentTypeHeader, tags.lastMod()); } + catch (SocketTimeoutException e) { // suppress timeout exceptions to reduce log noise + return false; + } } } From 282022d64ec83bc06c8edaa24bef059341c873a3 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Wed, 24 Apr 2024 14:38:59 +0200 Subject: [PATCH 85/90] (crawler) Remove unnecessary double-fetch of the root document --- .../nu/marginalia/crawl/retreival/CrawlerRetreiver.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java index 98409c01..9ac68ce4 100644 --- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java +++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java @@ -117,9 +117,6 @@ public class CrawlerRetreiver implements AutoCloseable { rootUrl = ok.probedUrl(); } - - assert !crawlFrontier.isEmpty(); - final SimpleRobotRules robotsRules = fetcher.fetchRobotRules(rootUrl.domain, warcRecorder); final CrawlDelayTimer delayTimer = new 
CrawlDelayTimer(robotsRules.getCrawlDelay()); @@ -236,6 +233,9 @@ public class CrawlerRetreiver implements AutoCloseable { catch (Exception ex) { logger.error("Error configuring link filter", ex); } + finally { + crawlFrontier.addVisited(rootUrl); + } } public HttpFetchResult fetchWriteAndSleep(EdgeUrl top, From 3952ef6ca5e6c00ad718a19cd597d8f5e8167c69 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 25 Apr 2024 13:49:57 +0200 Subject: [PATCH 86/90] (service) Let singleservice configure ports and bind addresses --- .../service/discovery/ZkServiceRegistry.java | 6 +- .../java/nu/marginalia/SingleService.java | 81 +++++++++++++++---- 2 files changed, 71 insertions(+), 16 deletions(-) diff --git a/code/common/service/java/nu/marginalia/service/discovery/ZkServiceRegistry.java b/code/common/service/java/nu/marginalia/service/discovery/ZkServiceRegistry.java index 43fa5a54..446c99a2 100644 --- a/code/common/service/java/nu/marginalia/service/discovery/ZkServiceRegistry.java +++ b/code/common/service/java/nu/marginalia/service/discovery/ZkServiceRegistry.java @@ -140,13 +140,15 @@ public class ZkServiceRegistry implements ServiceRegistryIf { @Override public int requestPort(String externalHost, ServiceKey key) { + if (!Boolean.getBoolean("service.random-port")) { return switch (key) { - case ServiceKey.Rest rest -> 80; - case ServiceKey.Grpc grpc -> 81; + case ServiceKey.Rest rest -> Integer.getInteger("service.http-port", 80); + case ServiceKey.Grpc grpc -> Integer.getInteger("service.grpc-port",81); }; } + int portRangeLow = 12_000; int portRangeHigh = 12_999; diff --git a/code/services-core/single-service-runner/java/nu/marginalia/SingleService.java b/code/services-core/single-service-runner/java/nu/marginalia/SingleService.java index 5a793a0d..e6ba4870 100644 --- a/code/services-core/single-service-runner/java/nu/marginalia/SingleService.java +++ b/code/services-core/single-service-runner/java/nu/marginalia/SingleService.java @@ -1,6 +1,73 @@ package 
nu.marginalia; +/** Springboard for launching services outside of docker */ public class SingleService { + + public static void main(String... args) { + if (!configure(args)) { + System.out.println("Usage: SingleService bind-address:bind-port-http:bind-port-grpc announce-address [args...]"); + return; + } + + requireEnv("ZOOKEEPER_HOSTS", "Comma-separated list of zookeeper hosts"); + requireEnv("WMSA_HOME", "Path to the install directory of the project"); + + String serviceName = args[0]; + String[] serviceArgs = new String[args.length - 3]; + System.arraycopy(args, 3, serviceArgs, 0, serviceArgs.length); + + for (var service : Service.values()) { + if (service.name.equals(serviceName)) { + service.run(serviceArgs); + } + } + } + + private static void requireEnv(String env, String desc) { + if (System.getenv(env) == null) { + throw new IllegalArgumentException("Missing environment variable: " + env + " - " + desc); + } + else { + System.out.println("Found environment variable: " + env + " = " + System.getenv(env)); + } + } + + /** Set system properties for the address and ports for the service. 
+ * + * @return true if the configuration was successful + * */ + private static boolean configure(String[] args) { + if (args.length < 3) + return false; + + try { + final String bindAddress_http_grpc = args[1]; + final String announceAddress = args[2]; + + final String[] bindParts = bindAddress_http_grpc.split(":"); + if (bindParts.length < 3) + return false; + + String bindAddress = bindParts[0]; + + int httpPort = Integer.parseInt(bindParts[1]); + int grpcPort = Integer.parseInt(bindParts[2]); + + System.out.println("Configuring service with bind address: " + bindAddress + " http port: " + httpPort + " grpc port: " + grpcPort + " announce address: " + announceAddress); + + System.setProperty("service.bind-address", bindAddress); + System.setProperty("service.http-port", Integer.toString(httpPort)); + System.setProperty("service.grpc-port", Integer.toString(grpcPort)); + System.setProperty("service.host", announceAddress); + + return true; + } + catch (NumberFormatException e) { + return false; + } + + } + enum Service { IndexService("index", "nu.marginalia.index.IndexMain"), ControlService("control", "nu.marginalia.control.ControlMain"), @@ -31,19 +98,5 @@ public class SingleService { } } - public static void main(String... 
args) { - if (args.length == 0) { - System.out.println("Usage: SingleService [args...]"); - } - String serviceName = args[0]; - String[] serviceArgs = new String[args.length - 1]; - System.arraycopy(args, 1, serviceArgs, 0, serviceArgs.length); - - for (var service : Service.values()) { - if (service.name.equals(serviceName)) { - service.run(serviceArgs); - } - } - } } From 6690e9bde8110b6cc99df17291ec485f3bca67b7 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 25 Apr 2024 15:08:33 +0200 Subject: [PATCH 87/90] (service) Ensure the service discovery starts early This is necessary as we use zookeeper to orchestrate first-time startup of the services, to ensure that the database is properly migrated by the control service before anything else is permitted to start. --- .../api-service/java/nu/marginalia/api/ApiMain.java | 5 +++++ .../java/nu/marginalia/dating/DatingMain.java | 4 ++++ .../java/nu/marginalia/explorer/ExplorerMain.java | 4 ++++ .../java/nu/marginalia/search/SearchMain.java | 4 ++++ .../java/nu/marginalia/assistant/AssistantMain.java | 5 +++++ .../java/nu/marginalia/control/ControlMain.java | 4 ++++ .../java/nu/marginalia/executor/ExecutorMain.java | 6 +++++- .../index-service/java/nu/marginalia/index/IndexMain.java | 4 ++++ .../query-service/java/nu/marginalia/query/QueryMain.java | 4 ++++ 9 files changed, 39 insertions(+), 1 deletion(-) diff --git a/code/services-application/api-service/java/nu/marginalia/api/ApiMain.java b/code/services-application/api-service/java/nu/marginalia/api/ApiMain.java index 3eef045a..dd2b739f 100644 --- a/code/services-application/api-service/java/nu/marginalia/api/ApiMain.java +++ b/code/services-application/api-service/java/nu/marginalia/api/ApiMain.java @@ -4,6 +4,7 @@ import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; import nu.marginalia.service.MainClass; +import nu.marginalia.service.discovery.ServiceRegistryIf; import 
nu.marginalia.service.module.ServiceDiscoveryModule; import nu.marginalia.service.ServiceId; import nu.marginalia.service.module.ServiceConfigurationModule; @@ -23,6 +24,10 @@ public class ApiMain extends MainClass { new DatabaseModule(false), new ServiceDiscoveryModule(), new ServiceConfigurationModule(ServiceId.Api)); + + // Ensure that the service registry is initialized early + injector.getInstance(ServiceRegistryIf.class); + injector.getInstance(ApiMain.class); injector.getInstance(Initialization.class).setReady(); } diff --git a/code/services-application/dating-service/java/nu/marginalia/dating/DatingMain.java b/code/services-application/dating-service/java/nu/marginalia/dating/DatingMain.java index cf85016b..3b0655f8 100644 --- a/code/services-application/dating-service/java/nu/marginalia/dating/DatingMain.java +++ b/code/services-application/dating-service/java/nu/marginalia/dating/DatingMain.java @@ -4,6 +4,7 @@ import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; import nu.marginalia.service.MainClass; +import nu.marginalia.service.discovery.ServiceRegistryIf; import nu.marginalia.service.module.ServiceDiscoveryModule; import nu.marginalia.service.ServiceId; import nu.marginalia.service.module.ServiceConfigurationModule; @@ -31,6 +32,9 @@ public class DatingMain extends MainClass { new DatabaseModule(false) ); + // Ensure that the service registry is initialized early + injector.getInstance(ServiceRegistryIf.class); + injector.getInstance(DatingMain.class); injector.getInstance(Initialization.class).setReady(); } diff --git a/code/services-application/explorer-service/java/nu/marginalia/explorer/ExplorerMain.java b/code/services-application/explorer-service/java/nu/marginalia/explorer/ExplorerMain.java index 5a1fd734..a6b7b4fe 100644 --- a/code/services-application/explorer-service/java/nu/marginalia/explorer/ExplorerMain.java +++ 
b/code/services-application/explorer-service/java/nu/marginalia/explorer/ExplorerMain.java @@ -4,6 +4,7 @@ import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; import nu.marginalia.service.MainClass; +import nu.marginalia.service.discovery.ServiceRegistryIf; import nu.marginalia.service.module.ServiceDiscoveryModule; import nu.marginalia.service.ServiceId; import nu.marginalia.service.module.ServiceConfigurationModule; @@ -31,6 +32,9 @@ public class ExplorerMain extends MainClass { new DatabaseModule(false) ); + // Ensure that the service registry is initialized early + injector.getInstance(ServiceRegistryIf.class); + injector.getInstance(ExplorerMain.class); injector.getInstance(Initialization.class).setReady(); } diff --git a/code/services-application/search-service/java/nu/marginalia/search/SearchMain.java b/code/services-application/search-service/java/nu/marginalia/search/SearchMain.java index 01350592..b93d641b 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/SearchMain.java +++ b/code/services-application/search-service/java/nu/marginalia/search/SearchMain.java @@ -4,6 +4,7 @@ import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; import nu.marginalia.service.MainClass; +import nu.marginalia.service.discovery.ServiceRegistryIf; import nu.marginalia.service.module.ServiceDiscoveryModule; import nu.marginalia.service.ServiceId; import nu.marginalia.service.module.ServiceConfigurationModule; @@ -32,6 +33,9 @@ public class SearchMain extends MainClass { new DatabaseModule(false) ); + // Ensure that the service registry is initialized early + injector.getInstance(ServiceRegistryIf.class); + injector.getInstance(SearchMain.class); injector.getInstance(Initialization.class).setReady(); diff --git a/code/services-core/assistant-service/java/nu/marginalia/assistant/AssistantMain.java 
b/code/services-core/assistant-service/java/nu/marginalia/assistant/AssistantMain.java index 1f583587..cf3606b2 100644 --- a/code/services-core/assistant-service/java/nu/marginalia/assistant/AssistantMain.java +++ b/code/services-core/assistant-service/java/nu/marginalia/assistant/AssistantMain.java @@ -4,6 +4,7 @@ import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; import nu.marginalia.service.MainClass; +import nu.marginalia.service.discovery.ServiceRegistryIf; import nu.marginalia.service.module.ServiceDiscoveryModule; import nu.marginalia.service.ServiceId; import nu.marginalia.service.module.ServiceConfigurationModule; @@ -28,6 +29,10 @@ public class AssistantMain extends MainClass { new DatabaseModule(false) ); + + // Ensure that the service registry is initialized early + injector.getInstance(ServiceRegistryIf.class); + injector.getInstance(AssistantMain.class); injector.getInstance(Initialization.class).setReady(); diff --git a/code/services-core/control-service/java/nu/marginalia/control/ControlMain.java b/code/services-core/control-service/java/nu/marginalia/control/ControlMain.java index 6946f8d6..f329f4d5 100644 --- a/code/services-core/control-service/java/nu/marginalia/control/ControlMain.java +++ b/code/services-core/control-service/java/nu/marginalia/control/ControlMain.java @@ -4,6 +4,7 @@ import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; import nu.marginalia.service.MainClass; +import nu.marginalia.service.discovery.ServiceRegistryIf; import nu.marginalia.service.module.ServiceDiscoveryModule; import nu.marginalia.service.ServiceId; import nu.marginalia.service.module.ServiceConfigurationModule; @@ -25,6 +26,9 @@ public class ControlMain extends MainClass { new ServiceDiscoveryModule(), new ServiceConfigurationModule(ServiceId.Control)); + // Ensure that the service registry is initialized early + injector.getInstance(ServiceRegistryIf.class); + 
injector.getInstance(ControlMain.class); injector.getInstance(Initialization.class).setReady(); } diff --git a/code/services-core/executor-service/java/nu/marginalia/executor/ExecutorMain.java b/code/services-core/executor-service/java/nu/marginalia/executor/ExecutorMain.java index 1e524bca..e3d308fe 100644 --- a/code/services-core/executor-service/java/nu/marginalia/executor/ExecutorMain.java +++ b/code/services-core/executor-service/java/nu/marginalia/executor/ExecutorMain.java @@ -4,6 +4,7 @@ import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; import nu.marginalia.service.MainClass; +import nu.marginalia.service.discovery.ServiceRegistryIf; import nu.marginalia.service.module.ServiceDiscoveryModule; import nu.marginalia.service.ServiceId; import nu.marginalia.service.module.DatabaseModule; @@ -28,8 +29,11 @@ public class ExecutorMain extends MainClass { new ServiceDiscoveryModule(), new ServiceConfigurationModule(ServiceId.Executor) ); - injector.getInstance(NodeStatusWatcher.class); + // Ensure that the service registry is initialized early + injector.getInstance(ServiceRegistryIf.class); + + injector.getInstance(NodeStatusWatcher.class); injector.getInstance(ExecutorMain.class); injector.getInstance(Initialization.class).setReady(); } diff --git a/code/services-core/index-service/java/nu/marginalia/index/IndexMain.java b/code/services-core/index-service/java/nu/marginalia/index/IndexMain.java index a159ca63..cfde8004 100644 --- a/code/services-core/index-service/java/nu/marginalia/index/IndexMain.java +++ b/code/services-core/index-service/java/nu/marginalia/index/IndexMain.java @@ -4,6 +4,7 @@ import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; import nu.marginalia.service.MainClass; +import nu.marginalia.service.discovery.ServiceRegistryIf; import nu.marginalia.service.module.ServiceDiscoveryModule; import nu.marginalia.service.ServiceId; import 
nu.marginalia.service.module.ServiceConfigurationModule; @@ -29,6 +30,9 @@ public class IndexMain extends MainClass { new ServiceConfigurationModule(ServiceId.Index) ); + // Ensure that the service registry is initialized early + injector.getInstance(ServiceRegistryIf.class); + injector.getInstance(NodeStatusWatcher.class); injector.getInstance(IndexMain.class); diff --git a/code/services-core/query-service/java/nu/marginalia/query/QueryMain.java b/code/services-core/query-service/java/nu/marginalia/query/QueryMain.java index f08491cd..8d0c1972 100644 --- a/code/services-core/query-service/java/nu/marginalia/query/QueryMain.java +++ b/code/services-core/query-service/java/nu/marginalia/query/QueryMain.java @@ -4,6 +4,7 @@ import com.google.inject.Guice; import com.google.inject.Inject; import com.google.inject.Injector; import nu.marginalia.service.MainClass; +import nu.marginalia.service.discovery.ServiceRegistryIf; import nu.marginalia.service.module.ServiceDiscoveryModule; import nu.marginalia.service.ServiceId; import nu.marginalia.service.module.ServiceConfigurationModule; @@ -28,6 +29,9 @@ public class QueryMain extends MainClass { new ServiceConfigurationModule(ServiceId.Query) ); + // Ensure that the service registry is initialized early + injector.getInstance(ServiceRegistryIf.class); + injector.getInstance(QueryMain.class); injector.getInstance(Initialization.class).setReady(); } From 4e5f069809c0884122f274c72ecc83d461d93773 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 25 Apr 2024 15:08:56 +0200 Subject: [PATCH 88/90] (build) Migrate ssr to the new root setting schema of java lang version --- code/services-core/single-service-runner/build.gradle | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/code/services-core/single-service-runner/build.gradle b/code/services-core/single-service-runner/build.gradle index bec9eb54..47a09fed 100644 --- a/code/services-core/single-service-runner/build.gradle +++ 
b/code/services-core/single-service-runner/build.gradle @@ -7,13 +7,14 @@ plugins { application { mainClass = 'nu.marginalia.SingleService' applicationName = 'marginalia' + applicationDefaultJvmArgs = [ "--enable-preview" ] } tasks.distZip.enabled = false java { toolchain { - languageVersion.set(JavaLanguageVersion.of(22)) + languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) } } From c8ee354d0b9202c1f3e9506d0c5d76ba7476a079 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 25 Apr 2024 15:09:18 +0200 Subject: [PATCH 89/90] (log) Make log dir configurable via environment variable --- code/common/service/resources/log4j2-json.xml | 2 +- code/common/service/resources/log4j2-prod.xml | 2 +- code/common/service/resources/log4j2-test.xml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/code/common/service/resources/log4j2-json.xml b/code/common/service/resources/log4j2-json.xml index dd2fc6d2..760e04c3 100644 --- a/code/common/service/resources/log4j2-json.xml +++ b/code/common/service/resources/log4j2-json.xml @@ -7,7 +7,7 @@ - diff --git a/code/common/service/resources/log4j2-prod.xml b/code/common/service/resources/log4j2-prod.xml index 01c914ac..5dbc69b9 100644 --- a/code/common/service/resources/log4j2-prod.xml +++ b/code/common/service/resources/log4j2-prod.xml @@ -7,7 +7,7 @@ - %-5level %d{yyyy-MM-dd HH:mm:ss,SSS} %-20t %-20c{1}: %msg{nolookups}%n diff --git a/code/common/service/resources/log4j2-test.xml b/code/common/service/resources/log4j2-test.xml index 8fd0b262..2d9ddb94 100644 --- a/code/common/service/resources/log4j2-test.xml +++ b/code/common/service/resources/log4j2-test.xml @@ -6,7 +6,7 @@ - %-5level %d{yyyy-MM-dd HH:mm:ss,SSS} %-20t %-20c{1}: %msg{nolookups}%n From 89889ecbbd0c9de929ffdfc57dd8fe47837b35c6 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Thu, 25 Apr 2024 17:54:07 +0200 Subject: [PATCH 90/90] (single-service) Skip starting Prometheus if it's not explicitly enabled --- 
.../service/module/ServiceConfigurationModule.java | 6 +++++- .../nu/marginalia/service/server/MetricsServer.java | 4 ++++ .../java/nu/marginalia/SingleService.java | 12 +++++++++++- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/code/common/service/java/nu/marginalia/service/module/ServiceConfigurationModule.java b/code/common/service/java/nu/marginalia/service/module/ServiceConfigurationModule.java index 157f6673..49961c17 100644 --- a/code/common/service/java/nu/marginalia/service/module/ServiceConfigurationModule.java +++ b/code/common/service/java/nu/marginalia/service/module/ServiceConfigurationModule.java @@ -36,11 +36,15 @@ public class ServiceConfigurationModule extends AbstractModule { private int getPrometheusPort() { String prometheusPortEnv = System.getenv("WMSA_PROMETHEUS_PORT"); - if (prometheusPortEnv != null) { return Integer.parseInt(prometheusPortEnv); } + Integer prometheusPortProperty = Integer.getInteger("service.prometheus-port"); + if (prometheusPortProperty != null) { + return prometheusPortProperty; + } + return 7000; } diff --git a/code/common/service/java/nu/marginalia/service/server/MetricsServer.java b/code/common/service/java/nu/marginalia/service/server/MetricsServer.java index 7dc52d9e..dcb796b4 100644 --- a/code/common/service/java/nu/marginalia/service/server/MetricsServer.java +++ b/code/common/service/java/nu/marginalia/service/server/MetricsServer.java @@ -13,6 +13,10 @@ public class MetricsServer { @SneakyThrows @Inject public MetricsServer(ServiceConfiguration configuration) { + // If less than zero, we forego setting up a metrics server + if (configuration.metricsPort() < 0) + return; + Server server = new Server(configuration.metricsPort()); ServletContextHandler context = new ServletContextHandler(); context.setContextPath("/"); diff --git a/code/services-core/single-service-runner/java/nu/marginalia/SingleService.java b/code/services-core/single-service-runner/java/nu/marginalia/SingleService.java index 
e6ba4870..b6a8854e 100644 --- a/code/services-core/single-service-runner/java/nu/marginalia/SingleService.java +++ b/code/services-core/single-service-runner/java/nu/marginalia/SingleService.java @@ -53,13 +53,23 @@ public class SingleService { int httpPort = Integer.parseInt(bindParts[1]); int grpcPort = Integer.parseInt(bindParts[2]); - System.out.println("Configuring service with bind address: " + bindAddress + " http port: " + httpPort + " grpc port: " + grpcPort + " announce address: " + announceAddress); + System.out.println(STR.""" + Configuring service with bind address: \{bindAddress} + http port: \{httpPort} + grpc port: \{grpcPort} + announce address: \{announceAddress} + """); System.setProperty("service.bind-address", bindAddress); System.setProperty("service.http-port", Integer.toString(httpPort)); System.setProperty("service.grpc-port", Integer.toString(grpcPort)); System.setProperty("service.host", announceAddress); + // By default, disable prometheus metrics + if (System.getProperty("service.prometheus-port") == null) { + System.setProperty("service.prometheus-port", "-1"); + } + return true; } catch (NumberFormatException e) {