NOTE(review): line structure and context-line whitespace were reconstructed from a collapsed copy of this
patch, and the Java bodies were revised by hand; the "index" lines are kept from the original and no longer
match the edited post-images. Apply with whitespace tolerance if context does not match exactly.

diff --git a/marginalia_nu/build.gradle b/marginalia_nu/build.gradle
index 1be5b722..7e0884db 100644
--- a/marginalia_nu/build.gradle
+++ b/marginalia_nu/build.gradle
@@ -158,6 +158,8 @@ dependencies {
     jmh 'org.openjdk.jmh:jmh-core:1.35'
     jmh 'org.openjdk.jmh:jmh-generator-annprocess:1.35'
 
+    implementation 'net.agkn:hll:1.6.0'
+
 }
 
 configurations {
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java
index 1a8c7f8d..4fc59afe 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/ServiceDescriptor.java
@@ -82,7 +82,8 @@ public enum ServiceDescriptor {
                 new ConvertCommand(),
                 new LoadCommand(),
                 new ReindexCommand(),
-                new VersionCommand()
+                new VersionCommand(),
+                new IndexDataDumpCommand()
         ).collect(Collectors.toMap(c -> c.name, c -> c));
 
         if(args.length > 0) {
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/IndexDataDumpCommand.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/IndexDataDumpCommand.java
new file mode 100644
index 00000000..75ea02c7
--- /dev/null
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/configuration/command/IndexDataDumpCommand.java
@@ -0,0 +1,32 @@
+package nu.marginalia.wmsa.configuration.command;
+
+import lombok.SneakyThrows;
+import nu.marginalia.wmsa.edge.tools.IndexJournalDumpTool;
+
+import java.util.Arrays;
+
+/** The {@code index-dump} command; delegates to {@link IndexJournalDumpTool}. */
+public class IndexDataDumpCommand extends Command {
+    public IndexDataDumpCommand() {
+        super("index-dump");
+    }
+
+    /**
+     * Runs {@link IndexJournalDumpTool#main} with the first argument stripped.
+     *
+     * @param args the first element is dropped; the remainder must be the
+     *             sub-command followed by the index file
+     */
+    @SneakyThrows
+    @Override
+    public void execute(String... args) {
+        // args[0] is stripped below, so a sub-command plus a file needs three elements.
+        if (args.length < 3) {
+            System.err.println("Usage: index-dump dump|cardinality index.dat");
+            System.exit(255);
+        }
+
+        String[] args2 = Arrays.copyOfRange(args, 1, args.length);
+        IndexJournalDumpTool.main(args2);
+    }
+}
diff --git a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexJournalDumpTool.java b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexJournalDumpTool.java
index eebabaa2..dd61ea28 100644
--- a/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexJournalDumpTool.java
+++ b/marginalia_nu/src/main/java/nu/marginalia/wmsa/edge/tools/IndexJournalDumpTool.java
@@ -1,5 +1,7 @@
 package nu.marginalia.wmsa.edge.tools;
 
+import com.google.common.hash.Hashing;
+import net.agkn.hll.HLL;
 import nu.marginalia.util.multimap.MultimapFileLong;
 import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader;
 
@@ -8,7 +10,58 @@ import java.nio.file.Path;
 
 public class IndexJournalDumpTool {
+    /**
+     * Runs the sub-command named by {@code args[0]} on the index journal file
+     * named by {@code args[1]}; prints a usage line if either is missing or the
+     * sub-command is unknown.
+     *
+     * @param args {@code dump} or {@code cardinality}, followed by the journal file path
+     * @throws IOException propagated from opening or reading the journal file
+     */
     public static void main(String... args) throws IOException {
-        var reader = new SearchIndexJournalReader(MultimapFileLong.forReading(Path.of(args[0])));
+        // Both sub-commands read args[1]; with no file operand, fall through to the
+        // usage line instead of failing with ArrayIndexOutOfBoundsException.
+        final String operation = args.length > 1 ? args[0] : "help";
+
+        switch (operation) {
+            case "dump":
+                dump(Path.of(args[1]));
+                break;
+            case "cardinality":
+                cardinality(Path.of(args[1]));
+                break;
+            default:
+                System.err.println("Usage: dump|cardinality index-file.dat");
+                break;
+        }
+    }
+
+    /**
+     * Prints an estimate of the number of distinct document ids in the journal.
+     *
+     * @param file path of the index journal file
+     * @throws IOException propagated from opening or reading the journal file
+     */
+    private static void cardinality(Path file) throws IOException {
+        var reader = new SearchIndexJournalReader(MultimapFileLong.forReading(file));
+        // HLL(log2m, regwidth) per the java-hll constructor; values kept as authored.
+        HLL hyperloglog = new HLL(30, 1);
+        var hashFunction = Hashing.murmur3_128();
+
+        for (var entry : reader) {
+            hyperloglog.addRaw(hashFunction.hashLong(entry.docId()).padToLong());
+        }
+
+        System.out.println(hyperloglog.cardinality());
+    }
+
+    /**
+     * Prints one line per journal entry: block, document id, and domainId:urlId.
+     *
+     * @param file path of the index journal file
+     * @throws IOException propagated from opening or reading the journal file
+     */
+    private static void dump(Path file) throws IOException {
+        var reader = new SearchIndexJournalReader(MultimapFileLong.forReading(file));
         for (var entry : reader) {
             System.out.printf("%s\t%010d\t%06d:%08d\n", entry.block(), entry.docId(), entry.domainId(), entry.urlId());
         }
diff --git a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java
index 597341bc..79946d82 100644
--- a/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java
+++ b/marginalia_nu/src/test/java/nu/marginalia/wmsa/edge/search/query/QueryVariantsTest.java
@@ -36,6 +36,7 @@ class QueryVariantsTest {
     @Test
     void getQueryVariants() {
         System.out.println(se.extractSentence("we are alone"));
+        testCase("inside job reviews");
         testCase("DOS");
         testCase("dos");
         testCase("we are alone");