mirror of
https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-24 05:18:58 +00:00
HyperLogLog-tool for figuring out how big the index is.
This commit is contained in:
parent
17226bc4fd
commit
6df02f7528
@ -158,6 +158,8 @@ dependencies {
|
|||||||
jmh 'org.openjdk.jmh:jmh-core:1.35'
|
jmh 'org.openjdk.jmh:jmh-core:1.35'
|
||||||
jmh 'org.openjdk.jmh:jmh-generator-annprocess:1.35'
|
jmh 'org.openjdk.jmh:jmh-generator-annprocess:1.35'
|
||||||
|
|
||||||
|
implementation 'net.agkn:hll:1.6.0'
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
configurations {
|
configurations {
|
||||||
|
@ -82,7 +82,8 @@ public enum ServiceDescriptor {
|
|||||||
new ConvertCommand(),
|
new ConvertCommand(),
|
||||||
new LoadCommand(),
|
new LoadCommand(),
|
||||||
new ReindexCommand(),
|
new ReindexCommand(),
|
||||||
new VersionCommand()
|
new VersionCommand(),
|
||||||
|
new IndexDataDumpCommand()
|
||||||
).collect(Collectors.toMap(c -> c.name, c -> c));
|
).collect(Collectors.toMap(c -> c.name, c -> c));
|
||||||
|
|
||||||
if(args.length > 0) {
|
if(args.length > 0) {
|
||||||
|
@ -0,0 +1,24 @@
|
|||||||
|
package nu.marginalia.wmsa.configuration.command;
|
||||||
|
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
import nu.marginalia.wmsa.edge.tools.IndexJournalDumpTool;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
public class IndexDataDumpCommand extends Command {
|
||||||
|
public IndexDataDumpCommand() {
|
||||||
|
super("index-dump");
|
||||||
|
}
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
@Override
|
||||||
|
public void execute(String... args) {
|
||||||
|
if (args.length < 1) {
|
||||||
|
System.err.println("Usage: index-dump [sub-command] index.dat");
|
||||||
|
System.exit(255);
|
||||||
|
}
|
||||||
|
|
||||||
|
String[] args2 = Arrays.copyOfRange(args, 1, args.length);
|
||||||
|
IndexJournalDumpTool.main(args2);
|
||||||
|
}
|
||||||
|
}
|
@ -1,5 +1,7 @@
|
|||||||
package nu.marginalia.wmsa.edge.tools;
|
package nu.marginalia.wmsa.edge.tools;
|
||||||
|
|
||||||
|
import com.google.common.hash.Hashing;
|
||||||
|
import net.agkn.hll.HLL;
|
||||||
import nu.marginalia.util.multimap.MultimapFileLong;
|
import nu.marginalia.util.multimap.MultimapFileLong;
|
||||||
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader;
|
import nu.marginalia.wmsa.edge.index.journal.SearchIndexJournalReader;
|
||||||
|
|
||||||
@ -8,7 +10,36 @@ import java.nio.file.Path;
|
|||||||
|
|
||||||
public class IndexJournalDumpTool {
|
public class IndexJournalDumpTool {
|
||||||
public static void main(String... args) throws IOException {
|
public static void main(String... args) throws IOException {
|
||||||
var reader = new SearchIndexJournalReader(MultimapFileLong.forReading(Path.of(args[0])));
|
final String operation = args.length > 0 ? args[0] : "help";
|
||||||
|
|
||||||
|
switch (operation) {
|
||||||
|
case "dump":
|
||||||
|
dump(Path.of(args[1]));
|
||||||
|
break;
|
||||||
|
case "cardinality":
|
||||||
|
cardinality(Path.of(args[1]));
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
System.err.println("Usage: dump|cardinality index-file.dat");
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void cardinality(Path file) throws IOException {
|
||||||
|
var reader = new SearchIndexJournalReader(MultimapFileLong.forReading(file));
|
||||||
|
HLL hyperloglog = new HLL(30, 1);
|
||||||
|
var hashFunction = Hashing.murmur3_128();
|
||||||
|
|
||||||
|
for (var entry : reader) {
|
||||||
|
hyperloglog.addRaw(hashFunction.hashLong(entry.docId()).padToLong());
|
||||||
|
}
|
||||||
|
|
||||||
|
System.out.println(hyperloglog.cardinality());
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void dump(Path file) throws IOException {
|
||||||
|
var reader = new SearchIndexJournalReader(MultimapFileLong.forReading(file));
|
||||||
for (var entry : reader) {
|
for (var entry : reader) {
|
||||||
System.out.printf("%s\t%010d\t%06d:%08d\n", entry.block(), entry.docId(), entry.domainId(), entry.urlId());
|
System.out.printf("%s\t%010d\t%06d:%08d\n", entry.block(), entry.docId(), entry.domainId(), entry.urlId());
|
||||||
}
|
}
|
||||||
|
@ -36,6 +36,7 @@ class QueryVariantsTest {
|
|||||||
@Test
|
@Test
|
||||||
void getQueryVariants() {
|
void getQueryVariants() {
|
||||||
System.out.println(se.extractSentence("we are alone"));
|
System.out.println(se.extractSentence("we are alone"));
|
||||||
|
testCase("inside job reviews");
|
||||||
testCase("DOS");
|
testCase("DOS");
|
||||||
testCase("dos");
|
testCase("dos");
|
||||||
testCase("we are alone");
|
testCase("we are alone");
|
||||||
|
Loading…
Reference in New Issue
Block a user