diff --git a/build.gradle b/build.gradle index dad52fa3..f2307cb5 100644 --- a/build.gradle +++ b/build.gradle @@ -6,7 +6,7 @@ plugins { // This is a workaround for a bug in the Jib plugin that causes it to stall randomly // https://github.com/GoogleContainerTools/jib/issues/3347 - id 'com.google.cloud.tools.jib' version '3.4.2' apply(false) + id 'com.google.cloud.tools.jib' version '3.4.3' apply(false) } group 'marginalia' @@ -44,10 +44,11 @@ subprojects.forEach {it -> } ext { - jvmVersion=21 - dockerImageBase='container-registry.oracle.com/graalvm/jdk:21@sha256:1fd33d4d4eba3a9e1a41a728e39ea217178d257694eea1214fec68d2ed4d3d9b' + jvmVersion=22 + dockerImageBase='container-registry.oracle.com/graalvm/jdk:22' dockerImageTag='latest' dockerImageRegistry='marginalia' + jibVersion = '3.4.3' } idea { diff --git a/code/common/config/build.gradle b/code/common/config/build.gradle index d3628671..e78e8a9c 100644 --- a/code/common/config/build.gradle +++ b/code/common/config/build.gradle @@ -33,6 +33,7 @@ dependencies { testImplementation project(':code:libraries:test-helpers') testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4') + testImplementation libs.commons.codec testImplementation 'org.testcontainers:mariadb:1.17.4' testImplementation 'org.testcontainers:junit-jupiter:1.17.4' testImplementation project(':code:libraries:test-helpers') diff --git a/code/common/db/build.gradle b/code/common/db/build.gradle index 4f32b50d..f0fe081f 100644 --- a/code/common/db/build.gradle +++ b/code/common/db/build.gradle @@ -54,6 +54,7 @@ dependencies { testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4') + testImplementation libs.commons.codec testImplementation 'org.testcontainers:mariadb:1.17.4' testImplementation 'org.testcontainers:junit-jupiter:1.17.4' testImplementation project(':code:libraries:test-helpers') diff --git a/code/common/linkdb/build.gradle b/code/common/linkdb/build.gradle index 1d6d34d0..b95743f6 100644 --- a/code/common/linkdb/build.gradle +++ b/code/common/linkdb/build.gradle @@ -41,6 +41,7 @@ dependencies { testImplementation libs.mockito testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4') + testImplementation libs.commons.codec testImplementation 'org.testcontainers:mariadb:1.17.4' testImplementation 'org.testcontainers:junit-jupiter:1.17.4' testImplementation project(':code:libraries:test-helpers') diff --git a/code/common/linkdb/java/nu/marginalia/linkdb/docs/DocumentDbReader.java b/code/common/linkdb/java/nu/marginalia/linkdb/docs/DocumentDbReader.java index ba48f3ec..5b6112fe 100644 --- a/code/common/linkdb/java/nu/marginalia/linkdb/docs/DocumentDbReader.java +++ b/code/common/linkdb/java/nu/marginalia/linkdb/docs/DocumentDbReader.java @@ -22,6 +22,12 @@ import java.sql.SQLException; import java.util.ArrayList; import java.util.List; +/** Reads the document database, which is a SQLite database + * containing the URLs and metadata of the documents in the + * index. + *
* <p></p>
+ * The database is created by the DocumentDbWriter class. + * */ @Singleton public class DocumentDbReader { private final Path dbFile; @@ -52,6 +58,11 @@ public class DocumentDbReader { } } + /** Switches the input database file to a new file. + *
<p></p>
+ * This is used to switch over to a new database file + * when the index is re-indexed. + * */ public void switchInput(Path newDbFile) throws IOException, SQLException { if (!Files.isRegularFile(newDbFile)) { logger.error("Source is not a file, refusing switch-over {}", newDbFile); @@ -78,35 +89,11 @@ public class DocumentDbReader { connection = createConnection(); } - public List getUrlsFromDomain(int domainId) throws SQLException { - if (connection == null || - connection.isClosed()) - { - throw new RuntimeException("URL query temporarily unavailable due to database switch"); - } - - long minId = UrlIdCodec.encodeId(domainId, 0); - long maxId = UrlIdCodec.encodeId(domainId+1, 0); - - List ret = new ArrayList<>(); - - try (var stmt = connection.prepareStatement(""" - SELECT URL - FROM DOCUMENT - WHERE ID >= ? AND ID < ? - """)) - { - stmt.setLong(1, minId); - stmt.setLong(2, maxId); - var rs = stmt.executeQuery(); - while (rs.next()) { - ret.add(rs.getString(1)); - } - } - - return ret; - } - + /** Returns the URL details for the given document ids. + *
<p></p>
+ * This is used to get the URL details for the search + * results. + * */ public List getUrlDetails(TLongList ids) throws SQLException { List ret = new ArrayList<>(ids.size()); diff --git a/code/common/linkdb/java/nu/marginalia/linkdb/docs/DocumentDbWriter.java b/code/common/linkdb/java/nu/marginalia/linkdb/docs/DocumentDbWriter.java index e843e826..faa98bf5 100644 --- a/code/common/linkdb/java/nu/marginalia/linkdb/docs/DocumentDbWriter.java +++ b/code/common/linkdb/java/nu/marginalia/linkdb/docs/DocumentDbWriter.java @@ -9,6 +9,10 @@ import java.sql.DriverManager; import java.sql.SQLException; import java.util.List; +/** Writes the document database, which is a SQLite database + * containing the URLs and metadata of the documents in the + * index. + * */ public class DocumentDbWriter { private final Connection connection; diff --git a/code/common/model/build.gradle b/code/common/model/build.gradle index a424efca..3b9d87c3 100644 --- a/code/common/model/build.gradle +++ b/code/common/model/build.gradle @@ -14,6 +14,7 @@ apply from: "$rootProject.projectDir/srcsets.gradle" dependencies { implementation project(':code:libraries:braille-block-punch-cards') + implementation project(':code:libraries:coded-sequence') implementation libs.bundles.slf4j diff --git a/code/common/model/java/nu/marginalia/model/id/UrlIdCodec.java b/code/common/model/java/nu/marginalia/model/id/UrlIdCodec.java index 26ac847e..f9514c3a 100644 --- a/code/common/model/java/nu/marginalia/model/id/UrlIdCodec.java +++ b/code/common/model/java/nu/marginalia/model/id/UrlIdCodec.java @@ -37,9 +37,24 @@ public class UrlIdCodec { domainId &= 0x7FFF_FFFF; documentOrdinal &= 0x03FF_FFFF; + assert (domainId & 0x7FFF_FFFF) == domainId : "Domain id must be in [0, 2^31-1], was " + domainId; + assert (documentOrdinal & 0x03FF_FFFF) == documentOrdinal : "Document ordinal must be in [0, 2^26-1], was " + documentOrdinal; + return ((long) domainId << 26) | documentOrdinal; } + /** Encode a URL id with a ranking element */ + public static long encodeId(int rank, int domainId, int documentOrdinal) { + assert (rank & 0x3F) == rank : "Rank must be in [0, 63], was " + rank; + assert (domainId & 0x7FFF_FFFF) == domainId : "Domain id must be in [0, 2^31-1], was " + domainId; + assert (documentOrdinal & 0x03FF_FFFF) == documentOrdinal : "Document ordinal must be in [0, 2^26-1], was " + documentOrdinal; + + domainId &= 0x7FFF_FFFF; + documentOrdinal &= 0x03FF_FFFF; + rank &= 0x3F; + + return ((long) rank << 57) | ((long) domainId << 26) | documentOrdinal; + } /** Add a ranking element to an existing combined URL id. 
* * @param rank [0,1] the importance of the domain, low is good @@ -67,7 +82,7 @@ public class UrlIdCodec { /** Extract the document ordinal component from this URL id */ public static int getRank(long combinedId) { - return (int) (combinedId >>> 57); + return (int) (combinedId >>> 57) & 0x3F; } /** Mask out the ranking element from this URL id */ diff --git a/code/common/model/java/nu/marginalia/model/idx/CodedWordSpan.java b/code/common/model/java/nu/marginalia/model/idx/CodedWordSpan.java new file mode 100644 index 00000000..5be7dcdb --- /dev/null +++ b/code/common/model/java/nu/marginalia/model/idx/CodedWordSpan.java @@ -0,0 +1,6 @@ +package nu.marginalia.model.idx; + +import nu.marginalia.sequence.VarintCodedSequence; + +public record CodedWordSpan(byte code, VarintCodedSequence spans) { +} diff --git a/code/common/model/java/nu/marginalia/model/idx/WordFlags.java b/code/common/model/java/nu/marginalia/model/idx/WordFlags.java index db54df77..77baed4c 100644 --- a/code/common/model/java/nu/marginalia/model/idx/WordFlags.java +++ b/code/common/model/java/nu/marginalia/model/idx/WordFlags.java @@ -4,16 +4,12 @@ package nu.marginalia.model.idx; import java.util.EnumSet; public enum WordFlags { - /** Word appears in title */ Title, /** Word appears to be the subject in several sentences */ Subjects, - /** Word has high tf-idf */ - TfIdfHigh, - /** Word is a likely named object. This is a weaker version of Subjects. */ NamesWords, @@ -42,19 +38,27 @@ public enum WordFlags { ExternalLink ; - public int asBit() { - return 1 << ordinal(); + public byte asBit() { + return (byte) (1 << ordinal()); } - public boolean isPresent(long value) { + public boolean isPresent(byte value) { return (asBit() & value) > 0; } - public boolean isAbsent(long value) { + public boolean isAbsent(byte value) { return (asBit() & value) == 0; } - public static EnumSet decode(long encodedValue) { + public static byte encode(EnumSet flags) { + byte ret = 0; + for (WordFlags f : flags) { + ret |= f.asBit(); + } + return ret; + } + + public static EnumSet decode(byte encodedValue) { EnumSet ret = EnumSet.noneOf(WordFlags.class); for (WordFlags f : values()) { diff --git a/code/common/model/java/nu/marginalia/model/idx/WordMetadata.java b/code/common/model/java/nu/marginalia/model/idx/WordMetadata.java deleted file mode 100644 index 1f1add44..00000000 --- a/code/common/model/java/nu/marginalia/model/idx/WordMetadata.java +++ /dev/null @@ -1,89 +0,0 @@ -package nu.marginalia.model.idx; - - -import nu.marginalia.bbpc.BrailleBlockPunchCards; - -import java.util.EnumSet; -import java.util.Set; - -/** Word level metadata designed to fit in a single 64 bit long. 
- * - * @param positions bitmask of term positions within the document - * @param flags word flags (see {@link WordFlags}) - */ -public record WordMetadata(long positions, - int flags) { - - public static final long FLAGS_MASK = (1L << WordFlags.values().length) - 1; - public static final int POSITIONS_COUNT = 64 - WordFlags.values().length; - public static final int POSITIONS_SHIFT = WordFlags.values().length; - public static final long POSITIONS_MASK = ~0L >>> POSITIONS_SHIFT; - - - - public WordMetadata() { - this(emptyValue()); - } - - public WordMetadata(long value) { - this( - ((value >>> POSITIONS_SHIFT) & POSITIONS_MASK), - (int)(value & FLAGS_MASK) - ); - } - - public WordMetadata(long positions, - Set flags) - { - this(positions, encodeFlags(flags)); - } - - private static int encodeFlags(Set flags) { - int ret = 0; - for (var flag : flags) { ret |= flag.asBit(); } - return ret; - } - - public static boolean hasFlags(long encoded, long metadataBitMask) { - return (encoded & metadataBitMask) == metadataBitMask; - } - public static boolean hasAnyFlags(long encoded, long metadataBitMask) { - return (encoded & metadataBitMask) != 0; - } - public static long decodePositions(long meta) { - return (meta >>> POSITIONS_SHIFT) & POSITIONS_MASK; - } - - public boolean hasFlag(WordFlags flag) { - return (flags & flag.asBit()) != 0; - } - - public String toString() { - return "[positions=%s; %s]".formatted(BrailleBlockPunchCards.printBits(positions, 56), flagSet()); - } - - /* Encoded in a 64 bit long - */ - public long encode() { - long ret = 0; - - ret |= Integer.toUnsignedLong(flags) & FLAGS_MASK; - ret |= (positions & POSITIONS_MASK) << POSITIONS_SHIFT; - - return ret; - } - - public boolean isEmpty() { - return positions == 0 && flags == 0; - } - - public static long emptyValue() { - return 0L; - } - - - public EnumSet flagSet() { - return WordFlags.decode(flags); - } - -} diff --git a/code/common/model/java/nu/marginalia/util/QueryParams.java b/code/common/model/java/nu/marginalia/util/QueryParams.java index ce970d2f..1869c102 100644 --- a/code/common/model/java/nu/marginalia/util/QueryParams.java +++ b/code/common/model/java/nu/marginalia/util/QueryParams.java @@ -10,7 +10,6 @@ import java.util.StringJoiner; public class QueryParams { - @Nullable public static String queryParamsSanitizer(String path, @Nullable String queryParams) { if (queryParams == null) { diff --git a/code/common/model/java/nu/marginalia/util/StringPool.java b/code/common/model/java/nu/marginalia/util/StringPool.java deleted file mode 100644 index 6d7ea8b5..00000000 --- a/code/common/model/java/nu/marginalia/util/StringPool.java +++ /dev/null @@ -1,70 +0,0 @@ -package nu.marginalia.util; - -import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.Objects; - -public class StringPool { - - private final HashMap words; - private final Object2LongOpenHashMap ages; - private final int maxCap; - - long idx; - - private StringPool(int capacity, int maxCap) { - this.ages = new Object2LongOpenHashMap<>(capacity); - this.words = new HashMap<>(capacity); - this.maxCap = maxCap; - } - - public static StringPool create(int capacity) { - return new StringPool(capacity, capacity * 10); - } - - public String internalize(String str) { - prune(); - - final String ret = words.putIfAbsent(str, str); - ages.put(ret, idx++); - - return Objects.requireNonNullElse(ret, str); - } - - public String[] internalize(String[] str) { - - for (int i = 0; i < str.length; 
i++) { - str[i] = internalize(str[i]); - } - - return str; - } - - public void prune() { - - if (words.size() < maxCap) - return; - - long[] ageValues = ages.values().toLongArray(); - Arrays.sort(ageValues); - - long cutoff = ageValues[ageValues.length - maxCap / 10]; - - words.clear(); - ages.forEach((word, cnt) -> { - if (cnt >= cutoff) { - words.put(word, word); - } - }); - ages.clear(); - words.forEach((w,w2) -> { - ages.put(w, idx); - }); - } - - public void flush() { - words.clear(); - } -} diff --git a/code/common/model/readme.md b/code/common/model/readme.md index d07bb4fa..60457102 100644 --- a/code/common/model/readme.md +++ b/code/common/model/readme.md @@ -8,5 +8,4 @@ This package contains common models to the search engine * [EdgeUrl](java/nu/marginalia/model/EdgeUrl.java) * [DocumentMetadata](java/nu/marginalia/model/idx/DocumentMetadata.java) * [DocumentFlags](java/nu/marginalia/model/idx/DocumentFlags.java) -* [WordMetadata](java/nu/marginalia/model/idx/WordMetadata.java) * [WordFlags](java/nu/marginalia/model/idx/WordFlags.java) \ No newline at end of file diff --git a/code/common/model/test/nu/marginalia/model/WordMetadataTest.java b/code/common/model/test/nu/marginalia/model/WordMetadataTest.java deleted file mode 100644 index 6de3179b..00000000 --- a/code/common/model/test/nu/marginalia/model/WordMetadataTest.java +++ /dev/null @@ -1,41 +0,0 @@ -package nu.marginalia.model; - -import nu.marginalia.bbpc.BrailleBlockPunchCards; -import nu.marginalia.model.idx.WordFlags; -import nu.marginalia.model.idx.WordMetadata; -import org.junit.jupiter.api.Test; - -import java.util.EnumSet; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -class WordMetadataTest { - - @Test - public void codecTest() { - verifyCodec("Vanilla case", new WordMetadata(0x7f0f0000L, EnumSet.allOf(WordFlags.class))); - verifyCodec("Position 32bit", new WordMetadata(0xff0f0000L, EnumSet.allOf(WordFlags.class))); - verifyCodec("Position all", new WordMetadata(0xffff_ff0f_0000L, EnumSet.allOf(WordFlags.class))); - verifyCodec("No flags", new WordMetadata( 0xff0f0000L, EnumSet.noneOf(WordFlags.class))); - verifyCodec("No flags, some bits", new WordMetadata(0x3f_7f7f_7f7f_7f7fL, EnumSet.noneOf(WordFlags.class))); - verifyCodec("No flags, all bits", new WordMetadata( 0x3f_ffff_ffff_ffffL, EnumSet.noneOf(WordFlags.class))); - verifyCodec("All flags, all bits", new WordMetadata( 0x3f_ffff_ffff_ffffL, EnumSet.allOf(WordFlags.class))); - System.out.println(new WordMetadata(0x7f0f0005L, EnumSet.allOf(WordFlags.class))); - System.out.println(new WordMetadata(0xff0f0013L, EnumSet.noneOf(WordFlags.class))); - System.out.println(new WordMetadata(0xf0f000ff0f0013L, EnumSet.allOf(WordFlags.class))); - System.out.println(new WordMetadata(0xf0f000ff0f0013L, (byte)-1)); - System.out.println(new WordMetadata(0x3f_ffff_ffff_ffffL, (byte)0)); - System.out.println(new WordMetadata(0x3f_ffff_ffff_ffffL, (byte)0)); - System.out.println(BrailleBlockPunchCards.printBits(new WordMetadata(~0L, (byte) 0).encode(), 64)); - System.out.println(BrailleBlockPunchCards.printBits(new WordMetadata(0, (byte) 0xff).encode(), 64)); - System.out.println(BrailleBlockPunchCards.printBits(131973L, 64)); - System.out.println(new WordMetadata(131973L)); - } - - public void verifyCodec(String message, WordMetadata data) { - System.out.println(BrailleBlockPunchCards.printBits(data.encode(), 64)); - assertEquals(data, new WordMetadata(data.encode()), message); - } - - -} \ No newline at end of file diff --git 
a/code/common/process/java/nu/marginalia/process/control/FakeProcessHeartbeat.java b/code/common/process/java/nu/marginalia/process/control/FakeProcessHeartbeat.java index 619dd101..95d4345b 100644 --- a/code/common/process/java/nu/marginalia/process/control/FakeProcessHeartbeat.java +++ b/code/common/process/java/nu/marginalia/process/control/FakeProcessHeartbeat.java @@ -1,13 +1,18 @@ package nu.marginalia.process.control; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + /** Dummy implementation of ProcessHeartbeat that does nothing */ public class FakeProcessHeartbeat implements ProcessHeartbeat { - + private static final Logger logger = LoggerFactory.getLogger(FakeProcessHeartbeat.class); @Override public <T extends Enum<T>> ProcessTaskHeartbeat<T> createProcessTaskHeartbeat(Class<T> steps, String processName) { return new ProcessTaskHeartbeat<>() { @Override - public void progress(T step) {} + public void progress(T step) { + logger.info("Progress: {}", step); + } @Override public void shutDown() {} @@ -21,7 +26,9 @@ public class FakeProcessHeartbeat { public ProcessAdHocTaskHeartbeat createAdHocTaskHeartbeat(String processName) { return new ProcessAdHocTaskHeartbeat() { @Override - public void progress(String step, int progress, int total) {} + public void progress(String step, int progress, int total) { + logger.info("Progress: {}, {}/{}", step, progress, total); + } @Override public void close() {} diff --git a/code/common/service/build.gradle b/code/common/service/build.gradle index 8cc9583e..4b2b4f1d 100644 --- a/code/common/service/build.gradle +++ b/code/common/service/build.gradle @@ -46,6 +46,7 @@ dependencies { implementation libs.bundles.mariadb testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4') + testImplementation libs.commons.codec testImplementation 'org.testcontainers:mariadb:1.17.4' testImplementation 'org.testcontainers:junit-jupiter:1.17.4' testImplementation project(':code:libraries:test-helpers') diff --git a/code/common/service/java/nu/marginalia/service/client/GrpcMultiNodeChannelPool.java b/code/common/service/java/nu/marginalia/service/client/GrpcMultiNodeChannelPool.java index d4f75e66..de74adb4 100644 --- a/code/common/service/java/nu/marginalia/service/client/GrpcMultiNodeChannelPool.java +++ b/code/common/service/java/nu/marginalia/service/client/GrpcMultiNodeChannelPool.java @@ -64,6 +64,11 @@ public class GrpcMultiNodeChannelPool<STUB> { return nodeConfigurationWatcher.getQueryNodes(); } + /** Return the number of nodes that are eligible for broadcast-style requests */ + public int getNumNodes() { + return nodeConfigurationWatcher.getQueryNodes().size(); + } + /** Create a new call builder for the given method. This is a fluent-style * method, where you can chain calls to specify how to run the method. *
<p></p>
diff --git a/code/common/service/resources/log4j2-test.xml b/code/common/service/resources/log4j2-test.xml index 0181775c..6f67fb7f 100644 --- a/code/common/service/resources/log4j2-test.xml +++ b/code/common/service/resources/log4j2-test.xml @@ -2,22 +2,7 @@ - - - - - - %-5level %d{yyyy-MM-dd HH:mm:ss,SSS} %-20t %-20c{1}: %msg{nolookups}%n - - - - - - - - diff --git a/code/execution/build.gradle b/code/execution/build.gradle index 973f13c9..ae22f2ea 100644 --- a/code/execution/build.gradle +++ b/code/execution/build.gradle @@ -38,15 +38,13 @@ dependencies { implementation project(':code:functions:search-query') implementation project(':code:execution:api') - implementation project(':code:process-models:crawl-spec') - implementation project(':code:process-models:crawling-model') - implementation project(':code:features-crawl:link-parser') - implementation project(':code:features-convert:data-extractors') - implementation project(':code:features-convert:stackexchange-xml') - implementation project(':code:features-convert:reddit-json') + implementation project(':code:processes:crawling-process:model') + implementation project(':code:processes:crawling-process:model') + implementation project(':code:processes:crawling-process:ft-link-parser') + implementation project(':code:execution:data-extractors') implementation project(':code:index:index-journal') implementation project(':code:index:api') - implementation project(':code:process-mqapi') + implementation project(':code:processes:process-mq-api') implementation project(':third-party:encyclopedia-marginalia-nu') implementation libs.bundles.slf4j @@ -84,6 +82,7 @@ dependencies { testImplementation libs.mockito testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4') + testImplementation libs.commons.codec testImplementation 'org.testcontainers:mariadb:1.17.4' testImplementation 'org.testcontainers:junit-jupiter:1.17.4' testImplementation project(':code:libraries:test-helpers') diff --git a/code/features-convert/data-extractors/build.gradle b/code/execution/data-extractors/build.gradle similarity index 84% rename from code/features-convert/data-extractors/build.gradle rename to code/execution/data-extractors/build.gradle index f8841120..2a0c08c6 100644 --- a/code/features-convert/data-extractors/build.gradle +++ b/code/execution/data-extractors/build.gradle @@ -22,9 +22,9 @@ dependencies { implementation project(':code:libraries:language-processing') implementation project(':code:libraries:term-frequency-dict') implementation project(':code:libraries:blocking-thread-pool') - implementation project(':code:features-crawl:link-parser') - implementation project(':code:features-convert:anchor-keywords') - implementation project(':code:process-models:crawling-model') + implementation project(':code:processes:crawling-process:ft-link-parser') + implementation project(':code:processes:converting-process:ft-anchor-keywords') + implementation project(':code:processes:crawling-process:model') implementation project(':code:processes:converting-process') implementation project(':third-party:commons-codec') diff --git a/code/features-convert/data-extractors/java/nu/marginalia/extractor/AtagExporter.java b/code/execution/data-extractors/java/nu/marginalia/extractor/AtagExporter.java similarity index 97% rename from code/features-convert/data-extractors/java/nu/marginalia/extractor/AtagExporter.java rename to code/execution/data-extractors/java/nu/marginalia/extractor/AtagExporter.java index acc3a417..d2f2c91b 100644 --- 
a/code/features-convert/data-extractors/java/nu/marginalia/extractor/AtagExporter.java +++ b/code/execution/data-extractors/java/nu/marginalia/extractor/AtagExporter.java @@ -3,13 +3,13 @@ package nu.marginalia.extractor; import com.google.inject.Inject; import gnu.trove.set.hash.TLongHashSet; import lombok.SneakyThrows; -import nu.marginalia.crawling.io.CrawledDomainReader; -import nu.marginalia.crawling.io.SerializableCrawlDataStream; -import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.hash.MurmurHash3_128; +import nu.marginalia.io.crawldata.CrawledDomainReader; +import nu.marginalia.io.crawldata.SerializableCrawlDataStream; import nu.marginalia.link_parser.LinkParser; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawldata.CrawledDocument; import nu.marginalia.process.log.WorkLog; import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.model.FileStorage; diff --git a/code/features-convert/data-extractors/java/nu/marginalia/extractor/ExporterIf.java b/code/execution/data-extractors/java/nu/marginalia/extractor/ExporterIf.java similarity index 100% rename from code/features-convert/data-extractors/java/nu/marginalia/extractor/ExporterIf.java rename to code/execution/data-extractors/java/nu/marginalia/extractor/ExporterIf.java diff --git a/code/features-convert/data-extractors/java/nu/marginalia/extractor/FeedExporter.java b/code/execution/data-extractors/java/nu/marginalia/extractor/FeedExporter.java similarity index 96% rename from code/features-convert/data-extractors/java/nu/marginalia/extractor/FeedExporter.java rename to code/execution/data-extractors/java/nu/marginalia/extractor/FeedExporter.java index fa925b39..547b810b 100644 --- a/code/features-convert/data-extractors/java/nu/marginalia/extractor/FeedExporter.java +++ b/code/execution/data-extractors/java/nu/marginalia/extractor/FeedExporter.java @@ -2,13 +2,13 @@ package nu.marginalia.extractor; import com.google.inject.Inject; import lombok.SneakyThrows; -import nu.marginalia.crawling.io.CrawledDomainReader; -import nu.marginalia.crawling.io.SerializableCrawlDataStream; -import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.io.crawldata.CrawledDomainReader; +import nu.marginalia.io.crawldata.SerializableCrawlDataStream; import nu.marginalia.link_parser.FeedExtractor; import nu.marginalia.link_parser.LinkParser; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawldata.CrawledDocument; import nu.marginalia.process.log.WorkLog; import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.model.FileStorage; diff --git a/code/features-convert/data-extractors/java/nu/marginalia/extractor/SampleDataExporter.java b/code/execution/data-extractors/java/nu/marginalia/extractor/SampleDataExporter.java similarity index 100% rename from code/features-convert/data-extractors/java/nu/marginalia/extractor/SampleDataExporter.java rename to code/execution/data-extractors/java/nu/marginalia/extractor/SampleDataExporter.java diff --git a/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java b/code/execution/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java similarity index 77% rename from code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java rename to code/execution/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java index 
3bcc9cf2..2545d666 100644 --- a/code/features-convert/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java +++ b/code/execution/data-extractors/java/nu/marginalia/extractor/TermFrequencyExporter.java @@ -5,11 +5,11 @@ import gnu.trove.map.hash.TLongIntHashMap; import gnu.trove.set.hash.TLongHashSet; import nu.marginalia.WmsaHome; import nu.marginalia.converting.processor.logic.dom.DomPruningFilter; -import nu.marginalia.crawling.io.CrawledDomainReader; -import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.io.crawldata.CrawledDomainReader; import nu.marginalia.language.filter.LanguageFilter; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.model.crawldata.CrawledDocument; import nu.marginalia.process.log.WorkLog; import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.model.FileStorage; @@ -27,7 +27,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardCopyOption; import java.nio.file.attribute.PosixFilePermissions; -import java.util.concurrent.ForkJoinPool; +import java.util.concurrent.ThreadLocalRandom; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; @@ -97,8 +97,13 @@ public class TermFrequencyExporter implements ExporterIf { } - private void processFile(Path crawlDataPath, TLongIntHashMap counts, AtomicInteger docCount, SentenceExtractor se) { - TLongHashSet words = new TLongHashSet(10_000); + private void processFile(Path crawlDataPath, + TLongIntHashMap counts, + AtomicInteger docCount, + SentenceExtractor se) + { + TLongHashSet words = new TLongHashSet(1000); + try (var stream = CrawledDomainReader.createDataStream(crawlDataPath)) { while (stream.hasNext()) { if (Thread.interrupted()) @@ -120,19 +125,33 @@ public class TermFrequencyExporter implements ExporterIf { return; } - for (var sent : dld.sentences) { + for (var sent : dld) { + // Skip sentences with non-language tags, e.g. program code + if (sent.htmlTags.stream().anyMatch(t -> t.nonLanguage)) + continue; + for (var word : sent) { words.add(longHash(word.stemmed().getBytes(StandardCharsets.UTF_8))); } - - for (var ngram : sent.ngramStemmed) { - words.add(longHash(ngram.getBytes())); - } } + var random = ThreadLocalRandom.current(); synchronized (counts) { words.forEach(w -> { - counts.adjustOrPutValue(w, 1, 1); + // Mathematicians hate him for this one weird trick: + // + // We generally aren't interested in low-frequency entries, + // but due to zipf's law, there are a lot of them, in fact + // almost the entire term frequency dictionary is full of them. 
+ // + // So we use a simple statistical trick to reduce the number + // of nearly unique entries in the dictionary, while still keeping the + // distribution of higher-frequency entries relatively intact + + if (random.nextDouble() < 0.2) { + counts.adjustOrPutValue(w, 5, 5); + } + return true; }); } diff --git a/code/features-convert/data-extractors/readme.md b/code/execution/data-extractors/readme.md similarity index 100% rename from code/features-convert/data-extractors/readme.md rename to code/execution/data-extractors/readme.md diff --git a/code/execution/java/nu/marginalia/actor/task/ConvertAndLoadActor.java b/code/execution/java/nu/marginalia/actor/task/ConvertAndLoadActor.java index 085dffed..b4446199 100644 --- a/code/execution/java/nu/marginalia/actor/task/ConvertAndLoadActor.java +++ b/code/execution/java/nu/marginalia/actor/task/ConvertAndLoadActor.java @@ -6,19 +6,11 @@ import com.google.inject.Singleton; import lombok.AllArgsConstructor; import lombok.NoArgsConstructor; import lombok.With; +import nu.marginalia.IndexLocations; import nu.marginalia.actor.prototype.RecordActorPrototype; import nu.marginalia.actor.state.ActorResumeBehavior; import nu.marginalia.actor.state.ActorStep; import nu.marginalia.actor.state.Resume; -import nu.marginalia.nodecfg.NodeConfigurationService; -import nu.marginalia.process.ProcessOutboxes; -import nu.marginalia.process.ProcessService; -import nu.marginalia.service.module.ServiceConfiguration; -import nu.marginalia.storage.model.FileStorageState; -import nu.marginalia.svc.BackupService; -import nu.marginalia.storage.FileStorageService; -import nu.marginalia.storage.model.FileStorageId; -import nu.marginalia.storage.model.FileStorageType; import nu.marginalia.index.api.IndexMqClient; import nu.marginalia.index.api.IndexMqEndpoints; import nu.marginalia.mq.MqMessageState; @@ -27,9 +19,20 @@ import nu.marginalia.mqapi.converting.ConvertRequest; import nu.marginalia.mqapi.index.CreateIndexRequest; import nu.marginalia.mqapi.index.IndexName; import nu.marginalia.mqapi.loading.LoadRequest; +import nu.marginalia.nodecfg.NodeConfigurationService; +import nu.marginalia.process.ProcessOutboxes; +import nu.marginalia.process.ProcessService; +import nu.marginalia.service.module.ServiceConfiguration; +import nu.marginalia.storage.FileStorageService; +import nu.marginalia.storage.model.FileStorageId; +import nu.marginalia.storage.model.FileStorageState; +import nu.marginalia.storage.model.FileStorageType; +import nu.marginalia.svc.BackupService; +import org.apache.commons.io.FileUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.nio.file.Files; import java.sql.SQLException; import java.util.List; @@ -110,9 +113,30 @@ public class ConvertAndLoadActor extends RecordActorPrototype { if (rsp.state() != MqMessageState.OK) yield new Error("Converter failed"); + if (!shouldAutoClean()) { + // If we're not auto-cleaning, we need to clean the NEW flag for the processed storage + storageService.setFileStorageState(processedId, FileStorageState.UNSET); + // (if we do auto-clean, we skip this step and purge the items after loading) + } + yield new Load(List.of(processedId)); } case Load(List processedIds, long msgId) when msgId < 0 -> { + // clear the output directory of the loader from any debris from partial jobs that have been aborted + Files.list(IndexLocations.getIndexConstructionArea(storageService)).forEach(path -> { + try { + if (Files.isDirectory(path)) { + FileUtils.deleteDirectory(path.toFile()); + } + else if 
(Files.isRegularFile(path)) { + Files.delete(path); + } + } catch (Exception e) { + logger.error("Error clearing staging area", e); + } + }); + + long id = mqLoaderOutbox.sendAsync(new LoadRequest(processedIds)); yield new Load(processedIds, id); @@ -122,9 +146,20 @@ if (rsp.state() != MqMessageState.OK) { yield new Error("Loader failed"); - } else { - cleanProcessedStorage(processedIds); } + + // If we're auto-cleaning, flag the processed files for deletion if they have the NEW flag, + // indicating they've recently been created. We need to check this, so we don't delete archived + // stuff that's being loaded manually + + if (shouldAutoClean()) { + for (var id : processedIds) { + if (FileStorageState.NEW.equals(storageService.getStorage(id).state())) { + storageService.flagFileForDeletion(id); + } + } + } + yield new Backup(processedIds); } case Backup(List<FileStorageId> processedIds) -> { @@ -146,7 +181,7 @@ var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessService.ProcessId.INDEX_CONSTRUCTOR, id); if (rsp.state() != MqMessageState.OK) - yield new Error("Repartition failed"); + yield new Error("Forward index construction failed"); else yield new ReindexFull(); } @@ -155,7 +190,7 @@ var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessService.ProcessId.INDEX_CONSTRUCTOR, id); if (rsp.state() != MqMessageState.OK) - yield new Error("Repartition failed"); + yield new Error("Full index construction failed"); else yield new ReindexPrio(); } @@ -164,7 +199,7 @@ var rsp = processWatcher.waitResponse(mqIndexConstructorOutbox, ProcessService.ProcessId.INDEX_CONSTRUCTOR, id); if (rsp.state() != MqMessageState.OK) - yield new Error("Repartition failed"); + yield new Error("Prio index construction failed"); else yield new SwitchIndex(); } @@ -186,6 +221,16 @@ return mqIndexConstructorOutbox.sendAsync(new CreateIndexRequest(index)); } + private boolean shouldAutoClean() { + try { + return nodeConfigurationService.get(nodeId).autoClean(); + } + catch (SQLException ex) { + logger.error("Error getting node configuration", ex); + return false; // safe default + } + } + @Override public String describe() { @@ -215,24 +260,5 @@ this.nodeId = serviceConfiguration.node(); } - private void cleanProcessedStorage(List<FileStorageId> processedStorageId) { - try { - var config = nodeConfigurationService.get(nodeId); - - for (var id : processedStorageId) { - if (FileStorageState.NEW.equals(storageService.getStorage(id).state())) { - if (config.autoClean()) { - storageService.flagFileForDeletion(id); - } - else { - storageService.setFileStorageState(id, FileStorageState.UNSET); - } - } - } - } - catch (SQLException ex) { - logger.error("Error in clean-up", ex); - } - } } diff --git a/code/execution/java/nu/marginalia/process/ProcessService.java b/code/execution/java/nu/marginalia/process/ProcessService.java index 0744267e..30f15f6e 100644 --- a/code/execution/java/nu/marginalia/process/ProcessService.java +++ b/code/execution/java/nu/marginalia/process/ProcessService.java @@ -19,6 +19,8 @@ import org.slf4j.MarkerFactory; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStreamReader; +import
java.time.LocalDateTime; +import java.time.format.DateTimeFormatter; import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -32,6 +34,7 @@ public class ProcessService { private final ServiceEventLog eventLog; private final ConcurrentHashMap processes = new ConcurrentHashMap<>(); + private final int node; public static ProcessService.ProcessId translateExternalIdBase(String id) { @@ -78,6 +81,7 @@ public class ProcessService { @Inject public ProcessService(BaseServiceParams params) { this.eventLog = params.eventLog; + this.node = params.configuration.node(); } @@ -86,7 +90,7 @@ public class ProcessService { List args = new ArrayList<>(); String javaHome = System.getProperty("java.home"); - args.add(STR."\{javaHome}/bin/java"); + args.add(javaHome + "/bin/java"); args.add("-cp"); args.add(System.getProperty("java.class.path")); @@ -94,6 +98,7 @@ public class ProcessService { else args.add("-da"); args.add("--enable-preview"); + args.add("--enable-native-access=ALL-UNNAMED"); String loggingOpts = System.getProperty("log4j2.configurationFile"); if (loggingOpts != null) { @@ -104,6 +109,17 @@ public class ProcessService { args.add("-Dsystem.serviceNode=" + System.getProperty("system.serviceNode")); } + if (Boolean.getBoolean("system.profile")) { + // add jfr options + args.add("-XX:+FlightRecorder"); + String jfrFileName = "/var/log/wmsa/profile-%s-%d-%s.jfr".formatted( + processId.toString(), + node, + LocalDateTime.now().format(DateTimeFormatter.ISO_LOCAL_DATE_TIME).replace(':', '.') + ); + args.add("-XX:StartFlightRecording=filename=%s,name=%s".formatted(jfrFileName, processId.toString())); + } + args.addAll(processId.envOpts()); args.add(processId.mainClass); args.addAll(Arrays.asList(extraArgs)); diff --git a/code/execution/java/nu/marginalia/svc/BackupService.java b/code/execution/java/nu/marginalia/svc/BackupService.java index 23b95f6c..e6c2f0da 100644 --- a/code/execution/java/nu/marginalia/svc/BackupService.java +++ b/code/execution/java/nu/marginalia/svc/BackupService.java @@ -2,22 +2,25 @@ package nu.marginalia.svc; import com.github.luben.zstd.ZstdInputStream; import com.github.luben.zstd.ZstdOutputStream; +import com.google.inject.Inject; import nu.marginalia.IndexLocations; +import nu.marginalia.index.journal.IndexJournal; import nu.marginalia.linkdb.LinkdbFileNames; import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.model.FileStorageId; import nu.marginalia.storage.model.FileStorageType; -import nu.marginalia.index.journal.IndexJournalFileNames; +import org.apache.commons.io.FileUtils; import org.apache.commons.io.IOUtils; -import com.google.inject.Inject; +import java.io.FileNotFoundException; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.sql.SQLException; import java.time.LocalDateTime; import java.util.List; +import java.util.Optional; public class BackupService { @@ -97,35 +100,20 @@ public class BackupService { private void backupJournal(Path inputStorage, Path backupStorage) throws IOException { - for (var source : IndexJournalFileNames.findJournalFiles(inputStorage)) { - var dest = backupStorage.resolve(source.toFile().getName()); - - try (var is = Files.newInputStream(source); - var os = Files.newOutputStream(dest) - ) { - IOUtils.copyLarge(is, os); - } + Optional journal = IndexJournal.findJournal(inputStorage); + if (journal.isEmpty()) { + throw new FileNotFoundException("No journal found in input storage"); } + 
FileUtils.copyDirectory(journal.get().journalDir().toFile(), backupStorage.resolve(journal.get().journalDir().getFileName()).toFile()); } private void restoreJournal(Path destStorage, Path backupStorage) throws IOException { - - // Remove any old journal files first to avoid them getting loaded - for (var garbage : IndexJournalFileNames.findJournalFiles(destStorage)) { - Files.delete(garbage); + Optional journal = IndexJournal.findJournal(backupStorage); + if (journal.isEmpty()) { + throw new FileNotFoundException("No journal found in backup"); } - - for (var source : IndexJournalFileNames.findJournalFiles(backupStorage)) { - var dest = destStorage.resolve(source.toFile().getName()); - - try (var is = Files.newInputStream(source); - var os = Files.newOutputStream(dest) - ) { - IOUtils.copyLarge(is, os); - } - } - + FileUtils.copyDirectory(backupStorage.resolve(journal.get().journalDir().getFileName()).toFile(), destStorage.toFile()); } private void backupFileCompressed(String fileName, Path inputStorage, Path backupStorage) throws IOException diff --git a/code/features-convert/adblock/build.gradle b/code/features-convert/adblock/build.gradle deleted file mode 100644 index d88d86d3..00000000 --- a/code/features-convert/adblock/build.gradle +++ /dev/null @@ -1,33 +0,0 @@ -plugins { - id 'java' - - - id "de.undercouch.download" version "5.1.0" - - id 'jvm-test-suite' -} - -java { - toolchain { - languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) - } -} - -apply from: "$rootProject.projectDir/srcsets.gradle" - -dependencies { - implementation project(':code:common:config') - - implementation libs.bundles.slf4j - implementation libs.guava - implementation dependencies.create(libs.guice.get()) { - exclude group: 'com.google.guava' - } - implementation libs.notnull - implementation libs.jsoup - - testImplementation libs.bundles.slf4j.test - testImplementation libs.bundles.junit - testImplementation libs.mockito -} - diff --git a/code/features-convert/adblock/readme.md b/code/features-convert/adblock/readme.md deleted file mode 100644 index 32919300..00000000 --- a/code/features-convert/adblock/readme.md +++ /dev/null @@ -1,8 +0,0 @@ -# Adblock - -Contains an adblock simulator that reads an adblock specifications file and -uses it to identify if a document has ads. 
- -## Central Classes - -* [AdblockSimulator](java/nu/marginalia/adblock/AdblockSimulator.java) \ No newline at end of file diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java deleted file mode 100644 index aaad9800..00000000 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java +++ /dev/null @@ -1,189 +0,0 @@ -package nu.marginalia.keyword; - -import nu.marginalia.segmentation.NgramLexicon; -import nu.marginalia.keyword.extractors.*; -import nu.marginalia.keyword.model.DocumentKeywordsBuilder; -import nu.marginalia.language.model.DocumentLanguageData; -import nu.marginalia.language.model.WordRep; -import nu.marginalia.term_frequency_dict.TermFrequencyDict; -import nu.marginalia.model.EdgeUrl; - -import com.google.inject.Inject; -import java.util.*; -import java.util.stream.Stream; - -public class DocumentKeywordExtractor { - - private final KeywordExtractor keywordExtractor; - private final TermFrequencyDict dict; - private final NgramLexicon ngramLexicon; - - - @Inject - public DocumentKeywordExtractor(TermFrequencyDict dict, NgramLexicon ngramLexicon) { - this.dict = dict; - this.ngramLexicon = ngramLexicon; - this.keywordExtractor = new KeywordExtractor(); - } - - - public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData dld, EdgeUrl url) { - - var bitmask = new KeywordPositionBitmask(keywordExtractor, dld); - var tfIdfCounts = new WordsTfIdfCounts(dict, keywordExtractor, dld); - - var titleKeywords = new TitleKeywords(keywordExtractor, dld); - var nameLikeKeywords = new NameLikeKeywords(keywordExtractor, dld, 2); - var subjectLikeKeywords = new SubjectLikeKeywords(keywordExtractor, tfIdfCounts, dld); - var artifactKeywords = new ArtifactKeywords(dld); - var urlKeywords = new UrlKeywords(url); - - var keywordMetadata = KeywordMetadata.builder() - .bitmask(bitmask) - .tfIdfCounts(tfIdfCounts) - .titleKeywords(titleKeywords) - .nameLikeKeywords(nameLikeKeywords) - .subjectLikeKeywords(subjectLikeKeywords) - .urlKeywords(urlKeywords) - .build(); - - DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder(); - - createSimpleWords(wordsBuilder, keywordMetadata, dld); - - createWordsFromSet(wordsBuilder, keywordMetadata, tfIdfCounts); - createWordsFromSet(wordsBuilder, keywordMetadata, titleKeywords); - createWordsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords); - createWordsFromSet(wordsBuilder, keywordMetadata, nameLikeKeywords); - - var importantWords = getImportantWords(tfIdfCounts, nameLikeKeywords, subjectLikeKeywords, wordsBuilder); - wordsBuilder.addImportantWords(importantWords); - - wordsBuilder.addAllSyntheticTerms(artifactKeywords.getWords()); - - return wordsBuilder; - } - - private static Collection getImportantWords(WordsTfIdfCounts tfIdfCounts, NameLikeKeywords nameLikeKeywords, SubjectLikeKeywords subjectLikeKeywords, DocumentKeywordsBuilder wordsBuilder) { - return Stream.of(tfIdfCounts, nameLikeKeywords, subjectLikeKeywords) - .flatMap(k -> k.getReps().stream()) - .filter(w -> { - if (w.word.length() < 3) - return false; - if (w.word.contains("_")) - return false; - return true; - }) - .sorted(tfIdfCounts.reversed()) - .limit(16) - .filter(w -> tfIdfCounts.termFrequencyDictValue(w) > 100) - .sorted(Comparator.comparing(w -> tfIdfCounts.termFrequencyDictValue(w))) - .limit(6) - .map(w -> w.word) - .toList(); - } - - private void 
createWordsFromSet(DocumentKeywordsBuilder wordsBuilder, - KeywordMetadata metadata, - WordReps words) { - - for (var rep : words.getReps()) { - - var word = rep.word; - - if (!word.isBlank()) { - long meta = metadata.getMetadataForWord(rep.stemmed); - - assert meta != 0L : "Missing meta for " + rep.word; - - wordsBuilder.add(word, meta); - } - } - } - - - - private void createSimpleWords(DocumentKeywordsBuilder wordsBuilder, - KeywordMetadata metadata, - DocumentLanguageData documentLanguageData) - { - for (var sent : documentLanguageData.sentences) { - - if (wordsBuilder.size() > 1500) - break; - - for (var word : sent) { - if (word.isStopWord()) { - continue; - } - - String w = word.wordLowerCase(); - if (matchesWordPattern(w)) { - long meta = metadata.getMetadataForWord(word.stemmed()); - assert meta != 0L : "Missing meta for " + word.word(); - - wordsBuilder.add(w, meta); - } - } - - for (var names : keywordExtractor.getProperNames(sent)) { - var rep = new WordRep(sent, names); - - long meta = metadata.getMetadataForWord(rep.stemmed); - assert meta != 0L : "Missing meta for " + rep.word; - - wordsBuilder.add(rep.word, meta); - } - - for (int i = 0; i < sent.ngrams.length; i++) { - var ngram = sent.ngrams[i]; - var ngramStemmed = sent.ngramStemmed[i]; - - long meta = metadata.getMetadataForWord(ngramStemmed); - assert meta != 0L : "Missing meta for " + ngram; - - wordsBuilder.add(ngram, meta); - } - - } - } - - boolean matchesWordPattern(String s) { - // this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4} - - String wordPartSeparator = ".-_/:+*"; - - int i = 0; - - for (int run = 0; run < 15 && i < s.length(); run++, i++) { - char c = s.charAt(i); - if (c >= 'a' && c <= 'z') continue; - if (c >= 'A' && c <= 'Z') continue; - if (c >= '0' && c <= '9') continue; - break; - } - - if (i == 0) - return false; - - for (int j = 0; j < 5; j++) { - if (i == s.length()) return true; - - if (wordPartSeparator.indexOf(s.charAt(i)) < 0) { - return false; - } - - i++; - - for (int run = 0; run < 10 && i < s.length(); run++, i++) { - char c = s.charAt(i); - if (c >= 'a' && c <= 'z') continue; - if (c >= 'A' && c <= 'Z') continue; - if (c >= '0' && c <= '9') continue; - break; - } - } - - return false; - } -} diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java deleted file mode 100644 index 7160eb04..00000000 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java +++ /dev/null @@ -1,64 +0,0 @@ -package nu.marginalia.keyword; - -import lombok.Builder; -import nu.marginalia.keyword.extractors.*; -import nu.marginalia.model.idx.WordMetadata; -import nu.marginalia.model.idx.WordFlags; - -import java.util.EnumSet; - -class KeywordMetadata { - - private final KeywordPositionBitmask bitmask; - private final TitleKeywords titleKeywords; - private final NameLikeKeywords nameLikeKeywords; - private final SubjectLikeKeywords subjectLikeKeywords; - private final UrlKeywords urlKeywords; - private final WordsTfIdfCounts tfIdfCounts; - - @Builder - public KeywordMetadata( - KeywordPositionBitmask bitmask, - TitleKeywords titleKeywords, - NameLikeKeywords nameLikeKeywords, - SubjectLikeKeywords subjectLikeKeywords, - UrlKeywords urlKeywords, - WordsTfIdfCounts tfIdfCounts) { - - this.bitmask = bitmask; - this.titleKeywords = titleKeywords; - this.nameLikeKeywords = nameLikeKeywords; - 
this.subjectLikeKeywords = subjectLikeKeywords; - this.urlKeywords = urlKeywords; - this.tfIdfCounts = tfIdfCounts; - } - - public long getMetadataForWord(String stemmed) { - - int tfidf = tfIdfCounts.getTfIdf(stemmed); - EnumSet flags = EnumSet.noneOf(WordFlags.class); - - if (tfidf > 100) - flags.add(WordFlags.TfIdfHigh); - - if (subjectLikeKeywords.contains(stemmed)) - flags.add(WordFlags.Subjects); - - if (nameLikeKeywords.contains(stemmed)) - flags.add(WordFlags.NamesWords); - - if (titleKeywords.contains(stemmed)) - flags.add(WordFlags.Title); - - if (urlKeywords.containsUrl(stemmed)) - flags.add(WordFlags.UrlPath); - - if (urlKeywords.containsDomain(stemmed)) - flags.add(WordFlags.UrlDomain); - - long positions = bitmask.get(stemmed); - - return new WordMetadata(positions, flags).encode(); - } - -} diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java deleted file mode 100644 index 230c895f..00000000 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/KeywordPositionBitmask.java +++ /dev/null @@ -1,105 +0,0 @@ -package nu.marginalia.keyword.extractors; - -import com.google.inject.Inject; -import it.unimi.dsi.fastutil.objects.Object2LongOpenHashMap; -import nu.marginalia.keyword.KeywordExtractor; -import nu.marginalia.language.model.DocumentLanguageData; -import nu.marginalia.model.idx.WordMetadata; - -/** Generates a position bitmask for each word in a document */ -public class KeywordPositionBitmask { - private final Object2LongOpenHashMap positionMask = new Object2LongOpenHashMap<>(10_000, 0.7f); - private final static int positionWidth = WordMetadata.POSITIONS_COUNT; - private final static long positionBitmask = WordMetadata.POSITIONS_MASK; - private static final int unmodulatedPortion = 16; - - @Inject - public KeywordPositionBitmask(KeywordExtractor keywordExtractor, - DocumentLanguageData dld) - { - - // Mark the title words as position 0 - for (var sent : dld.titleSentences) { - int posBit = 1; - - for (var word : sent) { - positionMask.merge(word.stemmed(), posBit, this::bitwiseOr); - } - - for (var ngram : sent.ngramStemmed) { - positionMask.merge(ngram, posBit, this::bitwiseOr); - } - - for (var span : keywordExtractor.getKeywordsFromSentence(sent)) { - positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr); - } - - for (var span : keywordExtractor.getProperNames(sent)) { - positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr); - } - } - - // Mark subsequent sentences in subsequent positions, with increasing sentence step size - LinePosition linePos = new LinePosition(); - for (var sent : dld.sentences) { - - long posBit = (1L << linePos.pos()) & positionBitmask; - - for (var word : sent) { - positionMask.merge(word.stemmed(), posBit, this::bitwiseOr); - } - - for (var ngram : sent.ngramStemmed) { - positionMask.merge(ngram, posBit, this::bitwiseOr); - } - - for (var span : keywordExtractor.getKeywordsFromSentence(sent)) { - positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr); - } - - for (var span : keywordExtractor.getProperNames(sent)) { - positionMask.merge(sent.constructStemmedWordFromSpan(span), posBit, this::bitwiseOr); - } - - linePos.next(sent.length()); - } - } - - public long get(String stemmed) { - return positionMask.getOrDefault(stemmed, 0); - } - - private long 
bitwiseOr(long a, long b) { - return a | b; - } - - private static class LinePosition { - private int lineLengthCtr = 0; - private int bitMaskPos = 1; - - public int pos() { - if (bitMaskPos < unmodulatedPortion) { - return bitMaskPos; - } - else { - return unmodulatedPortion + ((bitMaskPos - unmodulatedPortion) % (positionWidth - unmodulatedPortion)); - } - } - - public void next(int sentenceLength) - { - if (sentenceLength > 10) { - lineLengthCtr = 0; - ++bitMaskPos; - } - - lineLengthCtr += sentenceLength; - if (lineLengthCtr > 15) { - lineLengthCtr = 0; - ++bitMaskPos; - } - - } - - } -} diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywords.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywords.java deleted file mode 100644 index f8ad86d7..00000000 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywords.java +++ /dev/null @@ -1,68 +0,0 @@ -package nu.marginalia.keyword.model; - -import nu.marginalia.model.idx.WordMetadata; - -import java.io.Serial; -import java.io.Serializable; - -public final class DocumentKeywords implements Serializable { - - @Serial - private static final long serialVersionUID = 1387282293082091432L; - - public final String[] keywords; - public final long[] metadata; - - public DocumentKeywords(String[] keywords, - long[] metadata) - { - this.keywords = keywords; - this.metadata = metadata; - - assert keywords.length == metadata.length; - - if (DocumentKeywords.class.desiredAssertionStatus()) { - for (int i = 0; i < metadata.length; i++) { - if (metadata[i] == 0) { - System.err.println("Bad metadata for keyword " + keywords[i]); - } - } - } - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append(getClass().getSimpleName()); - sb.append('['); - var pointer = newPointer(); - while (pointer.advancePointer()) { - sb.append("\n\t "); - - long metadata = pointer.getMetadata(); - String keyword = pointer.getKeyword(); - sb.append(keyword); - - if (metadata != 0) { - sb.append("/").append(new WordMetadata(metadata)); - } - } - return sb.append("\n]").toString(); - } - - public boolean isEmpty() { - return keywords.length == 0; - } - - public int size() { - return keywords.length; - } - - /** Return a pointer for traversing this structure */ - public DocumentKeywordsPointer newPointer() { - return new DocumentKeywordsPointer(this); - } - -} - - diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java deleted file mode 100644 index 49cf3914..00000000 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java +++ /dev/null @@ -1,122 +0,0 @@ -package nu.marginalia.keyword.model; - -import it.unimi.dsi.fastutil.objects.Object2LongLinkedOpenHashMap; -import lombok.Getter; -import nu.marginalia.model.idx.WordFlags; -import nu.marginalia.model.idx.WordMetadata; - -import java.util.*; - -@Getter -public class DocumentKeywordsBuilder { - public final Object2LongLinkedOpenHashMap words; - - /** These ware keywords that had signals of high relevance */ - public final Set importantWords = new HashSet<>(); - - // |------64 letters is this long-------------------------------| - // granted, some of these words are word n-grams, but 64 ought to - // be plenty. 
The lexicon writer has another limit that's higher. - private final int MAX_WORD_LENGTH = 64; - - public DocumentKeywordsBuilder() { - this(1600); - } - - public DocumentKeywords build() { - final String[] wordArray = new String[words.size()]; - final long[] meta = new long[words.size()]; - - var iter = words.object2LongEntrySet().fastIterator(); - - for (int i = 0; iter.hasNext(); i++) { - var entry = iter.next(); - - meta[i] = entry.getLongValue(); - wordArray[i] = entry.getKey(); - } - - return new DocumentKeywords(wordArray, meta); - } - - public DocumentKeywordsBuilder(int capacity) { - words = new Object2LongLinkedOpenHashMap<>(capacity); - } - - public void add(String word, long meta) { - if (word.length() > MAX_WORD_LENGTH) - return; - - words.put(word, meta); - } - - public void addImportantWords(Collection words) { - importantWords.addAll(words); - } - - public void addJustNoMeta(String word) { - if (word.length() > MAX_WORD_LENGTH) - return; - - words.putIfAbsent(word, 0); - } - - public void setFlagOnMetadataForWords(WordFlags flag, Collection flagWords) { - flagWords.forEach(word -> - words.mergeLong(word, flag.asBit(), (a, b) -> a|b) - ); - } - - public void addAllSyntheticTerms(Collection newWords) { - long meta = WordFlags.Synthetic.asBit(); - - // Only add the synthetic flag if the words aren't already present - - newWords.forEach(word -> words.putIfAbsent(word, meta)); - } - - public void addAnchorTerms(Map keywords) { - long flagA = WordFlags.ExternalLink.asBit(); - long flagB = flagA | WordFlags.Site.asBit(); - long flagC = flagB | WordFlags.SiteAdjacent.asBit(); - - keywords.forEach((word, count) -> { - if (count > 5) { - words.mergeLong(word, flagC, (a, b) -> a|b); - } else if (count > 2) { - words.mergeLong(word, flagB, (a, b) -> a|b); - } else { - words.mergeLong(word, flagA, (a, b) -> a|b); - } - }); - } - - public List getWordsWithAnyFlag(long flags) { - List ret = new ArrayList<>(); - - for (var iter = words.object2LongEntrySet().fastIterator(); iter.hasNext();) { - var entry = iter.next(); - if ((flags & entry.getLongValue()) != 0) { - ret.add(entry.getKey()); - } - } - - return ret; - } - - public int size() { - return words.size(); - } - - public WordMetadata getMetaForWord(String word) { - return new WordMetadata(words.getLong(word)); - } - @Override - public String toString() { - StringBuilder sb = new StringBuilder("[ "); - words.forEach((word, meta) -> sb.append(word).append("->").append(new WordMetadata(meta)).append(' ')); - return sb.append(']').toString(); - - } - -} diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsPointer.java b/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsPointer.java deleted file mode 100644 index 2bc068d9..00000000 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsPointer.java +++ /dev/null @@ -1,41 +0,0 @@ -package nu.marginalia.keyword.model; - -/** Pointer into a {@see DocumentKeywords}. It starts out before the first position, - * forward with advancePointer(). 
- * */ -public class DocumentKeywordsPointer { - private int pos = -1; - - private final DocumentKeywords keywords; - - DocumentKeywordsPointer(DocumentKeywords keywords) { - this.keywords = keywords; - } - - /** Number of positions remaining */ - public int remaining() { - return keywords.size() - Math.max(0, pos); - } - - /** Return the keyword associated with the current position */ - public String getKeyword() { - return keywords.keywords[pos]; - } - - /** Return the metadata associated with the current position */ - public long getMetadata() { - return keywords.metadata[pos]; - } - - /** Advance the current position, - * returns false if this was the - * last position */ - public boolean advancePointer() { - return ++pos < keywords.size(); - } - - /** Returns true unless the pointer is beyond the last position in the keyword set */ - public boolean hasMore() { - return pos + 1 < keywords.size(); - } -} diff --git a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java b/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java deleted file mode 100644 index 54577f80..00000000 --- a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java +++ /dev/null @@ -1,149 +0,0 @@ -package nu.marginalia.keyword; - -import nu.marginalia.WmsaHome; -import nu.marginalia.converting.processor.logic.dom.DomPruningFilter; -import nu.marginalia.language.sentence.SentenceExtractor; -import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.idx.WordMetadata; -import nu.marginalia.segmentation.NgramLexicon; -import nu.marginalia.term_frequency_dict.TermFrequencyDict; -import org.jsoup.Jsoup; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.net.URISyntaxException; -import java.nio.charset.Charset; -import java.util.HashMap; -import java.util.Map; -import java.util.Objects; -import java.util.Set; - -class DocumentKeywordExtractorTest { - - DocumentKeywordExtractor extractor = new DocumentKeywordExtractor( - new TermFrequencyDict(WmsaHome.getLanguageModels()), - new NgramLexicon(WmsaHome.getLanguageModels())); - SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); - - @Test - public void testWordPattern() { - Assertions.assertTrue(extractor.matchesWordPattern("test")); - Assertions.assertTrue(extractor.matchesWordPattern("1234567890abcde")); - Assertions.assertFalse(extractor.matchesWordPattern("1234567890abcdef")); - - Assertions.assertTrue(extractor.matchesWordPattern("test-test-test-test-test")); - Assertions.assertFalse(extractor.matchesWordPattern("test-test-test-test-test-test")); - Assertions.assertTrue(extractor.matchesWordPattern("192.168.1.100/24")); - Assertions.assertTrue(extractor.matchesWordPattern("std::vector")); - Assertions.assertTrue(extractor.matchesWordPattern("c++")); - Assertions.assertTrue(extractor.matchesWordPattern("m*a*s*h")); - Assertions.assertFalse(extractor.matchesWordPattern("Stulpnagelstrasse")); - } - - - @Test - public void testEmptyMetadata() throws URISyntaxException { - var dld = se.extractSentences(""" - Some sample text, I'm not sure what even triggers this - """, "A title perhaps?"); - var keywordBuilder = extractor.extractKeywords(dld, new EdgeUrl("https://www.example.com/invalid")); - var keywords = keywordBuilder.build(); - - var pointer = keywords.newPointer(); - while (pointer.advancePointer()) { - if (pointer.getMetadata() == 
0L) { - System.out.println("Aha! " + pointer.getKeyword()); - } - } - - } - - @Test - public void testKeyboards2() throws IOException, URISyntaxException { - var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"), - "Could not load word frequency table"); - String html = new String(resource.readAllBytes(), Charset.defaultCharset()); - var doc = Jsoup.parse(html); - doc.filter(new DomPruningFilter(0.5)); - - var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/")); - - keywords.getWords().forEach((k, v) -> { - if (k.contains("_")) { - System.out.println(k + " " + new WordMetadata(v)); - } - }); - } - @Test - public void testKeyboards() throws IOException, URISyntaxException { - var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"), - "Could not load word frequency table"); - String html = new String(resource.readAllBytes(), Charset.defaultCharset()); - var doc = Jsoup.parse(html); - doc.filter(new DomPruningFilter(0.5)); - - var keywords = extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/")); - System.out.println(keywords.getMetaForWord("mechanical")); - System.out.println(keywords.getMetaForWord("keyboard")); - System.out.println(keywords.getMetaForWord("keyboards")); - - System.out.println(new WordMetadata(8894889328781L)); - System.out.println(new WordMetadata(4294967297L)); - System.out.println(new WordMetadata(566820053975498886L)); - // - - System.out.println(new WordMetadata(1198298103937L)); - System.out.println(new WordMetadata(1103808168065L)); - } - - @Test - public void testMadonna() throws IOException, URISyntaxException { - var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/madonna.html"), - "Could not load word frequency table"); - String html = new String(resource.readAllBytes(), Charset.defaultCharset()); - var doc = Jsoup.parse(html); - doc.filter(new DomPruningFilter(0.5)); - - var keywords = extractor.extractKeywords( - se.extractSentences(doc), - new EdgeUrl("https://encyclopedia.marginalia.nu/article/Don't_Tell_Me_(Madonna_song)") - ); - - var keywordsBuilt = keywords.build(); - var ptr = keywordsBuilt.newPointer(); - - Map dirtyAndBlues = new HashMap<>(); - - while (ptr.advancePointer()) { - if (Set.of("dirty", "blues").contains(ptr.getKeyword())) { - Assertions.assertNull( - dirtyAndBlues.put(ptr.getKeyword(), new WordMetadata(ptr.getMetadata())) - ); - } - } - - Assertions.assertTrue(dirtyAndBlues.containsKey("dirty")); - Assertions.assertTrue(dirtyAndBlues.containsKey("blues")); - Assertions.assertNotEquals( - dirtyAndBlues.get("dirty"), - dirtyAndBlues.get("blues") - ); - } - - @Test - public void testSpam() throws IOException, URISyntaxException { - var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/spam.html"), - "Could not load word frequency table"); - String html = new String(resource.readAllBytes(), Charset.defaultCharset()); - var doc = Jsoup.parse(html); - doc.filter(new DomPruningFilter(0.5)); - - DocumentKeywordExtractor extractor = new DocumentKeywordExtractor( - new TermFrequencyDict(WmsaHome.getLanguageModels()), - new NgramLexicon(WmsaHome.getLanguageModels())); - SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); - - var keywords = 
extractor.extractKeywords(se.extractSentences(doc), new EdgeUrl("https://math.byu.edu/wiki/index.php/All_You_Need_To_Know_About_Earning_Money_Online")); - System.out.println(keywords.getMetaForWord("knitting")); - } -} \ No newline at end of file diff --git a/code/features-convert/pubdate/build.gradle b/code/features-convert/pubdate/build.gradle deleted file mode 100644 index aeafcd99..00000000 --- a/code/features-convert/pubdate/build.gradle +++ /dev/null @@ -1,34 +0,0 @@ -plugins { - id 'java' - - - id "de.undercouch.download" version "5.1.0" - - id 'jvm-test-suite' -} - -java { - toolchain { - languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) - } -} - -apply from: "$rootProject.projectDir/srcsets.gradle" - -dependencies { - implementation project(':code:common:model') - - implementation libs.bundles.slf4j - implementation libs.guava - implementation dependencies.create(libs.guice.get()) { - exclude group: 'com.google.guava' - } - implementation libs.notnull - implementation libs.bundles.gson - implementation libs.jsoup - - testImplementation libs.bundles.slf4j.test - testImplementation libs.bundles.junit - testImplementation libs.mockito - testImplementation project(':code:common:config') -} diff --git a/code/features-convert/pubdate/readme.md b/code/features-convert/pubdate/readme.md deleted file mode 100644 index add657ee..00000000 --- a/code/features-convert/pubdate/readme.md +++ /dev/null @@ -1,7 +0,0 @@ -# Pubdate - -Contains advanced haruspicy for figuring out when a document was published. - -## Central Classes - -* [PubDateSniffer](java/nu/marginalia/pubdate/PubDateSniffer.java) \ No newline at end of file diff --git a/code/features-convert/readme.md b/code/features-convert/readme.md deleted file mode 100644 index 2979fdab..00000000 --- a/code/features-convert/readme.md +++ /dev/null @@ -1,13 +0,0 @@ -# Converter Features - -## Major features - -* [keyword-extraction](keyword-extraction/) - Identifies keywords to index in a document -* [summary-extraction](summary-extraction/) - Generate an excerpt/quote from a website to display on the search results page. 
- - -## Smaller features: - -* [adblock](adblock/) - Simulates Adblock -* [pubdate](pubdate/) - Determines when a document was published -* [topic-detection](topic-detection/) - Tries to identify the topic of a website diff --git a/code/features-convert/reddit-json/build.gradle b/code/features-convert/reddit-json/build.gradle deleted file mode 100644 index fed33f4f..00000000 --- a/code/features-convert/reddit-json/build.gradle +++ /dev/null @@ -1,44 +0,0 @@ -plugins { - id 'java' - - id 'jvm-test-suite' -} - -java { - toolchain { - languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) - } -} - -apply from: "$rootProject.projectDir/srcsets.gradle" - -dependencies { - implementation libs.bundles.slf4j - - implementation project(':code:libraries:blocking-thread-pool') - implementation project(':code:common:model') - implementation libs.notnull - - implementation libs.jsoup - implementation libs.sqlite - - implementation libs.guava - implementation dependencies.create(libs.guice.get()) { - exclude group: 'com.google.guava' - } - implementation libs.guava - implementation libs.gson - implementation libs.zstd - implementation libs.trove - implementation libs.commons.compress - implementation libs.xz - - testImplementation libs.bundles.slf4j.test - testImplementation libs.bundles.junit - testImplementation libs.mockito -} - -test { - maxHeapSize = "8G" - useJUnitPlatform() -} diff --git a/code/features-convert/stackexchange-xml/build.gradle b/code/features-convert/stackexchange-xml/build.gradle deleted file mode 100644 index 62e289b0..00000000 --- a/code/features-convert/stackexchange-xml/build.gradle +++ /dev/null @@ -1,43 +0,0 @@ -plugins { - id 'java' - - id 'jvm-test-suite' -} - -java { - toolchain { - languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) - } -} - -apply from: "$rootProject.projectDir/srcsets.gradle" - -dependencies { - implementation libs.bundles.slf4j - - implementation project(':code:libraries:blocking-thread-pool') - implementation project(':code:common:model') - implementation libs.notnull - - implementation libs.jsoup - implementation libs.sqlite - - implementation libs.guava - implementation dependencies.create(libs.guice.get()) { - exclude group: 'com.google.guava' - } - implementation libs.guava - implementation libs.zstd - implementation libs.trove - implementation libs.commons.compress - implementation libs.xz - - testImplementation libs.bundles.slf4j.test - testImplementation libs.bundles.junit - testImplementation libs.mockito -} - -test { - maxHeapSize = "8G" - useJUnitPlatform() -} diff --git a/code/features-convert/stackexchange-xml/readme.md b/code/features-convert/stackexchange-xml/readme.md deleted file mode 100644 index 1701ad7f..00000000 --- a/code/features-convert/stackexchange-xml/readme.md +++ /dev/null @@ -1,18 +0,0 @@ -Stackexchange's data is a jumble of questions and answers, -where the answers refer to the questions with a parentId field. - -e.g. -```xml - - - - - - -``` - -Since the search engine wants to extract keywords for each thread -holistically, not by question or answer, it is necessary to re-arrange -the data (which is very large). SQLite does a decent job of enabling -this task. 
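In code, the rearrangement is essentially one indexed self-join. The following is a minimal sketch of that idea, not the converter's actual implementation: the staging table, its columns, and the database path are all invented, and it assumes the sqlite-jdbc driver this module already pulls in via libs.sqlite.

```java
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;

public class ThreadJoinSketch {
    public static void main(String[] args) throws SQLException {
        try (Connection conn = DriverManager.getConnection("jdbc:sqlite:posts.db")) {
            try (var stmt = conn.createStatement()) {
                // Hypothetical staging table; parentId is NULL for questions
                stmt.executeUpdate("CREATE TABLE IF NOT EXISTS post (id INTEGER PRIMARY KEY, parentId INTEGER, title TEXT, body TEXT)");
                // Without this index, every thread lookup is a full scan
                stmt.executeUpdate("CREATE INDEX IF NOT EXISTS post_parent ON post(parentId)");
            }

            // After the XML dump has been streamed into the table, each thread
            // comes back as one unit: the question row joined with its answers.
            try (var ps = conn.prepareStatement("""
                    SELECT q.id, q.title, q.body, a.body AS answerBody
                    FROM post q
                    LEFT JOIN post a ON a.parentId = q.id
                    WHERE q.parentId IS NULL
                    ORDER BY q.id
                    """);
                 var rs = ps.executeQuery())
            {
                while (rs.next()) {
                    long threadId = rs.getLong(1);
                    // One row per (question, answer) pair; consecutive rows
                    // sharing threadId form one holistic thread document.
                }
            }
        }
    }
}
```

Grouping consecutive rows by question id then yields one question-plus-answers document per thread, which is the holistic unit the keyword extractor wants.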
- diff --git a/code/features-convert/summary-extraction/build.gradle b/code/features-convert/summary-extraction/build.gradle deleted file mode 100644 index 24eec1ca..00000000 --- a/code/features-convert/summary-extraction/build.gradle +++ /dev/null @@ -1,42 +0,0 @@ -plugins { - id 'java' - - id 'jvm-test-suite' -} - -java { - toolchain { - languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) - } -} - -apply from: "$rootProject.projectDir/srcsets.gradle" - -dependencies { - implementation libs.bundles.slf4j - - implementation libs.notnull - - implementation libs.jsoup - - implementation libs.guava - implementation dependencies.create(libs.guice.get()) { - exclude group: 'com.google.guava' - } - implementation libs.guava - implementation libs.bundles.gson - implementation libs.trove - implementation libs.fastutil - implementation libs.commons.lang3 - - testImplementation libs.bundles.slf4j.test - testImplementation libs.bundles.junit - testImplementation libs.mockito - - testImplementation project(':code:features-convert:keyword-extraction') - testImplementation project(':code:libraries:language-processing') - testImplementation project(':code:libraries:term-frequency-dict') - testImplementation project(':code:common:config') - testImplementation project(':code:common:model') -} - diff --git a/code/features-convert/summary-extraction/readme.md b/code/features-convert/summary-extraction/readme.md deleted file mode 100644 index b617d947..00000000 --- a/code/features-convert/summary-extraction/readme.md +++ /dev/null @@ -1,25 +0,0 @@ -# Summary Extraction - -This feature attempts to find a descriptive passage of text that summarizes -what a search result "is about". It's the text you see below a search result. - -It must solve two problems: - -1. Identify which part of the document that contains "the text". -The crux is that the document may be anywhere from 1993 to the present, with era-appropriate -formatting. It may be formatted with <center>ed <font>-tags, or semantic HTML5. - -2. Identify which part of "the text" best describes the document. - -It uses several naive heuristics to try to find something that makes sense, -and there is probably room for improvement. - -There are many good techniques for doing this, but they've sadly not proved -particularly fast. Whatever solution is used needs to be able to summarize of -order of a 100,000,000 documents with a time budget of a couple of hours. 
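To put that time budget in numbers: 100,000,000 documents in roughly two hours is about 14,000 documents per second. The back-of-envelope sketch below makes the per-document budget explicit; the 32-thread worker count is an invented figure for illustration, not the converter's actual configuration.

```java
public class SummaryBudgetSketch {
    public static void main(String[] args) {
        long documents = 100_000_000L;   // corpus size from the text above
        long seconds = 2 * 3600;         // "a couple of hours"
        int threads = 32;                // assumed worker count, illustrative only

        double docsPerSecond = (double) documents / seconds;
        double millisPerDoc = 1e3 * threads / docsPerSecond;

        System.out.printf("overall throughput: %.0f docs/s%n", docsPerSecond); // ~13889
        System.out.printf("per-thread budget:  %.1f ms/doc%n", millisPerDoc);  // ~2.3
    }
}
```

A couple of milliseconds per document is presumably why the naive heuristics win out over slower, more sophisticated summarization techniques.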
- - -## Central Classes - -* [SummaryExtractor](java/nu/marginalia/summary/SummaryExtractor.java) - diff --git a/code/features-convert/topic-detection/build.gradle b/code/features-convert/topic-detection/build.gradle deleted file mode 100644 index ef29d275..00000000 --- a/code/features-convert/topic-detection/build.gradle +++ /dev/null @@ -1,34 +0,0 @@ -plugins { - id 'java' - - - id "de.undercouch.download" version "5.1.0" - - id 'jvm-test-suite' -} - -java { - toolchain { - languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) - } -} - -apply from: "$rootProject.projectDir/srcsets.gradle" - -dependencies { - implementation project(':code:common:config') - implementation project(':code:libraries:language-processing') - implementation project(':third-party:porterstemmer') - - implementation libs.bundles.slf4j - implementation libs.guava - implementation dependencies.create(libs.guice.get()) { - exclude group: 'com.google.guava' - } - implementation libs.notnull - implementation libs.jsoup - - testImplementation libs.bundles.slf4j.test - testImplementation libs.bundles.junit - testImplementation libs.mockito -} diff --git a/code/features-convert/topic-detection/readme.md b/code/features-convert/topic-detection/readme.md deleted file mode 100644 index db9a0000..00000000 --- a/code/features-convert/topic-detection/readme.md +++ /dev/null @@ -1,4 +0,0 @@ -# Topic Detection - -This is an experiment in using hand-crafted naive bayesian filters to detecting the topic of a website. -It's noteworthy it detects recipes very well. \ No newline at end of file diff --git a/code/features-crawl/readme.md b/code/features-crawl/readme.md deleted file mode 100644 index 4566e980..00000000 --- a/code/features-crawl/readme.md +++ /dev/null @@ -1,8 +0,0 @@ -# Crawl Features - -These are bits of search-engine related code that are relatively isolated pieces of business logic, -that benefit from the clarity of being kept separate from the rest of the crawling code. 
- -* [content-type](content-type/) - Content Type identification -* [crawl-blocklist](crawl-blocklist/) - IP and URL blocklists -* [link-parser](link-parser/) - Code for parsing and normalizing links diff --git a/code/functions/math/api/java/nu/marginalia/api/math/MathClient.java b/code/functions/math/api/java/nu/marginalia/api/math/MathClient.java index ee0a55cd..8d98429d 100644 --- a/code/functions/math/api/java/nu/marginalia/api/math/MathClient.java +++ b/code/functions/math/api/java/nu/marginalia/api/math/MathClient.java @@ -2,6 +2,11 @@ package nu.marginalia.api.math; import com.google.inject.Inject; import com.google.inject.Singleton; +import nu.marginalia.api.math.MathProtobufCodec.DictionaryLookup; +import nu.marginalia.api.math.MathProtobufCodec.EvalMath; +import nu.marginalia.api.math.MathProtobufCodec.SpellCheck; +import nu.marginalia.api.math.MathProtobufCodec.UnitConversion; +import nu.marginalia.api.math.model.DictionaryResponse; import nu.marginalia.service.client.GrpcChannelPoolFactory; import nu.marginalia.service.client.GrpcSingleNodeChannelPool; import nu.marginalia.service.discovery.property.ServiceKey; @@ -9,14 +14,11 @@ import nu.marginalia.service.discovery.property.ServicePartition; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.time.Duration; -import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.concurrent.*; - -import nu.marginalia.api.math.model.*; -import nu.marginalia.api.math.MathProtobufCodec.*; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; @Singleton @@ -49,24 +51,14 @@ public class MathClient { .thenApply(SpellCheck::convertResponse); } - public Map> spellCheck(List words, Duration timeout) throws InterruptedException { + // This looks a bit different because we need to spell check multiple words, and we want to do it in parallel + public Future>> spellCheck(List words) throws InterruptedException { List requests = words.stream().map(SpellCheck::createRequest).toList(); - var future = channelPool.call(MathApiGrpc.MathApiBlockingStub::spellCheck) + return channelPool.call(MathApiGrpc.MathApiBlockingStub::spellCheck) .async(executor) - .runFor(requests); - - try { - var results = future.get(); - Map> map = new HashMap<>(); - for (int i = 0; i < words.size(); i++) { - map.put(words.get(i), SpellCheck.convertResponse(results.get(i))); - } - return map; - } - catch (ExecutionException e) { - throw new RuntimeException(e); - } + .runFor(requests) + .thenApply(rsp -> SpellCheck.convertResponses(words, rsp)); } public Future unitConversion(String value, String from, String to) { diff --git a/code/functions/math/api/java/nu/marginalia/api/math/MathProtobufCodec.java b/code/functions/math/api/java/nu/marginalia/api/math/MathProtobufCodec.java index 2b865b21..ec077e6b 100644 --- a/code/functions/math/api/java/nu/marginalia/api/math/MathProtobufCodec.java +++ b/code/functions/math/api/java/nu/marginalia/api/math/MathProtobufCodec.java @@ -3,7 +3,9 @@ package nu.marginalia.api.math; import nu.marginalia.api.math.model.DictionaryEntry; import nu.marginalia.api.math.model.DictionaryResponse; +import java.util.HashMap; import java.util.List; +import java.util.Map; public class MathProtobufCodec { @@ -35,6 +37,15 @@ public class MathProtobufCodec { public static List convertResponse(RpcSpellCheckResponse rsp) { return rsp.getSuggestionsList(); } + + + public static Map> convertResponses(List words, List responses) { + var map = new 
HashMap>(); + for (int i = 0; i < words.size(); i++) { + map.put(words.get(i), responses.get(i).getSuggestionsList()); + } + return map; + } } public static class UnitConversion { diff --git a/code/functions/search-query/api/build.gradle b/code/functions/search-query/api/build.gradle index b85497cc..a589f52f 100644 --- a/code/functions/search-query/api/build.gradle +++ b/code/functions/search-query/api/build.gradle @@ -23,6 +23,7 @@ dependencies { implementation project(':code:common:config') implementation project(':code:common:service') implementation project(':code:index:query') + implementation project(':code:libraries:language-processing') implementation libs.bundles.slf4j diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java index af783a83..bd421bfc 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/IndexProtobufCodec.java @@ -1,11 +1,9 @@ package nu.marginalia.api.searchquery; +import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint; import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.Bm25Parameters; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; -import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails; -import nu.marginalia.api.searchquery.model.results.debug.ResultRankingInputs; -import nu.marginalia.api.searchquery.model.results.debug.ResultRankingOutputs; import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.SpecificationLimit; import nu.marginalia.index.query.limit.SpecificationLimitType; @@ -48,11 +46,22 @@ public class IndexProtobufCodec { } public static SearchQuery convertRpcQuery(RpcQuery query) { - List> coherences = new ArrayList<>(); + List phraseConstraints = new ArrayList<>(); - for (int j = 0; j < query.getCoherencesCount(); j++) { - var coh = query.getCoherences(j); - coherences.add(new ArrayList<>(coh.getCoherencesList())); + for (int j = 0; j < query.getPhrasesCount(); j++) { + var coh = query.getPhrases(j); + if (coh.getType() == RpcPhrases.TYPE.OPTIONAL) { + phraseConstraints.add(new SearchPhraseConstraint.Optional(List.copyOf(coh.getTermsList()))); + } + else if (coh.getType() == RpcPhrases.TYPE.MANDATORY) { + phraseConstraints.add(new SearchPhraseConstraint.Mandatory(List.copyOf(coh.getTermsList()))); + } + else if (coh.getType() == RpcPhrases.TYPE.FULL) { + phraseConstraints.add(new SearchPhraseConstraint.Full(List.copyOf(coh.getTermsList()))); + } + else { + throw new IllegalArgumentException("Unknown phrase constraint type: " + coh.getType()); + } } return new SearchQuery( @@ -61,7 +70,7 @@ query.getExcludeList(), query.getAdviceList(), query.getPriorityList(), - coherences + phraseConstraints ); } @@ -74,8 +83,21 @@ .addAllExclude(searchQuery.getSearchTermsExclude()) .addAllPriority(searchQuery.getSearchTermsPriority()); - for (var coherences : searchQuery.searchTermCoherences) { - subqueryBuilder.addCoherencesBuilder().addAllCoherences(coherences); + for (var constraint : searchQuery.phraseConstraints) { + switch (constraint) { + case SearchPhraseConstraint.Optional(List terms) -> + subqueryBuilder.addPhrasesBuilder() + .addAllTerms(terms) + 
.setType(RpcPhrases.TYPE.OPTIONAL); + case SearchPhraseConstraint.Mandatory(List terms) -> + subqueryBuilder.addPhrasesBuilder() + .addAllTerms(terms) + .setType(RpcPhrases.TYPE.MANDATORY); + case SearchPhraseConstraint.Full(List terms) -> + subqueryBuilder.addPhrasesBuilder() + .addAllTerms(terms) + .setType(RpcPhrases.TYPE.FULL); + } } return subqueryBuilder.build(); @@ -86,19 +108,17 @@ public class IndexProtobufCodec { return ResultRankingParameters.sensibleDefaults(); return new ResultRankingParameters( - new Bm25Parameters(params.getFullK(), params.getFullB()), - new Bm25Parameters(params.getPrioK(), params.getPrioB()), + new Bm25Parameters(params.getBm25K(), params.getBm25B()), params.getShortDocumentThreshold(), params.getShortDocumentPenalty(), params.getDomainRankBonus(), params.getQualityPenalty(), params.getShortSentenceThreshold(), params.getShortSentencePenalty(), - params.getBm25FullWeight(), - params.getBm25NgramWeight(), - params.getBm25PrioWeight(), - params.getTcfJaccardWeight(), - params.getTcfOverlapWeight(), + params.getBm25Weight(), + params.getTcfFirstPositionWeight(), + params.getTcfVerbatimWeight(), + params.getTcfProximityWeight(), ResultRankingParameters.TemporalBias.valueOf(params.getTemporalBias().getBias().name()), params.getTemporalBiasWeight(), params.getExportDebugData() @@ -113,21 +133,18 @@ public class IndexProtobufCodec { } var builder = RpcResultRankingParameters.newBuilder() - .setFullB(rankingParams.fullParams.b()) - .setFullK(rankingParams.fullParams.k()) - .setPrioB(rankingParams.prioParams.b()) - .setPrioK(rankingParams.prioParams.k()) + .setBm25B(rankingParams.bm25Params.b()) + .setBm25K(rankingParams.bm25Params.k()) .setShortDocumentThreshold(rankingParams.shortDocumentThreshold) .setShortDocumentPenalty(rankingParams.shortDocumentPenalty) .setDomainRankBonus(rankingParams.domainRankBonus) .setQualityPenalty(rankingParams.qualityPenalty) .setShortSentenceThreshold(rankingParams.shortSentenceThreshold) .setShortSentencePenalty(rankingParams.shortSentencePenalty) - .setBm25FullWeight(rankingParams.bm25FullWeight) - .setBm25NgramWeight(rankingParams.bm25NgramWeight) - .setBm25PrioWeight(rankingParams.bm25PrioWeight) - .setTcfOverlapWeight(rankingParams.tcfOverlapWeight) - .setTcfJaccardWeight(rankingParams.tcfJaccardWeight) + .setBm25Weight(rankingParams.bm25Weight) + .setTcfFirstPositionWeight(rankingParams.tcfFirstPosition) + .setTcfProximityWeight(rankingParams.tcfProximity) + .setTcfVerbatimWeight(rankingParams.tcfVerbatim) .setTemporalBiasWeight(rankingParams.temporalBiasWeight) .setExportDebugData(rankingParams.exportDebugData); @@ -142,45 +159,4 @@ public class IndexProtobufCodec { return builder.build(); } - - public static RpcResultRankingDetails convertRankingDetails(ResultRankingDetails rankingDetails) { - if (rankingDetails == null) { - return null; - } - - return RpcResultRankingDetails.newBuilder() - .setInputs(convertRankingInputs(rankingDetails.inputs())) - .setOutput(convertRankingOutput(rankingDetails.outputs())) - .build(); - } - - private static RpcResultRankingOutputs convertRankingOutput(ResultRankingOutputs outputs) { - return RpcResultRankingOutputs.newBuilder() - .setAverageSentenceLengthPenalty(outputs.averageSentenceLengthPenalty()) - .setQualityPenalty(outputs.qualityPenalty()) - .setRankingBonus(outputs.rankingBonus()) - .setTopologyBonus(outputs.topologyBonus()) - .setDocumentLengthPenalty(outputs.documentLengthPenalty()) - .setTemporalBias(outputs.temporalBias()) - .setFlagsPenalty(outputs.flagsPenalty()) - 
.setOverallPart(outputs.overallPart()) - .setTcfOverlap(outputs.tcfOverlap()) - .setTcfJaccard(outputs.tcfJaccard()) - .setBM25F(outputs.bM25F()) - .setBM25N(outputs.bM25N()) - .setBM25P(outputs.bM25P()) - .build(); - } - - private static RpcResultRankingInputs convertRankingInputs(ResultRankingInputs inputs) { - return RpcResultRankingInputs.newBuilder() - .setRank(inputs.rank()) - .setAsl(inputs.asl()) - .setQuality(inputs.quality()) - .setSize(inputs.size()) - .setTopology(inputs.topology()) - .setYear(inputs.year()) - .addAllFlags(inputs.flags()) - .build(); - } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java index 58a20a8a..e6e62dc3 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/QueryProtobufCodec.java @@ -1,21 +1,25 @@ package nu.marginalia.api.searchquery; import lombok.SneakyThrows; +import nu.marginalia.api.searchquery.model.query.ProcessedQuery; +import nu.marginalia.api.searchquery.model.query.QueryParams; +import nu.marginalia.api.searchquery.model.query.QueryResponse; import nu.marginalia.api.searchquery.model.query.SearchSpecification; import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.SearchResultItem; import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; +import nu.marginalia.api.searchquery.model.results.debug.DebugFactor; +import nu.marginalia.api.searchquery.model.results.debug.DebugFactorGroup; +import nu.marginalia.api.searchquery.model.results.debug.DebugTermFactorGroup; import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails; -import nu.marginalia.api.searchquery.model.results.debug.ResultRankingInputs; -import nu.marginalia.api.searchquery.model.results.debug.ResultRankingOutputs; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.api.searchquery.model.query.ProcessedQuery; -import nu.marginalia.api.searchquery.model.query.QueryParams; -import nu.marginalia.api.searchquery.model.query.QueryResponse; import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; public class QueryProtobufCodec { @@ -130,6 +134,7 @@ public class QueryProtobufCodec { results.getWordsTotal(), results.getBestPositions(), results.getRankingScore(), + results.getResultsFromDomain(), convertRankingDetails(results.getRankingDetails()) ); } @@ -137,46 +142,109 @@ public class QueryProtobufCodec { private static ResultRankingDetails convertRankingDetails(RpcResultRankingDetails rankingDetails) { if (rankingDetails == null) return null; - var inputs = rankingDetails.getInputs(); - var outputs = rankingDetails.getOutput(); + + var docData = rankingDetails.getDocumentOutputs(); + var termData = rankingDetails.getTermOutputs(); return new ResultRankingDetails( - convertRankingInputs(inputs), - convertRankingOutputs(outputs) + convertDocumentOutputs(docData), + convertTermData(termData) ); } - private static ResultRankingOutputs convertRankingOutputs(RpcResultRankingOutputs outputs) { - return new ResultRankingOutputs( - outputs.getAverageSentenceLengthPenalty(), - outputs.getQualityPenalty(), - outputs.getRankingBonus(), - 
outputs.getTopologyBonus(), - outputs.getDocumentLengthPenalty(), - outputs.getTemporalBias(), - outputs.getFlagsPenalty(), - outputs.getOverallPart(), - outputs.getTcfOverlap(), - outputs.getTcfJaccard(), - outputs.getBM25F(), - outputs.getBM25N(), - outputs.getBM25P() - ); + private static List convertTermData(RpcResultTermRankingOutputs termData) { + Map termIdByName = new HashMap<>(); + Map> factorsByTerm = new HashMap<>(); + + for (int i = 0; i < termData.getTermCount(); i++) { + termIdByName.put(termData.getTerm(i), termData.getTermId(i)); + factorsByTerm.computeIfAbsent(termData.getTerm(i), k -> new ArrayList<>()) + .add(new DebugFactor(termData.getFactor(i), termData.getValue(i))); + } + + Map> factorGroupsByTerm = new HashMap<>(); + for (var entry : factorsByTerm.entrySet()) { + String term = entry.getKey(); + var factorsList = entry.getValue(); + + Map> factorsByGroup = new HashMap<>(); + + for (var factor : factorsList) { + String[] parts = factor.factor().split("\\."); + + String group, name; + + if (parts.length != 2) { + group = "unknown"; + name = parts[0]; + } else { + group = parts[0]; + name = parts[1]; + } + + + factorsByGroup.computeIfAbsent(group, k -> new ArrayList<>()) + .add(new DebugFactor(name, factor.value())); + } + + factorsByGroup.forEach((groupName, groupData) -> { + factorGroupsByTerm.computeIfAbsent(term, k -> new ArrayList<>()) + .add(new DebugFactorGroup(groupName, groupData)); + }); + + } + + List groups = new ArrayList<>(); + + for (var entry : factorGroupsByTerm.entrySet()) { + groups.add(new DebugTermFactorGroup(entry.getKey(), termIdByName.get(entry.getKey()), entry.getValue())); + } + + return groups; } - private static ResultRankingInputs convertRankingInputs(RpcResultRankingInputs inputs) { - return new ResultRankingInputs( - inputs.getRank(), - inputs.getAsl(), - inputs.getQuality(), - inputs.getSize(), - inputs.getTopology(), - inputs.getYear(), - inputs.getFlagsList() - ); + private static List convertDocumentOutputs(RpcResultDocumentRankingOutputs docData) { + + List unclusteredFactors = new ArrayList<>(); + for (int i = 0; i < docData.getFactorCount(); i++) { + String factor = docData.getFactor(i); + String value = docData.getValue(i); + unclusteredFactors.add(new DebugFactor(factor, value)); + } + + Map> factorsByGroup = new HashMap<>(); + + for (var factor : unclusteredFactors) { + String factorName = factor.factor(); + String value = factor.value(); + + String[] parts = factorName.split("\\."); + + String group, name; + + if (parts.length != 2) { + group = "unknown"; + name = factorName; + } + else { + group = parts[0]; + name = parts[1]; + } + + factorsByGroup.computeIfAbsent(group, k -> new ArrayList<>()) + .add(new DebugFactor(name, value)); + } + + List groups = new ArrayList<>(); + for (var entry : factorsByGroup.entrySet()) { + groups.add(new DebugFactorGroup(entry.getKey(), entry.getValue())); + } + + return groups; } + private static SearchResultItem convertRawResult(RpcRawResultItem rawItem) { var keywordScores = new ArrayList(rawItem.getKeywordScoresCount()); @@ -188,8 +256,9 @@ public class QueryProtobufCodec { rawItem.getEncodedDocMetadata(), rawItem.getHtmlFeatures(), keywordScores, - rawItem.getResultsFromDomain(), rawItem.getHasPriorityTerms(), + 0, // Not set + null, // Not set Double.NaN // Not set ); } @@ -198,7 +267,8 @@ public class QueryProtobufCodec { return new SearchResultKeywordScore( keywordScores.getKeyword(), -1, // termId is internal to index service - keywordScores.getEncodedWordMetadata() + (byte) 
keywordScores.getFlags(), + keywordScores.getPositions() ); } @@ -257,6 +327,7 @@ public class QueryProtobufCodec { rpcDecoratedResultItem.getWordsTotal(), rpcDecoratedResultItem.getBestPositions(), rpcDecoratedResultItem.getRankingScore(), + rpcDecoratedResultItem.getResultsFromDomain(), convertRankingDetails(rpcDecoratedResultItem.getRankingDetails()) ); } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java index 356a1d86..775d63fb 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQuery.java @@ -3,7 +3,9 @@ package nu.marginalia.api.searchquery.model.compiled; import org.jetbrains.annotations.NotNull; import java.util.Iterator; -import java.util.function.*; +import java.util.function.Function; +import java.util.function.ToIntFunction; +import java.util.function.ToLongFunction; import java.util.stream.IntStream; import java.util.stream.Stream; @@ -46,8 +48,8 @@ public class CompiledQuery implements Iterable { return new CompiledQueryLong(root, data.mapToLong(mapper)); } - public CompiledQueryLong mapToInt(ToIntFunction mapper) { - return new CompiledQueryLong(root, data.mapToInt(mapper)); + public CompiledQueryInt mapToInt(ToIntFunction mapper) { + return new CompiledQueryInt(root, data.mapToInt(mapper)); } public CqExpression root() { diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryInt.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryInt.java index 9e26c35c..0f80d479 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryInt.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryInt.java @@ -5,8 +5,8 @@ import java.util.stream.IntStream; /** A compiled index service query */ public class CompiledQueryInt { - private final CqExpression root; - private final CqDataInt data; + public final CqExpression root; + public final CqDataInt data; public CompiledQueryInt(CqExpression root, CqDataInt data) { this.root = root; @@ -26,7 +26,7 @@ public class CompiledQueryInt { return IntStream.range(0, data.size()); } - public long at(int index) { + public int at(int index) { return data.get(index); } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParser.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParser.java index ae197fb9..ef379e5a 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParser.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParser.java @@ -61,7 +61,8 @@ public class CompiledQueryParser { String[] cqData = new String[wordIds.size()]; wordIds.forEach((w, i) -> cqData[i] = w); - return new CompiledQuery<>(root, new CqData<>(cqData)); + + return root.newQuery(cqData); } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java index 145f3f0f..63f7301b 100644 --- 
a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqData.java @@ -33,13 +33,13 @@ public class CqData { return new CqDataLong(newData); } - public CqDataLong mapToInt(ToIntFunction mapper) { - long[] newData = new long[data.length]; + public CqDataInt mapToInt(ToIntFunction mapper) { + int[] newData = new int[data.length]; for (int i = 0; i < data.length; i++) { - newData[i] = mapper.applyAsInt((T) data[i]); + newData[i] = mapper.applyAsInt(data[i]); } - return new CqDataLong(newData); + return new CqDataInt(newData); } public T get(int i) { diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqExpression.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqExpression.java index e9972526..3f0cca50 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqExpression.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/CqExpression.java @@ -8,6 +8,18 @@ import java.util.stream.Stream; * */ public sealed interface CqExpression { + /** Create a new query for the provided data using this expression as the root */ + default CompiledQuery newQuery(T[] data) { + return new CompiledQuery<>(this, data); + } + /** Create a new query for the provided data using this expression as the root */ + default CompiledQueryInt newQuery(int[] data) { + return new CompiledQueryInt(this, new CqDataInt(data)); + } + /** Create a new query for the provided data using this expression as the root */ + default CompiledQueryLong newQuery(long[] data) { + return new CompiledQueryLong(this, new CqDataLong(data)); + } Stream stream(); diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java index 7e8ca8ec..7dd48394 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java @@ -2,6 +2,7 @@ package nu.marginalia.api.searchquery.model.compiled.aggregate; import it.unimi.dsi.fastutil.longs.LongSet; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt; import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import java.util.ArrayList; @@ -36,7 +37,10 @@ public class CompiledQueryAggregates { public static int intMaxMinAggregate(CompiledQuery query, ToIntFunction operator) { return query.root.visit(new CqIntMaxMinOperator(query, operator)); } - + /** Apply the operator to each leaf node, then return the highest minimum value found along any path */ + public static int intMaxMinAggregate(CompiledQueryInt query, IntUnaryOperator operator) { + return query.root.visit(new CqIntMaxMinOperator(query, operator)); + } /** Apply the operator to each leaf node, then return the highest minimum value found along any path */ public static int intMaxMinAggregate(CompiledQueryLong query, LongToIntFunction operator) { return query.root.visit(new CqIntMaxMinOperator(query, operator)); @@ -55,13 +59,4 @@ public class CompiledQueryAggregates { return new 
ArrayList<>(query.root().visit(new CqQueryPathsOperator(query))); } - /** Using the bitwise AND operator, aggregate all possible combined values of the long generated by the provided operator */ - public static LongSet positionsAggregate(CompiledQuery query, ToLongFunction operator) { - return query.root().visit(new CqPositionsOperator(query, operator)); - } - - /** Using the bitwise AND operator, aggregate all possible combined values of the long generated by the provided operator */ - public static LongSet positionsAggregate(CompiledQueryLong query, LongUnaryOperator operator) { - return query.root().visit(new CqPositionsOperator(query, operator)); - } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java index 621dff73..c9712ed4 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqIntMaxMinOperator.java @@ -1,6 +1,7 @@ package nu.marginalia.api.searchquery.model.compiled.aggregate; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt; import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CqExpression; @@ -21,7 +22,9 @@ public class CqIntMaxMinOperator implements CqExpression.IntVisitor { public CqIntMaxMinOperator(CompiledQueryLong query, LongToIntFunction operator) { this.operator = idx -> operator.applyAsInt(query.at(idx)); } - + public CqIntMaxMinOperator(CompiledQueryInt query, IntUnaryOperator operator) { + this.operator = idx -> operator.applyAsInt(query.at(idx)); + } @Override public int onAnd(List parts) { int value = parts.getFirst().visit(this); diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java deleted file mode 100644 index 715c4cb2..00000000 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java +++ /dev/null @@ -1,85 +0,0 @@ -package nu.marginalia.api.searchquery.model.compiled.aggregate; - -import it.unimi.dsi.fastutil.longs.LongArraySet; -import it.unimi.dsi.fastutil.longs.LongOpenHashSet; -import it.unimi.dsi.fastutil.longs.LongSet; -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; -import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; -import nu.marginalia.api.searchquery.model.compiled.CqExpression; - -import java.util.List; -import java.util.function.IntToLongFunction; -import java.util.function.LongUnaryOperator; -import java.util.function.ToLongFunction; - -public class CqPositionsOperator implements CqExpression.ObjectVisitor { - private final IntToLongFunction operator; - - public CqPositionsOperator(CompiledQuery query, ToLongFunction operator) { - this.operator = idx -> operator.applyAsLong(query.at(idx)); - } - - public CqPositionsOperator(CompiledQueryLong query, LongUnaryOperator operator) { - this.operator = idx -> operator.applyAsLong(query.at(idx)); - } - - @Override - public LongSet onAnd(List parts) { - LongSet ret = new LongArraySet(); - - 
for (var part : parts) { - ret = comineSets(ret, part.visit(this)); - } - - return ret; - } - - private LongSet comineSets(LongSet a, LongSet b) { - if (a.isEmpty()) - return b; - if (b.isEmpty()) - return a; - - LongSet ret = newSet(a.size() * b.size()); - - var ai = a.longIterator(); - - while (ai.hasNext()) { - long aval = ai.nextLong(); - - var bi = b.longIterator(); - while (bi.hasNext()) { - ret.add(aval & bi.nextLong()); - } - } - - return ret; - } - - @Override - public LongSet onOr(List parts) { - LongSet ret = newSet(parts.size()); - - for (var part : parts) { - ret.addAll(part.visit(this)); - } - - return ret; - } - - @Override - public LongSet onLeaf(int idx) { - var set = newSet(1); - set.add(operator.applyAsLong(idx)); - return set; - } - - /** Allocate a new set suitable for a collection with the provided cardinality */ - private LongSet newSet(int cardinality) { - if (cardinality < 8) - return new LongArraySet(cardinality); - else - return new LongOpenHashSet(cardinality); - } - -} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/QueryResponse.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/QueryResponse.java index 1834c08f..217fe6cf 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/QueryResponse.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/QueryResponse.java @@ -2,6 +2,7 @@ package nu.marginalia.api.searchquery.model.query; import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; +import javax.annotation.Nullable; import java.util.HashSet; import java.util.List; import java.util.Set; @@ -10,7 +11,7 @@ public record QueryResponse(SearchSpecification specs, List results, List searchTermsHuman, List problems, - String domain) + @Nullable String domain) { public Set getAllKeywords() { return new HashSet<>(specs.query.searchTermsInclude); diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchPhraseConstraint.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchPhraseConstraint.java new file mode 100644 index 00000000..3a33c7e6 --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchPhraseConstraint.java @@ -0,0 +1,85 @@ +package nu.marginalia.api.searchquery.model.query; + +import nu.marginalia.language.WordPatterns; + +import java.util.ArrayList; +import java.util.List; + +public sealed interface SearchPhraseConstraint { + + record Mandatory(List terms) implements SearchPhraseConstraint { + public Mandatory(String... terms) { + this(List.of(terms)); + } + } + + record Optional(List terms) implements SearchPhraseConstraint { + public Optional(String... terms) { + this(List.of(terms)); + } + } + + record Full(List terms) implements SearchPhraseConstraint { + public Full(String... terms) { + this(List.of(terms)); + } + } + + List terms(); + default int size() { + return terms().size(); + } + + static SearchPhraseConstraint mandatory(String... terms) { + return new Mandatory(trimStopWords(terms)); + } + static SearchPhraseConstraint mandatory(List terms) { + return new Mandatory(trimStopWords(terms)); + } + static SearchPhraseConstraint optional(String... terms) { + return new Optional(trimStopWords(terms)); + } + static SearchPhraseConstraint optional(List terms) { + return new Optional(trimStopWords(terms)); + } + static SearchPhraseConstraint full(String... 
terms) { + return new Full(trimStopWords(terms)); + } + static SearchPhraseConstraint full(List terms) { + return new Full(trimStopWords(terms)); + } + + + private static List trimStopWords(List terms) { + List ret = new ArrayList<>(terms.size()); + for (var term : terms) { + if (WordPatterns.isStopWord(term)) { + ret.add(""); + } else { + ret.add(term); + } + } + return List.copyOf(ret); + } + + private static List trimStopWords(String... terms) { + List ret = new ArrayList<>(terms.length); + for (var term : terms) { + if (WordPatterns.isStopWord(term)) { + ret.add(""); + } else { + ret.add(term); + } + } + + while (!ret.isEmpty() && "".equals(ret.getFirst())) { + ret.removeFirst(); + } + while (!ret.isEmpty() && "".equals(ret.getLast())) { + ret.removeLast(); + } + + return List.copyOf(ret); + } + +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java index ffe02868..da7a58ed 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchQuery.java @@ -31,18 +31,22 @@ public class SearchQuery { public final List searchTermsPriority; /** Terms that we require to be in the same sentence */ - public final List> searchTermCoherences; + public final List phraseConstraints; @Deprecated // why does this exist? private double value = 0; + public static SearchQueryBuilder builder() { + return new SearchQueryBuilder(); + } + public SearchQuery() { this.compiledQuery = ""; this.searchTermsInclude = new ArrayList<>(); this.searchTermsExclude = new ArrayList<>(); this.searchTermsAdvice = new ArrayList<>(); this.searchTermsPriority = new ArrayList<>(); - this.searchTermCoherences = new ArrayList<>(); + this.phraseConstraints = new ArrayList<>(); } public SearchQuery(String compiledQuery, @@ -50,13 +54,13 @@ public class SearchQuery { List searchTermsExclude, List searchTermsAdvice, List searchTermsPriority, - List> searchTermCoherences) { + List phraseConstraints) { this.compiledQuery = compiledQuery; this.searchTermsInclude = searchTermsInclude; this.searchTermsExclude = searchTermsExclude; this.searchTermsAdvice = searchTermsAdvice; this.searchTermsPriority = searchTermsPriority; - this.searchTermCoherences = searchTermCoherences; + this.phraseConstraints = phraseConstraints; } @Deprecated // why does this exist? 
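As a usage sketch of the new API (the query text and terms here are invented for illustration), constraints are attached one per phrase via the SearchQueryBuilder introduced in the hunk below:

```java
// Hypothetical construction of a SearchQuery with the new phrase constraints
var query = SearchQuery.builder()
        .compiledQuery("plato republic")
        .include("plato", "republic")
        .phraseConstraint(SearchPhraseConstraint.full("plato", "republic"))
        .phraseConstraint(SearchPhraseConstraint.optional("the", "republic"))
        .build();
```

Note the asymmetry in trimStopWords: stop words are blanked to "" rather than dropped, so the surviving terms keep their relative positions, and only the varargs overload additionally strips leading and trailing blanks.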
@@ -76,10 +80,62 @@ public class SearchQuery { if (!searchTermsExclude.isEmpty()) sb.append("exclude=").append(searchTermsExclude.stream().collect(Collectors.joining(",", "[", "] "))); if (!searchTermsAdvice.isEmpty()) sb.append("advice=").append(searchTermsAdvice.stream().collect(Collectors.joining(",", "[", "] "))); if (!searchTermsPriority.isEmpty()) sb.append("priority=").append(searchTermsPriority.stream().collect(Collectors.joining(",", "[", "] "))); - if (!searchTermCoherences.isEmpty()) sb.append("coherences=").append(searchTermCoherences.stream().map(coh->coh.stream().collect(Collectors.joining(",", "[", "] "))).collect(Collectors.joining(", "))); + if (!phraseConstraints.isEmpty()) sb.append("phraseConstraints=").append(phraseConstraints.stream().map(coh->coh.terms().stream().collect(Collectors.joining(",", "[", "] "))).collect(Collectors.joining(", "))); return sb.toString(); } + public static class SearchQueryBuilder { + private String compiledQuery; + public final List searchTermsInclude = new ArrayList<>(); + public final List searchTermsExclude = new ArrayList<>(); + public final List searchTermsAdvice = new ArrayList<>(); + public final List searchTermsPriority = new ArrayList<>(); + public final List searchPhraseConstraints = new ArrayList<>(); + private SearchQueryBuilder() { + } + + public SearchQueryBuilder compiledQuery(String query) { + this.compiledQuery = query; + return this; + } + + public SearchQueryBuilder include(String... terms) { + searchTermsInclude.addAll(List.of(terms)); + return this; + } + + public SearchQueryBuilder exclude(String... terms) { + searchTermsExclude.addAll(List.of(terms)); + return this; + } + + public SearchQueryBuilder advice(String... terms) { + searchTermsAdvice.addAll(List.of(terms)); + return this; + } + + public SearchQueryBuilder priority(String... 
terms) { + searchTermsPriority.addAll(List.of(terms)); + return this; + } + + public SearchQueryBuilder phraseConstraint(SearchPhraseConstraint constraint) { + searchPhraseConstraints.add(constraint); + return this; + } + + public SearchQuery build() { + return new SearchQuery(compiledQuery, searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchPhraseConstraints); + } + + /** If there are no ranking terms, promote the advice terms to ranking terms */ + public void promoteNonRankingTerms() { + if (searchTermsInclude.isEmpty() && !searchTermsAdvice.isEmpty()) { + searchTermsInclude.addAll(searchTermsAdvice); + searchTermsAdvice.clear(); + } + } + } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSpecification.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSpecification.java index bbb5b7ae..78afdd1f 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSpecification.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/query/SearchSpecification.java @@ -19,10 +19,14 @@ public class SearchSpecification { public final String humanQuery; - public final SpecificationLimit quality; - public final SpecificationLimit year; - public final SpecificationLimit size; - public final SpecificationLimit rank; + @Builder.Default + public final SpecificationLimit quality = SpecificationLimit.none(); + @Builder.Default + public final SpecificationLimit year = SpecificationLimit.none(); + @Builder.Default + public final SpecificationLimit size = SpecificationLimit.none(); + @Builder.Default + public final SpecificationLimit rank = SpecificationLimit.none(); public final QueryLimits queryLimits; diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java index 0522e7bc..8a9b690b 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/DecoratedSearchResultItem.java @@ -34,6 +34,8 @@ public class DecoratedSearchResultItem implements Comparable keywordScores() { return rawIndexResult.getKeywordScores(); @@ -72,6 +71,7 @@ public class DecoratedSearchResultItem implements Comparable { /** How did the subqueries match against the document ? 
*/ public final List keywordScores; - /** How many other potential results existed in the same domain */ - public int resultsFromDomain; - public boolean hasPrioTerm; + public long bestPositions; + + public DebugRankingFactors debugRankingFactors; + public SearchResultItem(long combinedId, long encodedDocMetadata, int htmlFeatures, - boolean hasPrioTerm) { + double score, + long bestPositions) { this.combinedId = combinedId; this.encodedDocMetadata = encodedDocMetadata; + this.bestPositions = bestPositions; this.keywordScores = new ArrayList<>(); this.htmlFeatures = htmlFeatures; - this.hasPrioTerm = hasPrioTerm; + this.scoreValue = score; } @@ -84,7 +88,6 @@ public class SearchResultItem implements Comparable { @Override public int compareTo(@NotNull SearchResultItem o) { - // this looks like a bug, but we actually want this in a reversed order int diff = Double.compare(getScore(), o.getScore()); if (diff != 0) return diff; diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java index 212b2302..b04d65df 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultKeywordScore.java @@ -1,40 +1,32 @@ package nu.marginalia.api.searchquery.model.results; import nu.marginalia.model.idx.WordFlags; -import nu.marginalia.model.idx.WordMetadata; import java.util.Objects; public final class SearchResultKeywordScore { public final long termId; public final String keyword; - private final long encodedWordMetadata; + public byte flags; + public int positionCount; public SearchResultKeywordScore(String keyword, long termId, - long encodedWordMetadata) { + byte flags, + int positionCount) { this.termId = termId; this.keyword = keyword; - this.encodedWordMetadata = encodedWordMetadata; } public boolean hasTermFlag(WordFlags flag) { - return WordMetadata.hasFlags(encodedWordMetadata, flag.asBit()); + return (flags & flag.asBit()) != 0; } - public long positions() { - return WordMetadata.decodePositions(encodedWordMetadata); - } - public boolean isKeywordSpecial() { return keyword.contains(":") || hasTermFlag(WordFlags.Synthetic); } - public long encodedWordMetadata() { - return encodedWordMetadata; - } - @Override public boolean equals(Object obj) { if (obj == this) return true; @@ -51,8 +43,7 @@ public final class SearchResultKeywordScore { @Override public String toString() { return "SearchResultKeywordScore[" + - "keyword=" + keyword + ", " + - "encodedWordMetadata=" + new WordMetadata(encodedWordMetadata) + ']'; + "keyword=" + keyword + ']'; } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultSet.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultSet.java deleted file mode 100644 index 09468162..00000000 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/SearchResultSet.java +++ /dev/null @@ -1,22 +0,0 @@ -package nu.marginalia.api.searchquery.model.results; - -import lombok.AllArgsConstructor; -import lombok.Getter; -import lombok.ToString; - -import java.util.ArrayList; -import java.util.Comparator; -import java.util.List; - -@AllArgsConstructor @Getter @ToString -public class SearchResultSet { - public SearchResultSet() { 
- results = new ArrayList<>(); - } - - public List<DecoratedSearchResultItem> results; - public int size() { - return results.size(); - } - -} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugFactor.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugFactor.java new file mode 100644 index 00000000..9eb2f6c6 --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugFactor.java @@ -0,0 +1,4 @@ +package nu.marginalia.api.searchquery.model.results.debug; + +public record DebugFactor(String factor, String value) { +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugFactorGroup.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugFactorGroup.java new file mode 100644 index 00000000..245cdb8c --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugFactorGroup.java @@ -0,0 +1,5 @@ +package nu.marginalia.api.searchquery.model.results.debug; + +import java.util.List; + +public record DebugFactorGroup(String name, List<DebugFactor> factors) {} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugRankingFactors.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugRankingFactors.java new file mode 100644 index 00000000..fecab104 --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugRankingFactors.java @@ -0,0 +1,39 @@ +package nu.marginalia.api.searchquery.model.results.debug; + +import it.unimi.dsi.fastutil.ints.IntIterator; + +import java.util.ArrayList; +import java.util.List; +import java.util.StringJoiner; + +/** Utility for capturing debug information about ranking factors */ +public class DebugRankingFactors { + private final List<DebugFactor> documentFactors = new ArrayList<>(); + private final List<DebugTermFactor> termFactors = new ArrayList<>(); + + public DebugRankingFactors() {} + + public void addDocumentFactor(String factor, String value) { + documentFactors.add(new DebugFactor(factor, value)); + } + + public void addTermFactor(long termId, String factor, String value) { + termFactors.add(new DebugTermFactor(termId, null, factor, value)); + } + public void addTermFactor(long termId, String factor, IntIterator sequenceIter) { + if (!sequenceIter.hasNext()) return; + + StringJoiner joiner = new StringJoiner(","); + while (sequenceIter.hasNext()) { + joiner.add(String.valueOf(sequenceIter.nextInt())); + } + termFactors.add(new DebugTermFactor(termId, null, factor, joiner.toString())); + } + + public List<DebugFactor> getDocumentFactors() { + return documentFactors; + } + public List<DebugTermFactor> getTermFactors() { + return termFactors; + } +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugTermFactor.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugTermFactor.java new file mode 100644 index 00000000..84b944f3 --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugTermFactor.java @@ -0,0 +1,4 @@ +package nu.marginalia.api.searchquery.model.results.debug; + +public record DebugTermFactor(long termId, String term, String factor, String value) { +} diff --git
a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugTermFactorGroup.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugTermFactorGroup.java new file mode 100644 index 00000000..303b7eec --- /dev/null +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/DebugTermFactorGroup.java @@ -0,0 +1,6 @@ +package nu.marginalia.api.searchquery.model.results.debug; + +import java.util.List; + +public record DebugTermFactorGroup(String term, long termId, List<DebugTermFactor> factorList) { +} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingDetails.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingDetails.java index c94200e2..e4bca962 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingDetails.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingDetails.java @@ -1,6 +1,9 @@ package nu.marginalia.api.searchquery.model.results.debug; -public record ResultRankingDetails(ResultRankingInputs inputs, ResultRankingOutputs outputs) +import java.util.List; + +public record ResultRankingDetails(List<DebugFactorGroup> docFactorGroups, + List<DebugTermFactorGroup> termFactorGroups) { } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingInputs.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingInputs.java deleted file mode 100644 index 86169416..00000000 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingInputs.java +++ /dev/null @@ -1,5 +0,0 @@ -package nu.marginalia.api.searchquery.model.results.debug; - -import java.util.List; - -public record ResultRankingInputs(int rank, int asl, int quality, int size, int topology, int year, List<String> flags) {} diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingOutputs.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingOutputs.java deleted file mode 100644 index bd4b943d..00000000 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/results/debug/ResultRankingOutputs.java +++ /dev/null @@ -1,17 +0,0 @@ -package nu.marginalia.api.searchquery.model.results.debug; - -public record ResultRankingOutputs(double averageSentenceLengthPenalty, - double qualityPenalty, - double rankingBonus, - double topologyBonus, - double documentLengthPenalty, - double temporalBias, - double flagsPenalty, - double overallPart, - double tcfOverlap, - double tcfJaccard, - double bM25F, - double bM25N, - double bM25P) -{ -} diff --git a/code/functions/search-query/api/src/main/protobuf/query-api.proto b/code/functions/search-query/api/src/main/protobuf/query-api.proto index eb4e48ba..1504d46f 100644 --- a/code/functions/search-query/api/src/main/protobuf/query-api.proto +++ b/code/functions/search-query/api/src/main/protobuf/query-api.proto @@ -93,22 +93,30 @@ message RpcDecoratedResultItem { double rankingScore = 11; // The ranking score of this search result item, lower is better int64 bestPositions = 12; RpcResultRankingDetails rankingDetails = 13; // optional, only present if exportDebugData is true in RpcResultRankingParameters + int32 resultsFromDomain = 14; } /** A raw index-service
view of a search result */ message RpcRawResultItem { int64 combinedId = 1; // raw ID with bit-encoded ranking information still present - int32 resultsFromDomain = 2; // number of other results from the same domain int64 encodedDocMetadata = 3; // bit encoded document metadata int32 htmlFeatures = 4; // bitmask encoding features of the document repeated RpcResultKeywordScore keywordScores = 5; bool hasPriorityTerms = 6; // true if this word is important to the document + MATCH_TYPE matchType = 7; // the type of match this result represents + + enum MATCH_TYPE { + FLAGS = 0; + PROXIMITY = 1; + PHRASE = 2; + }; } /* Information about how well a keyword matches a query */ message RpcResultKeywordScore { string keyword = 1; // the keyword - int64 encodedWordMetadata = 2; // bit encoded word metadata + int32 flags = 2; + int32 positions = 3; } /* Query execution parameters */ @@ -119,30 +127,32 @@ message RpcQueryLimits { int32 fetchSize = 4; // Size of the fetch buffer in the index service } +/** Parameters for the result ranking function */ message RpcResultRankingParameters { - double fullK = 1; // BM25 parameter - double fullB = 2; // BM25 parameter - double prioK = 3; // BM25 parameter - double prioB = 4; // BM25 parameter + double bm25K = 1; // BM25 parameter + double bm25B = 2; // BM25 parameter + int32 shortDocumentThreshold = 5; double shortDocumentPenalty = 6; double domainRankBonus = 7; double qualityPenalty = 8; int32 shortSentenceThreshold = 9; double shortSentencePenalty = 10; - double bm25FullWeight = 11; - double bm25NgramWeight = 12; - double bm25PrioWeight = 13; - double tcfOverlapWeight = 14; - double tcfJaccardWeight = 15; + double bm25Weight = 11; + // -- 12 unused -- + double tcfFirstPositionWeight = 13; + double tcfVerbatimWeight = 14; + double tcfProximityWeight = 15; RpcTemporalBias temporalBias = 16; double temporalBiasWeight = 17; + bool exportDebugData = 18; + } message RpcResultRankingDetails { - RpcResultRankingInputs inputs = 1; - RpcResultRankingOutputs output = 2; + RpcResultDocumentRankingOutputs documentOutputs = 1; + RpcResultTermRankingOutputs termOutputs = 2; } message RpcResultRankingInputs { @@ -155,20 +165,17 @@ message RpcResultRankingInputs { repeated string flags = 7; } -message RpcResultRankingOutputs { - double averageSentenceLengthPenalty = 1; - double qualityPenalty = 2; - double rankingBonus = 3; - double topologyBonus = 4; - double documentLengthPenalty = 5; - double temporalBias = 6; - double flagsPenalty = 7; - double overallPart = 8; - double tcfOverlap = 9; - double tcfJaccard = 10; - double bM25F = 11; - double bM25N = 12; - double bM25P = 13; +/** Summary of the output of the ranking function */ +message RpcResultDocumentRankingOutputs { + repeated string factor = 1; + repeated string value = 2; +} + +message RpcResultTermRankingOutputs { + repeated int64 termId = 1; + repeated string term = 2; + repeated string factor = 3; + repeated string value = 4; } /* Defines a single subquery */ @@ -177,11 +184,18 @@ message RpcQuery { repeated string exclude = 2; // These terms must be absent repeated string advice = 3; // These terms must be present, but do not affect ranking repeated string priority = 4; // These terms are not mandatory, but affect ranking positively if they are present - repeated RpcCoherences coherences = 5; // Groups of terms that must exist in proximity of each other + repeated RpcPhrases phrases = 5; // Groups of terms that must exist in proximity of each other string compiledQuery = 6; // Compiled query in infix notation } 
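The two ranking-outputs messages above trade the old fixed-field record for parallel factor/value arrays, so new ranking signals can be added without another schema change; the cost is that clients must zip the arrays back into pairs. A minimal sketch of that reassembly, assuming the standard protoc-generated Java accessors for repeated fields and reusing the DebugFactor record introduced earlier in this changeset:

    import java.util.ArrayList;
    import java.util.List;

    class RankingDetailsDecoder {
        // Rebuild (factor, value) pairs from the parallel repeated fields of
        // RpcResultDocumentRankingOutputs; getFactorCount()/getFactor(i)/getValue(i)
        // are the accessors protoc generates for a repeated string field.
        static List<DebugFactor> decodeDocumentOutputs(RpcResultDocumentRankingOutputs outputs) {
            List<DebugFactor> factors = new ArrayList<>(outputs.getFactorCount());
            for (int i = 0; i < outputs.getFactorCount(); i++) {
                factors.add(new DebugFactor(outputs.getFactor(i), outputs.getValue(i)));
            }
            return factors;
        }
    }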
-/* Defines a group of search terms that must exist in close proximity within the document */ -message RpcCoherences { - repeated string coherences = 1; +/* Defines a group of search terms that must exist in the specified order within the document */ +message RpcPhrases { + repeated string terms = 1; + TYPE type = 2; + + enum TYPE { + OPTIONAL = 0; + MANDATORY = 1; + FULL = 2; + }; } diff --git a/code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParserTest.java b/code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParserTest.java index 47983820..e7b1ce5d 100644 --- a/code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParserTest.java +++ b/code/functions/search-query/api/test/nu/marginalia/api/searchquery/model/compiled/CompiledQueryParserTest.java @@ -1,10 +1,11 @@ package nu.marginalia.api.searchquery.model.compiled; +import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; import org.junit.jupiter.api.Test; import java.util.List; -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertEquals; class CompiledQueryParserTest { @@ -22,6 +23,21 @@ class CompiledQueryParserTest { assertEquals(w(q, "foo"), q.root); } + @Test + public void testCohen() { + CompiledQuery<String> q = CompiledQueryParser.parse("( tube brief of elaboration | brief_elaboration_of_a_tube )"); + int val = CompiledQueryAggregates.intMaxMinAggregate(q, s -> + switch (s) { + case "brief" -> 3; + case "tube" -> 2; + case "of" -> 1; + default -> 0; + }); + assertEquals(0, val); + + System.out.println(q.stream().toList()); + } + @Test public void testAndTwoWords() { CompiledQuery<String> q = CompiledQueryParser.parse("foo bar"); diff --git a/code/functions/search-query/api/test/nu/marginalia/index/client/IndexProtobufCodecTest.java b/code/functions/search-query/api/test/nu/marginalia/index/client/IndexProtobufCodecTest.java index e93f715c..b7b64590 100644 --- a/code/functions/search-query/api/test/nu/marginalia/index/client/IndexProtobufCodecTest.java +++ b/code/functions/search-query/api/test/nu/marginalia/index/client/IndexProtobufCodecTest.java @@ -1,6 +1,7 @@ package nu.marginalia.index.client; import nu.marginalia.api.searchquery.IndexProtobufCodec; +import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint; import nu.marginalia.api.searchquery.model.query.SearchQuery; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.index.query.limit.QueryLimits; @@ -10,7 +11,7 @@ import org.junit.jupiter.api.Test; import java.util.List; import java.util.function.Function; -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertEquals; class IndexProtobufCodecTest { @Test @@ -41,7 +42,9 @@ class IndexProtobufCodecTest { List.of("c", "d"), List.of("e", "f"), List.of("g", "h"), - List.of(List.of("i", "j"), List.of("k")) + List.of( + SearchPhraseConstraint.mandatory(List.of("i", "j")), + SearchPhraseConstraint.optional(List.of("k"))) ), s -> IndexProtobufCodec.convertRpcQuery(IndexProtobufCodec.convertRpcQuery(s)) ); diff --git a/code/functions/search-query/build.gradle b/code/functions/search-query/build.gradle index deddc7c9..a2d10a59 100644 --- a/code/functions/search-query/build.gradle +++ b/code/functions/search-query/build.gradle @@ -31,7 +31,7 @@ dependencies { implementation project(':code:libraries:language-processing')
implementation project(':code:libraries:term-frequency-dict') - implementation project(':code:features-convert:keyword-extraction') + implementation project(':code:processes:converting-process:ft-keyword-extraction') implementation libs.bundles.slf4j diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/QueryFactory.java similarity index 61% rename from code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java rename to code/functions/search-query/java/nu/marginalia/functions/searchquery/QueryFactory.java index 908eb2e2..12e98fba 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/svc/QueryFactory.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/QueryFactory.java @@ -1,18 +1,15 @@ -package nu.marginalia.functions.searchquery.svc; +package nu.marginalia.functions.searchquery; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.api.searchquery.model.query.SearchSpecification; -import nu.marginalia.api.searchquery.model.query.SearchQuery; +import nu.marginalia.api.searchquery.model.query.*; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.functions.searchquery.query_parser.QueryExpansion; +import nu.marginalia.functions.searchquery.query_parser.QueryParser; import nu.marginalia.functions.searchquery.query_parser.token.QueryToken; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.SpecificationLimit; import nu.marginalia.language.WordPatterns; -import nu.marginalia.api.searchquery.model.query.QueryParams; -import nu.marginalia.api.searchquery.model.query.ProcessedQuery; -import nu.marginalia.functions.searchquery.query_parser.QueryParser; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -56,11 +53,7 @@ public class QueryFactory { basicQuery.clear(); } - List<String> searchTermsExclude = new ArrayList<>(); - List<String> searchTermsInclude = new ArrayList<>(); - List<String> searchTermsAdvice = new ArrayList<>(); - List<String> searchTermsPriority = new ArrayList<>(); - List<List<String>> searchTermCoherences = new ArrayList<>(); + SearchQuery.SearchQueryBuilder queryBuilder = SearchQuery.builder(); SpecificationLimit qualityLimit = SpecificationLimit.none(); SpecificationLimit year = SpecificationLimit.none(); @@ -78,58 +71,50 @@ public class QueryFactory { String[] parts = StringUtils.split(str, '_'); - // Checking for stop words here is a bit of a stop-gap to fix the issue of stop words being - // required in the query (which is a problem because they are not indexed). How to do this - // in a clean way is a bit of an open problem that may not get resolved until query-parsing is - // improved.
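To make the new quoted-phrase handling concrete before the hunk continues: the branch added below drops the old stop-word guard entirely and always emits a mandatory phrase constraint alongside ordinary include terms. A hedged walk-through with a hypothetical quoted query "black lodge" (after quote handling, str arrives underscore-joined):

    // Hypothetical illustration of the added branch, for the quoted query "black lodge"
    String str = "black_lodge";                   // quote contents, underscore-joined
    String[] parts = StringUtils.split(str, '_'); // ["black", "lodge"]

    queryBuilder.phraseConstraint(SearchPhraseConstraint.mandatory(parts)); // terms must appear in this order
    queryBuilder.include(parts);  // each term must also be present individually, for ranking
    queryBuilder.priority(str);   // prefer documents where the "black_lodge" n-gram is indexed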
+ if (parts.length > 1) { + // Require that the terms appear in sequence + queryBuilder.phraseConstraint(SearchPhraseConstraint.mandatory(parts)); + + // Construct a regular query from the parts in the quoted string + queryBuilder.include(parts); - if (parts.length > 1 && !anyPartIsStopWord(parts)) { // Prefer that the actual n-gram is present - searchTermsAdvice.add(str); - - // Require that the terms appear in the same sentence - searchTermCoherences.add(Arrays.asList(parts)); - - // Require that each term exists in the document - // (needed for ranking) - searchTermsInclude.addAll(Arrays.asList(parts)); + queryBuilder.priority(str); } else { - searchTermsInclude.add(str); + // If the quoted word is a single word, we don't need to do more than include it in the search + queryBuilder.include(str); } } + case QueryToken.LiteralTerm(String str, String displayStr) -> { analyzeSearchTerm(problems, str, displayStr); searchTermsHuman.addAll(Arrays.asList(displayStr.split("\\s+"))); - searchTermsInclude.add(str); + queryBuilder.include(str); } - - case QueryToken.ExcludeTerm(String str, String displayStr) -> searchTermsExclude.add(str); - case QueryToken.PriorityTerm(String str, String displayStr) -> searchTermsPriority.add(str); + case QueryToken.ExcludeTerm(String str, String displayStr) -> queryBuilder.exclude(str); + case QueryToken.PriorityTerm(String str, String displayStr) -> queryBuilder.priority(str); case QueryToken.AdviceTerm(String str, String displayStr) -> { - searchTermsAdvice.add(str); + queryBuilder.advice(str); if (str.toLowerCase().startsWith("site:")) { domain = str.substring("site:".length()); } } - case QueryToken.YearTerm(String str) -> year = parseSpecificationLimit(str); - case QueryToken.SizeTerm(String str) -> size = parseSpecificationLimit(str); - case QueryToken.RankTerm(String str) -> rank = parseSpecificationLimit(str); - case QueryToken.QualityTerm(String str) -> qualityLimit = parseSpecificationLimit(str); + case QueryToken.YearTerm(SpecificationLimit limit, String displayStr) -> year = limit; + case QueryToken.SizeTerm(SpecificationLimit limit, String displayStr) -> size = limit; + case QueryToken.RankTerm(SpecificationLimit limit, String displayStr) -> rank = limit; + case QueryToken.QualityTerm(SpecificationLimit limit, String displayStr) -> qualityLimit = limit; case QueryToken.QsTerm(String str) -> queryStrategy = parseQueryStrategy(str); default -> {} } } - if (searchTermsInclude.isEmpty() && !searchTermsAdvice.isEmpty()) { - searchTermsInclude.addAll(searchTermsAdvice); - searchTermsAdvice.clear(); - } + queryBuilder.promoteNonRankingTerms(); List<Integer> domainIds = params.domainIds(); @@ -139,20 +124,21 @@ public class QueryFactory { limits = limits.forSingleDomain(); } - var expansion = queryExpansion.expandQuery(searchTermsInclude); - searchTermCoherences.addAll(expansion.extraCoherences()); + var expansion = queryExpansion.expandQuery(queryBuilder.searchTermsInclude); - var searchQuery = new SearchQuery( - expansion.compiledQuery(), - searchTermsInclude, - searchTermsExclude, - searchTermsAdvice, - searchTermsPriority, - searchTermCoherences - ); + // Query expansion may produce suggestions for phrase constraints, + // add these to the query + for (var coh : expansion.optionalPhraseConstraints()) { + queryBuilder.phraseConstraint(SearchPhraseConstraint.optional(coh)); + } + + // add a pseudo-constraint for the full query + queryBuilder.phraseConstraint(SearchPhraseConstraint.full(expansion.fullPhraseConstraint())); + +
queryBuilder.compiledQuery(expansion.compiledQuery()); var specsBuilder = SearchSpecification.builder() - .query(searchQuery) + .query(queryBuilder.build()) .humanQuery(query) .quality(qualityLimit) .year(year) @@ -183,20 +169,7 @@ public class QueryFactory { problems.add("Search term \"" + displayStr + "\" too long"); } } - private SpecificationLimit parseSpecificationLimit(String str) { - int startChar = str.charAt(0); - int val = Integer.parseInt(str.substring(1)); - if (startChar == '=') { - return SpecificationLimit.equals(val); - } else if (startChar == '<') { - return SpecificationLimit.lessThan(val); - } else if (startChar == '>') { - return SpecificationLimit.greaterThan(val); - } else { - return SpecificationLimit.none(); - } - } private QueryStrategy parseQueryStrategy(String str) { return switch (str.toUpperCase()) { @@ -211,14 +184,4 @@ public class QueryFactory { default -> QueryStrategy.AUTO; }; } - - - private boolean anyPartIsStopWord(String[] parts) { - for (String part : parts) { - if (WordPatterns.isStopWord(part)) { - return true; - } - } - return false; - } } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/QueryGRPCService.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/QueryGRPCService.java index 98f7fb6f..e4bac6e2 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/QueryGRPCService.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/QueryGRPCService.java @@ -1,19 +1,16 @@ package nu.marginalia.functions.searchquery; +import com.google.common.collect.Lists; import com.google.inject.Inject; import com.google.inject.Singleton; import io.grpc.stub.StreamObserver; import io.prometheus.client.Histogram; -import lombok.SneakyThrows; import nu.marginalia.api.searchquery.*; import nu.marginalia.api.searchquery.model.query.ProcessedQuery; import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; -import nu.marginalia.db.DomainBlacklist; import nu.marginalia.index.api.IndexClient; -import nu.marginalia.functions.searchquery.svc.QueryFactory; import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; -import nu.marginalia.model.id.UrlIdCodec; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -33,18 +30,18 @@ public class QueryGRPCService extends QueryApiGrpc.QueryApiImplBase { private final QueryFactory queryFactory; - private final DomainBlacklist blacklist; private final IndexClient indexClient; + @Inject public QueryGRPCService(QueryFactory queryFactory, - DomainBlacklist blacklist, IndexClient indexClient) { this.queryFactory = queryFactory; - this.blacklist = blacklist; this.indexClient = indexClient; } + /** GRPC endpoint that parses a query, delegates it to the index partitions, and then collects the results. 
+ */ public void query(RpcQsQuery request, StreamObserver<RpcQsResponse> responseObserver) { try { @@ -55,16 +52,20 @@ public class QueryGRPCService extends QueryApiGrpc.QueryApiImplBase { var params = QueryProtobufCodec.convertRequest(request); var query = queryFactory.createQuery(params, ResultRankingParameters.sensibleDefaults()); - RpcIndexQuery indexRequest = QueryProtobufCodec.convertQuery(request, query); - List<RpcDecoratedResultItem> bestItems = executeQueries(indexRequest, request.getQueryLimits().getResultsTotal()); + var indexRequest = QueryProtobufCodec.convertQuery(request, query); + // Execute the query on the index partitions + List<RpcDecoratedResultItem> bestItems = indexClient.executeQueries(indexRequest); + + // Convert results to response and send it back var responseBuilder = RpcQsResponse.newBuilder() .addAllResults(bestItems) .setSpecs(indexRequest) .addAllSearchTermsHuman(query.searchTermsHuman); - if (query.domain != null) responseBuilder.setDomain(query.domain); + if (query.domain != null) { + responseBuilder.setDomain(query.domain); + } responseObserver.onNext(responseBuilder.build()); responseObserver.onCompleted(); @@ -75,44 +76,19 @@ public class QueryGRPCService extends QueryApiGrpc.QueryApiImplBase { } } - private static final Comparator<RpcDecoratedResultItem> comparator = - Comparator.comparing(RpcDecoratedResultItem::getRankingScore); - - - private boolean isBlacklisted(RpcDecoratedResultItem item) { - return blacklist.isBlacklisted(UrlIdCodec.getDomainId(item.getRawItem().getCombinedId())); - } + public record DetailedDirectResult(ProcessedQuery processedQuery, + List<DecoratedSearchResultItem> result) {} + /** Local query execution, without GRPC. */ public DetailedDirectResult executeDirect( String originalQuery, QueryParams params, - ResultRankingParameters rankingParameters, - int count) { + ResultRankingParameters rankingParameters) { var query = queryFactory.createQuery(params, rankingParameters); + var items = indexClient.executeQueries(QueryProtobufCodec.convertQuery(originalQuery, query)); - var items = executeQueries( - QueryProtobufCodec.convertQuery(originalQuery, query), - count) - .stream().map(QueryProtobufCodec::convertQueryResult) - .toList(); - - return new DetailedDirectResult(query, items); - } - - public record DetailedDirectResult(ProcessedQuery processedQuery, - List<DecoratedSearchResultItem> result) {} - - @SneakyThrows - List<RpcDecoratedResultItem> executeQueries(RpcIndexQuery indexRequest, int totalSize) { - var results = indexClient.executeQueries(indexRequest); - - results.sort(comparator); - results.removeIf(this::isBlacklisted); - if (results.size() > totalSize) { - results = results.subList(0, totalSize); - } - return results; + return new DetailedDirectResult(query, Lists.transform(items, QueryProtobufCodec::convertQueryResult)); } } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java index 2af0b586..b8d1f062 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryExpansion.java @@ -5,7 +5,6 @@ import com.google.inject.Inject; import nu.marginalia.functions.searchquery.query_parser.model.QWord; import nu.marginalia.functions.searchquery.query_parser.model.QWordGraph; import nu.marginalia.functions.searchquery.query_parser.model.QWordPathsRenderer; -import nu.marginalia.language.WordPatterns; import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import
org.apache.commons.lang3.StringUtils; @@ -45,11 +44,17 @@ public class QueryExpansion { strategy.expand(graph); } - List<List<String>> coherences = createSegments(graph); + List<List<String>> optionalPhraseConstraints = createSegments(graph); + + // also create a segmentation that is just the entire query + List<String> fullPhraseConstraint = new ArrayList<>(); + for (var qw : graph) { + fullPhraseConstraint.add(qw.word()); + } var compiled = QWordPathsRenderer.render(graph); - return new Expansion(compiled, coherences); + return new Expansion(compiled, optionalPhraseConstraints, fullPhraseConstraint); } private static final Pattern dashPattern = Pattern.compile("-"); @@ -131,6 +136,10 @@ public class QueryExpansion { nodes.add(qw); } + if (nodes.size() <= 1) { + return List.of(); + } + String[] words = nodes.stream().map(QWord::stemmed).toArray(String[]::new); // Grab all segments @@ -141,15 +150,11 @@ public class QueryExpansion { } allSegments.sort(Comparator.comparing(NgramLexicon.SentenceSegment::start)); - if (allSegments.isEmpty()) { - return List.of(); - } + Set<List<String>> constraints = new HashSet<>(); Set<NgramLexicon.SentenceSegment> bestSegmentation = findBestSegmentation(allSegments); - List<List<String>> coherences = new ArrayList<>(); - for (var segment : bestSegmentation) { int start = segment.start(); @@ -159,14 +164,14 @@ public class QueryExpansion { for (int i = start; i < end; i++) { components.add(nodes.get(i).word()); } - coherences.add(components); + constraints.add(components); // Create an n-gram search term for the segment String word = String.join("_", components); graph.addVariantForSpan(nodes.get(start), nodes.get(end - 1), word); } - return coherences; + return new ArrayList<>(constraints); } private Set<NgramLexicon.SentenceSegment> findBestSegmentation(List<NgramLexicon.SentenceSegment> allSegments) { @@ -209,5 +214,5 @@ public class QueryExpansion { void expand(QWordGraph graph); } - public record Expansion(String compiledQuery, List<List<String>> extraCoherences) {} + public record Expansion(String compiledQuery, List<List<String>> optionalPhraseConstraints, List<String> fullPhraseConstraint) {} } diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java index 3f92a594..0cd358c2 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryParser.java @@ -1,22 +1,24 @@ package nu.marginalia.functions.searchquery.query_parser; import nu.marginalia.functions.searchquery.query_parser.token.QueryToken; +import nu.marginalia.index.query.limit.SpecificationLimit; import nu.marginalia.language.WordPatterns; +import nu.marginalia.language.encoding.AsciiFlattener; import nu.marginalia.util.transform_list.TransformList; +import java.util.ArrayList; import java.util.List; +import java.util.regex.Pattern; public class QueryParser { - private final QueryTokenizer tokenizer = new QueryTokenizer(); - public List<QueryToken> parse(String query) { - List<QueryToken> basicTokens = tokenizer.tokenizeQuery(query); + List<QueryToken> basicTokens = tokenizeQuery(query); TransformList<QueryToken> list = new TransformList<>(basicTokens); - list.transformEach(QueryParser::handleQuoteTokens); list.transformEach(QueryParser::trimLiterals); + list.transformEach(QueryParser::handleQuoteTokens); list.transformEachPair(QueryParser::createNegatedTerms); list.transformEachPair(QueryParser::createPriorityTerms); list.transformEach(QueryParser::handleSpecialOperations); @@ -26,6 +28,96 @@ public class QueryParser {
return list.getBackingList(); } + private static final Pattern noisePattern = Pattern.compile("[,\\s]"); + + public List tokenizeQuery(String rawQuery) { + List tokens = new ArrayList<>(); + + String query = AsciiFlattener.flattenUnicode(rawQuery); + query = noisePattern.matcher(query).replaceAll(" "); + + int chr = -1; + int parenDepth = 0; + for (int i = 0; i < query.length(); i++) { + chr = query.charAt(i); + + if ('(' == chr) { + parenDepth++; + tokens.add(new QueryToken.LParen()); + } + else if (')' == chr) { + parenDepth--; + tokens.add(new QueryToken.RParen()); + } + else if ('"' == chr) { + int end = query.indexOf('"', i+1); + + if (end == -1) { + end = query.length(); + } + + tokens.add(new QueryToken.Quot(query.substring(i + 1, end).toLowerCase())); + + i = end; + } + else if ('-' == chr) { + tokens.add(new QueryToken.Minus()); + } + else if ('?' == chr) { + tokens.add(new QueryToken.QMark()); + } + else if (!Character.isSpaceChar(chr)) { + + // search for the end of the term + int end = i+1; + int prevC = -1; + int c = -1; + for (; end < query.length(); end++) { + prevC = c; + c = query.charAt(end); + + if (prevC == '\\') + continue; + if (c == ' ') + break; + + // special case to deal with possible RPAREN token at the end, + // but we don't want to break if it's likely part of the search term + if (c == '(' && prevC != ')' && parenDepth > 0) + break; + } + + String displayStr = query.substring(i, end); + String str = trimEscape(displayStr.toLowerCase()); + + tokens.add(new QueryToken.LiteralTerm(str, displayStr)); + + i = end-1; + } + } + return tokens; + } + + private String trimEscape(String str) { + if (!str.contains("\\")) { + return str; + } + + StringBuilder sb = new StringBuilder(str.length()); + for (int j = 0; j < str.length(); j++) { + char c = str.charAt(j); + if (c == '\\') { + if (j + 1 < str.length()) { + sb.append(str.charAt(j + 1)); + j++; + } + } else { + sb.append(c); + } + } + return sb.toString(); + } + private static void normalizeDomainName(TransformList.Entity entity) { var t = entity.value(); @@ -60,10 +152,22 @@ public class QueryParser { if (str.isBlank()) return; - if (str.endsWith(":") || str.endsWith(".")) { + // Remove trailing punctuation + int lastChar = str.charAt(str.length() - 1); + if (":.,!?$'".indexOf(lastChar) >= 0) entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 1), lt.displayStr())); - } + // Remove term elements that aren't indexed by the search engine + if (str.endsWith("'s")) + entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 2), lt.displayStr())); + if (str.endsWith("()")) + entity.replace(new QueryToken.LiteralTerm(str.substring(0, str.length() - 2), lt.displayStr())); + if (str.startsWith("$")) + entity.replace(new QueryToken.LiteralTerm(str.substring(1), lt.displayStr())); + + if (entity.isBlank()) { + entity.remove(); + } } private static void createNegatedTerms(TransformList.Entity first, TransformList.Entity second) { @@ -104,15 +208,19 @@ public class QueryParser { String str = t.str(); if (str.startsWith("q") && str.matches("q[=><]\\d+")) { - entity.replace(new QueryToken.QualityTerm(str.substring(1))); + var limit = parseSpecificationLimit(str.substring(1)); + entity.replace(new QueryToken.QualityTerm(limit, str)); } else if (str.startsWith("near:")) { entity.replace(new QueryToken.NearTerm(str.substring(5))); } else if (str.startsWith("year") && str.matches("year[=><]\\d{4}")) { - entity.replace(new QueryToken.YearTerm(str.substring(4))); + var limit = 
parseSpecificationLimit(str.substring(4)); + entity.replace(new QueryToken.YearTerm(limit, str)); } else if (str.startsWith("size") && str.matches("size[=><]\\d+")) { - entity.replace(new QueryToken.SizeTerm(str.substring(4))); + var limit = parseSpecificationLimit(str.substring(4)); + entity.replace(new QueryToken.SizeTerm(limit, str)); } else if (str.startsWith("rank") && str.matches("rank[=><]\\d+")) { - entity.replace(new QueryToken.RankTerm(str.substring(4))); + var limit = parseSpecificationLimit(str.substring(4)); + entity.replace(new QueryToken.RankTerm(limit, str)); } else if (str.startsWith("qs=")) { entity.replace(new QueryToken.QsTerm(str.substring(3))); } else if (str.contains(":")) { @@ -120,6 +228,21 @@ public class QueryParser { } } + private static SpecificationLimit parseSpecificationLimit(String str) { + int startChar = str.charAt(0); + + int val = Integer.parseInt(str.substring(1)); + if (startChar == '=') { + return SpecificationLimit.equals(val); + } else if (startChar == '<') { + return SpecificationLimit.lessThan(val); + } else if (startChar == '>') { + return SpecificationLimit.greaterThan(val); + } else { + return SpecificationLimit.none(); + } + } + private static void handleAdvisoryTerms(TransformList.Entity entity) { var t = entity.value(); if (t instanceof QueryToken.LParen) { diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java deleted file mode 100644 index 80f05808..00000000 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/QueryTokenizer.java +++ /dev/null @@ -1,69 +0,0 @@ -package nu.marginalia.functions.searchquery.query_parser; - -import nu.marginalia.functions.searchquery.query_parser.token.QueryToken; -import nu.marginalia.language.encoding.AsciiFlattener; -import nu.marginalia.language.sentence.SentenceExtractorStringUtils; - -import java.util.ArrayList; -import java.util.List; -import java.util.regex.Pattern; - -public class QueryTokenizer { - private static final Pattern noisePattern = Pattern.compile("[,\\s]"); - - public List tokenizeQuery(String rawQuery) { - List tokens = new ArrayList<>(); - - String query = AsciiFlattener.flattenUnicode(rawQuery); - query = noisePattern.matcher(query).replaceAll(" "); - - for (int i = 0; i < query.length(); i++) { - int chr = query.charAt(i); - - if ('(' == chr) { - tokens.add(new QueryToken.LParen()); - } - else if (')' == chr) { - tokens.add(new QueryToken.RParen()); - } - else if ('"' == chr) { - int end = query.indexOf('"', i+1); - - if (end == -1) { - end = query.length(); - } - - tokens.add(new QueryToken.Quot(query.substring(i + 1, end).toLowerCase())); - - i = end; - } - else if ('-' == chr) { - tokens.add(new QueryToken.Minus()); - } - else if ('?' 
== chr) { - tokens.add(new QueryToken.QMark()); - } - else if (Character.isSpaceChar(chr)) { - // - } - else { - - int end = i+1; - for (; end < query.length(); end++) { - if (query.charAt(end) == ' ' || query.charAt(end) == ')') - break; - } - - String displayStr = query.substring(i, end); - String str = SentenceExtractorStringUtils.toLowerCaseStripPossessive(displayStr); - - tokens.add(new QueryToken.LiteralTerm(str, displayStr)); - - i = end-1; - } - } - return tokens; - } - - -} diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java index 724ef6a1..d37c8bbb 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/model/QWordGraph.java @@ -248,16 +248,29 @@ public class QWordGraph implements Iterable { @Override public Iterator iterator() { return new Iterator<>() { + QWord next = null; QWord pos = QWord.beg(); @Override public boolean hasNext() { - return !pos.isEnd(); + if (next == null) { + if (pos.isEnd()) { + return false; + } + next = getNextOriginal(pos).getFirst(); + } + + return !next.isEnd(); } @Override public QWord next() { - pos = getNextOriginal(pos).getFirst(); + if (!hasNext()) { + throw new NoSuchElementException(); + } + + pos = next; + next = null; return pos; } }; diff --git a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/QueryToken.java b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/QueryToken.java index b11fe370..175db074 100644 --- a/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/QueryToken.java +++ b/code/functions/search-query/java/nu/marginalia/functions/searchquery/query_parser/token/QueryToken.java @@ -1,6 +1,8 @@ package nu.marginalia.functions.searchquery.query_parser.token; +import nu.marginalia.index.query.limit.SpecificationLimit; + public sealed interface QueryToken { String str(); String displayStr(); @@ -11,25 +13,18 @@ public sealed interface QueryToken { record AdviceTerm(String str, String displayStr) implements QueryToken {} record PriorityTerm(String str, String displayStr) implements QueryToken {} - record QualityTerm(String str) implements QueryToken { - public String displayStr() { - return "q" + str; - } + record QualityTerm(SpecificationLimit limit, String displayStr) implements QueryToken { + public String str() { return displayStr; } + } - record YearTerm(String str) implements QueryToken { - public String displayStr() { - return "year" + str; - } + record YearTerm(SpecificationLimit limit, String displayStr) implements QueryToken { + public String str() { return displayStr; } } - record SizeTerm(String str) implements QueryToken { - public String displayStr() { - return "size" + str; - } + record SizeTerm(SpecificationLimit limit, String displayStr) implements QueryToken { + public String str() { return displayStr; } } - record RankTerm(String str) implements QueryToken { - public String displayStr() { - return "rank" + str; - } + record RankTerm(SpecificationLimit limit, String displayStr) implements QueryToken { + public String str() { return displayStr; } } record NearTerm(String str) implements QueryToken { public String displayStr() { diff --git 
a/code/functions/search-query/java/nu/marginalia/util/transform_list/TransformList.java b/code/functions/search-query/java/nu/marginalia/util/transform_list/TransformList.java index 62dd2e0a..a0dc6d7f 100644 --- a/code/functions/search-query/java/nu/marginalia/util/transform_list/TransformList.java +++ b/code/functions/search-query/java/nu/marginalia/util/transform_list/TransformList.java @@ -1,5 +1,7 @@ package nu.marginalia.util.transform_list; +import nu.marginalia.functions.searchquery.query_parser.token.QueryToken; + import java.util.List; import java.util.function.BiConsumer; import java.util.function.Consumer; @@ -30,7 +32,7 @@ import java.util.function.Predicate; * * */ -public class TransformList<T> { +public class TransformList<T extends QueryToken> { private final List<T> backingList; public TransformList(List<T> backingList) { @@ -138,6 +140,10 @@ public class TransformList<T> { value = newValue; } + public boolean isBlank() { + return value == null || value.str().isBlank(); + } + public void remove() { action = Action.REMOVE; } diff --git a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java index 4f4fc0b1..e1df546c 100644 --- a/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java +++ b/code/functions/search-query/test/nu/marginalia/functions/searchquery/query_parser/model/QWordGraphTest.java @@ -1,5 +1,6 @@ package nu.marginalia.functions.searchquery.query_parser.model; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import java.util.Comparator; @@ -100,7 +101,8 @@ class QWordGraphTest { assertEquals("q b ( c | d )", graph.compileToQuery()); } - @Test // this test is a bit flaky, the order of the variants is not guaranteed + @Disabled // flaky, the order of the variants is not guaranteed + @Test void testCompile5() { // Construct a graph like diff --git a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java index 1131db90..74345adc 100644 --- a/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java +++ b/code/functions/search-query/test/nu/marginalia/query/svc/QueryFactoryTest.java @@ -1,16 +1,16 @@ package nu.marginalia.query.svc; import nu.marginalia.WmsaHome; +import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.query.SearchSpecification; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; +import nu.marginalia.functions.searchquery.QueryFactory; import nu.marginalia.functions.searchquery.query_parser.QueryExpansion; -import nu.marginalia.functions.searchquery.svc.QueryFactory; import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.SpecificationLimit; import nu.marginalia.index.query.limit.SpecificationLimitType; import nu.marginalia.segmentation.NgramLexicon; -import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; @@ -57,7 +57,12 @@ public class QueryFactoryTest { @Test void qsec10() { - try (var lines = Files.lines(Path.of("/home/vlofgren/Exports/qsec10/webis-qsec-10-training-set/webis-qsec-10-training-set-queries.txt"))) { + Path
webis = Path.of("/home/vlofgren/Exports/qsec10/webis-qsec-10-training-set/webis-qsec-10-training-set-queries.txt"); + + if (!Files.exists(webis)) + return; + + try (var lines = Files.lines(webis)) { lines.limit(1000).forEach(line -> { String[] parts = line.split("\t"); if (parts.length == 2) { @@ -124,24 +129,6 @@ public class QueryFactoryTest { assertEquals(2000, size.value()); } - @Test - public void testQuotedStopwords() { - { - // the is a stopword, so it should generate an ngram search term - var specs = parseAndGetSpecs("\"the shining\""); - assertEquals("the_shining", specs.query.compiledQuery); - } - - { - // tde isn't a stopword, so we should get the normal behavior - var specs = parseAndGetSpecs("\"tde shining\""); - assertEquals("( shining tde | tde_shining )", specs.query.compiledQuery); - assertEquals(List.of("tde_shining"), specs.query.searchTermsAdvice); - assertEquals(List.of(List.of("tde", "shining")), specs.query.searchTermCoherences); - } - } - - @Test public void testParseQualityEq() { var quality = parseAndGetSpecs("q=2000").quality; @@ -212,12 +199,38 @@ public class QueryFactoryTest { var subquery = parseAndGetSpecs("The"); System.out.println("Time: " + (System.currentTimeMillis() - start)); System.out.println(subquery); - } @Test + } + @Test public void testExpansion6() { long start = System.currentTimeMillis(); var subquery = parseAndGetSpecs("burning the nerves in the neck"); System.out.println("Time: " + (System.currentTimeMillis() - start)); System.out.println(subquery); } + + @Test + public void testExpansion7() { + long start = System.currentTimeMillis(); + var subquery = parseAndGetSpecs("amazing work being done"); + System.out.println("Time: " + (System.currentTimeMillis() - start)); + System.out.println(subquery); + } + + @Test + public void testExpansion8() { + long start = System.currentTimeMillis(); + var subquery = parseAndGetSpecs("success often consists of"); + System.out.println("Time: " + (System.currentTimeMillis() - start)); + System.out.println(subquery); + } + + @Test + public void testParsing() { + long start = System.currentTimeMillis(); + var subquery = parseAndGetSpecs("strlen()"); + assertEquals("strlen", subquery.query.compiledQuery); + System.out.println("Time: " + (System.currentTimeMillis() - start)); + System.out.println(subquery); + } } \ No newline at end of file diff --git a/code/index/api/build.gradle b/code/index/api/build.gradle index 1c0873a8..7f958c0e 100644 --- a/code/index/api/build.gradle +++ b/code/index/api/build.gradle @@ -15,6 +15,7 @@ dependencies { implementation project(':code:common:model') implementation project(':code:common:config') implementation project(':code:common:service') + implementation project(':code:common:db') implementation project(':code:libraries:message-queue') implementation project(':code:functions:search-query:api') diff --git a/code/index/api/java/nu/marginalia/index/api/IndexClient.java b/code/index/api/java/nu/marginalia/index/api/IndexClient.java index 3a83b5de..e0383a27 100644 --- a/code/index/api/java/nu/marginalia/index/api/IndexClient.java +++ b/code/index/api/java/nu/marginalia/index/api/IndexClient.java @@ -6,6 +6,8 @@ import lombok.SneakyThrows; import nu.marginalia.api.searchquery.IndexApiGrpc; import nu.marginalia.api.searchquery.RpcDecoratedResultItem; import nu.marginalia.api.searchquery.RpcIndexQuery; +import nu.marginalia.db.DomainBlacklistImpl; +import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.service.client.GrpcChannelPoolFactory; import 
nu.marginalia.service.client.GrpcMultiNodeChannelPool; import nu.marginalia.service.discovery.property.ServiceKey; @@ -14,6 +16,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.ArrayList; +import java.util.Comparator; import java.util.List; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; @@ -22,21 +25,34 @@ public class IndexClient { private static final Logger logger = LoggerFactory.getLogger(IndexClient.class); private final GrpcMultiNodeChannelPool<IndexApiGrpc.IndexApiBlockingStub> channelPool; + private final DomainBlacklistImpl blacklist; private static final ExecutorService executor = Executors.newFixedThreadPool(32); + @Inject - public IndexClient(GrpcChannelPoolFactory channelPoolFactory) { + public IndexClient(GrpcChannelPoolFactory channelPoolFactory, DomainBlacklistImpl blacklist) { this.channelPool = channelPoolFactory.createMulti( ServiceKey.forGrpcApi(IndexApiGrpc.class, ServicePartition.multi()), IndexApiGrpc::newBlockingStub); + this.blacklist = blacklist; } + private static final Comparator<RpcDecoratedResultItem> comparator = + Comparator.comparing(RpcDecoratedResultItem::getRankingScore); + + + /** Execute a query on the index partitions and return the combined results. */ @SneakyThrows public List<RpcDecoratedResultItem> executeQueries(RpcIndexQuery indexRequest) { var futures = channelPool.call(IndexApiGrpc.IndexApiBlockingStub::query) .async(executor) .runEach(indexRequest); - List<RpcDecoratedResultItem> results = new ArrayList<>(); + + final int resultsTotal = indexRequest.getQueryLimits().getResultsTotal(); + final int resultsUpperBound = resultsTotal * channelPool.getNumNodes(); + + List<RpcDecoratedResultItem> results = new ArrayList<>(resultsUpperBound); + for (var future : futures) { try { future.get().forEachRemaining(results::add); @@ -46,7 +62,20 @@ public class IndexClient { } } + // Sort the results by ranking score and remove blacklisted domains + results.sort(comparator); + results.removeIf(this::isBlacklisted); + + // Keep only as many results as were requested + if (results.size() > resultsTotal) { + results = results.subList(0, resultsTotal); + } + return results; } + private boolean isBlacklisted(RpcDecoratedResultItem item) { + return blacklist.isBlacklisted(UrlIdCodec.getDomainId(item.getRawItem().getCombinedId())); + } + } diff --git a/code/index/api/java/nu/marginalia/index/api/IndexMqEndpoints.java b/code/index/api/java/nu/marginalia/index/api/IndexMqEndpoints.java index ec618912..343154b2 100644 --- a/code/index/api/java/nu/marginalia/index/api/IndexMqEndpoints.java +++ b/code/index/api/java/nu/marginalia/index/api/IndexMqEndpoints.java @@ -5,7 +5,5 @@ public class IndexMqEndpoints { public static final String INDEX_RERANK = "INDEX-RERANK"; public static final String INDEX_REPARTITION = "INDEX-REPARTITION"; public static final String SWITCH_INDEX = "SWITCH-INDEX"; - public static final String SWITCH_LINKDB = "SWITCH_LINKDB"; - } diff --git a/code/index/build.gradle b/code/index/build.gradle index 574c27d8..ad1d1000 100644 --- a/code/index/build.gradle +++ b/code/index/build.gradle @@ -15,12 +15,15 @@ dependencies { implementation 'org.jgrapht:jgrapht-core:1.5.2' implementation project(':third-party:commons-codec') + implementation project(':third-party:parquet-floor') implementation project(':code:index:api') implementation project(':code:functions:link-graph:api') implementation project(':code:libraries:array') implementation project(':code:libraries:btree') + implementation project(':code:libraries:coded-sequence') + implementation
project(':code:libraries:language-processing') implementation project(':code:common:db') implementation project(':code:common:config') @@ -28,14 +31,16 @@ dependencies { implementation project(':code:common:linkdb') implementation project(':code:common:service') - implementation project(':code:functions:search-query:api') + implementation project(':code:processes:converting-process:model') + implementation project(':code:functions:search-query:api') implementation project(':code:index:index-forward') implementation project(':code:index:index-reverse') implementation project(':code:index:query') implementation project(':code:index:index-journal') + implementation libs.slop implementation libs.bundles.slf4j implementation libs.prometheus @@ -66,9 +71,11 @@ dependencies { testImplementation project(':code:libraries:array') testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4') + testImplementation libs.commons.codec testImplementation 'org.testcontainers:mariadb:1.17.4' testImplementation 'org.testcontainers:junit-jupiter:1.17.4' testImplementation project(':code:libraries:test-helpers') testImplementation project(':code:libraries:term-frequency-dict') testImplementation project(':code:libraries:braille-block-punch-cards') + testImplementation project(':code:libraries:test-helpers') } diff --git a/code/index/index-forward/build.gradle b/code/index/index-forward/build.gradle index cf453e73..946ef74b 100644 --- a/code/index/index-forward/build.gradle +++ b/code/index/index-forward/build.gradle @@ -15,10 +15,13 @@ apply from: "$rootProject.projectDir/srcsets.gradle" dependencies { implementation project(':code:libraries:array') implementation project(':code:libraries:btree') + implementation project(':code:libraries:coded-sequence') + implementation project(':code:libraries:language-processing') implementation project(':code:index:query') implementation project(':code:index:index-journal') implementation project(':code:common:model') implementation project(':code:common:process') + implementation project(':code:processes:converting-process:model') implementation libs.bundles.slf4j @@ -26,7 +29,9 @@ dependencies { implementation libs.roaringbitmap implementation libs.fastutil implementation libs.trove + implementation libs.slop + testImplementation project(':code:libraries:test-helpers') testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit testImplementation libs.mockito diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexConverter.java b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexConverter.java deleted file mode 100644 index 80cf502b..00000000 --- a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexConverter.java +++ /dev/null @@ -1,127 +0,0 @@ -package nu.marginalia.index.forward; - -import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap; -import nu.marginalia.array.LongArrayFactory; -import nu.marginalia.index.domainrankings.DomainRankings; -import nu.marginalia.index.journal.reader.IndexJournalReader; -import nu.marginalia.array.LongArray; -import nu.marginalia.model.id.UrlIdCodec; -import nu.marginalia.model.idx.DocumentMetadata; -import nu.marginalia.process.control.ProcessHeartbeat; -import org.roaringbitmap.longlong.LongConsumer; -import org.roaringbitmap.longlong.Roaring64Bitmap; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; - -public class ForwardIndexConverter { 
- - private final ProcessHeartbeat heartbeat; - - private final Logger logger = LoggerFactory.getLogger(getClass()); - - private final IndexJournalReader journalReader; - private final Path outputFileDocsId; - private final Path outputFileDocsData; - private final DomainRankings domainRankings; - - - public ForwardIndexConverter(ProcessHeartbeat heartbeat, - IndexJournalReader journalReader, - Path outputFileDocsId, - Path outputFileDocsData, - DomainRankings domainRankings - ) { - this.heartbeat = heartbeat; - this.journalReader = journalReader; - this.outputFileDocsId = outputFileDocsId; - this.outputFileDocsData = outputFileDocsData; - this.domainRankings = domainRankings; - } - - public enum TaskSteps { - GET_DOC_IDS, - GATHER_OFFSETS, - SUPPLEMENTAL_INDEXES, - FORCE, - FINISHED - } - - public void convert() throws IOException { - deleteOldFiles(); - - logger.info("Domain Rankings size = {}", domainRankings.size()); - - try (var progress = heartbeat.createProcessTaskHeartbeat(TaskSteps.class, "forwardIndexConverter")) { - progress.progress(TaskSteps.GET_DOC_IDS); - - LongArray docsFileId = getDocIds(outputFileDocsId, journalReader); - - progress.progress(TaskSteps.GATHER_OFFSETS); - - // doc ids -> sorted list of ids - - Long2IntOpenHashMap docIdToIdx = new Long2IntOpenHashMap((int) docsFileId.size()); - docsFileId.forEach(0, docsFileId.size(), (pos, val) -> docIdToIdx.put(val, (int) pos)); - - progress.progress(TaskSteps.SUPPLEMENTAL_INDEXES); - - // docIdToIdx -> file offset for id - - LongArray docFileData = LongArrayFactory.mmapForWritingConfined(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size()); - - var pointer = journalReader.newPointer(); - while (pointer.nextDocument()) { - long docId = pointer.documentId(); - int domainId = UrlIdCodec.getDomainId(docId); - - long entryOffset = (long) ForwardIndexParameters.ENTRY_SIZE * docIdToIdx.get(docId); - - int ranking = domainRankings.getRanking(domainId); - long meta = DocumentMetadata.encodeRank(pointer.documentMeta(), ranking); - - docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta); - docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, pointer.documentFeatures()); - } - - progress.progress(TaskSteps.FORCE); - - docFileData.force(); - docsFileId.force(); - - docFileData.close(); - docsFileId.close(); - - progress.progress(TaskSteps.FINISHED); - } catch (IOException ex) { - logger.error("Failed to convert", ex); - throw ex; - } - } - - private LongArray getDocIds(Path outputFileDocs, IndexJournalReader journalReader) throws IOException { - Roaring64Bitmap rbm = new Roaring64Bitmap(); - journalReader.forEachDocId(rbm::add); - - LongArray ret = LongArrayFactory.mmapForWritingConfined(outputFileDocs, rbm.getIntCardinality()); - rbm.forEach(new LongConsumer() { - int offset; - @Override - public void accept(long value) { - ret.set(offset++, value); - } - }); - - return ret; - } - - private void deleteOldFiles() throws IOException { - Files.deleteIfExists(outputFileDocsId); - Files.deleteIfExists(outputFileDocsData); - } - -} - diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexFileNames.java b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexFileNames.java index e16e8618..6231256e 100644 --- a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexFileNames.java +++ b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexFileNames.java @@ -13,6 +13,10 @@ public class ForwardIndexFileNames { 
case NEXT -> basePath.resolve("fwd-doc-data.dat.next"); case CURRENT -> basePath.resolve("fwd-doc-data.dat"); }; + case SPANS_DATA -> switch (version) { + case NEXT -> basePath.resolve("fwd-spans.dat.next"); + case CURRENT -> basePath.resolve("fwd-spans.dat"); + }; }; } @@ -23,6 +27,7 @@ public class ForwardIndexFileNames { public enum FileIdentifier { DOC_DATA, + SPANS_DATA, DOC_ID } } diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexParameters.java b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexParameters.java index 0b306050..0d9eea61 100644 --- a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexParameters.java +++ b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexParameters.java @@ -1,8 +1,8 @@ package nu.marginalia.index.forward; -class ForwardIndexParameters { - public static final int ENTRY_SIZE = 2; +public class ForwardIndexParameters { + public static final int ENTRY_SIZE = 3; public static final int METADATA_OFFSET = 0; public static final int FEATURES_OFFSET = 1; - + public static final int SPANS_OFFSET = 2; } diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexReader.java b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexReader.java index 5d26de82..216ed78d 100644 --- a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexReader.java +++ b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexReader.java @@ -1,13 +1,15 @@ package nu.marginalia.index.forward; -import gnu.trove.map.hash.TLongIntHashMap; import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArrayFactory; +import nu.marginalia.index.forward.spans.DocumentSpans; +import nu.marginalia.index.forward.spans.ForwardIndexSpansReader; import nu.marginalia.model.id.UrlIdCodec; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; +import java.lang.foreign.Arena; import java.nio.file.Files; import java.nio.file.Path; @@ -26,41 +28,47 @@ import static nu.marginalia.index.forward.ForwardIndexParameters.*; * The metadata is a binary encoding of {@see nu.marginalia.idx.DocumentMetadata} */ public class ForwardIndexReader { - private final TLongIntHashMap idToOffset; + private final LongArray ids; private final LongArray data; + private final ForwardIndexSpansReader spansReader; + private final Logger logger = LoggerFactory.getLogger(getClass()); - public ForwardIndexReader(Path idsFile, Path dataFile) throws IOException { + public ForwardIndexReader(Path idsFile, + Path dataFile, + Path spansFile) throws IOException { if (!Files.exists(dataFile)) { logger.warn("Failed to create ForwardIndexReader, {} is absent", dataFile); - idToOffset = null; + ids = null; data = null; + spansReader = null; return; } else if (!Files.exists(idsFile)) { logger.warn("Failed to create ForwardIndexReader, {} is absent", idsFile); - idToOffset = null; + ids = null; data = null; + spansReader = null; + return; + } + else if (!Files.exists(spansFile)) { + logger.warn("Failed to create ForwardIndexReader, {} is absent", spansFile); + ids = null; + data = null; + spansReader = null; return; } logger.info("Switching forward index"); - idToOffset = loadIds(idsFile); + ids = loadIds(idsFile); data = loadData(dataFile); + spansReader = new ForwardIndexSpansReader(spansFile); } - private static TLongIntHashMap loadIds(Path idsFile) throws IOException { - try (var idsArray = LongArrayFactory.mmapForReadingShared(idsFile)) { - 
assert idsArray.size() < Integer.MAX_VALUE; - - var ids = new TLongIntHashMap((int) idsArray.size(), 0.5f, -1, -1); - // This hash table should be of the same size as the number of documents, so typically less than 1 Gb - idsArray.forEach(0, idsArray.size(), (pos, val) -> ids.put(val, (int) pos)); - - return ids; - } + private static LongArray loadIds(Path idsFile) throws IOException { + return LongArrayFactory.mmapForReadingShared(idsFile); } private static LongArray loadData(Path dataFile) throws IOException { @@ -82,25 +90,52 @@ public class ForwardIndexReader { long offset = idxForDoc(docId); if (offset < 0) return 0; - return (int) data.get(ENTRY_SIZE * offset + FEATURES_OFFSET); + return (int) (data.get(ENTRY_SIZE * offset + FEATURES_OFFSET) & 0xFFFF_FFFFL); } + public int getDocumentSize(long docId) { + assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id"; + + long offset = idxForDoc(docId); + if (offset < 0) return 0; + + return (int) (data.get(ENTRY_SIZE * offset + FEATURES_OFFSET) >>> 32L); + } + + private int idxForDoc(long docId) { assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id"; - if (getClass().desiredAssertionStatus()) { - long offset = idToOffset.get(docId); - if (offset < 0) { // Ideally we'd always check this, but this is a very hot method + long offset = ids.binarySearch(docId, 0, ids.size()); + + if (offset >= ids.size() || offset < 0 || ids.get(offset) != docId) { + if (getClass().desiredAssertionStatus()) { logger.warn("Could not find offset for doc {}", docId); } + return -1; } - return idToOffset.get(docId); + return (int) offset; + } + + public DocumentSpans getDocumentSpans(Arena arena, long docId) { + long offset = idxForDoc(docId); + if (offset < 0) return new DocumentSpans(); + + long encodedOffset = data.get(ENTRY_SIZE * offset + SPANS_OFFSET); + + try { + return spansReader.readSpans(arena, encodedOffset); + } + catch (IOException ex) { + logger.error("Failed to read spans for doc " + docId, ex); + return new DocumentSpans(); + } } public int totalDocCount() { - return idToOffset.size(); + return (int) ids.size(); } public void close() { diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/construction/ForwardIndexConverter.java b/code/index/index-forward/java/nu/marginalia/index/forward/construction/ForwardIndexConverter.java new file mode 100644 index 00000000..43f7371c --- /dev/null +++ b/code/index/index-forward/java/nu/marginalia/index/forward/construction/ForwardIndexConverter.java @@ -0,0 +1,181 @@ +package nu.marginalia.index.forward.construction; + +import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap; +import nu.marginalia.array.LongArray; +import nu.marginalia.array.LongArrayFactory; +import nu.marginalia.index.domainrankings.DomainRankings; +import nu.marginalia.index.forward.ForwardIndexParameters; +import nu.marginalia.index.forward.spans.ForwardIndexSpansWriter; +import nu.marginalia.index.journal.IndexJournal; +import nu.marginalia.model.id.UrlIdCodec; +import nu.marginalia.model.idx.DocumentMetadata; +import nu.marginalia.process.control.ProcessHeartbeat; +import nu.marginalia.slop.SlopTable; +import nu.marginalia.slop.column.primitive.LongColumn; +import org.roaringbitmap.longlong.LongConsumer; +import org.roaringbitmap.longlong.Roaring64Bitmap; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; + 
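
(A note on the reader change above: the id-to-offset hash table is replaced by a binary search over the sorted, memory-mapped ids file, and the document size now rides in the upper 32 bits of the slot that previously held only the features. The sketch below restates that scheme with plain arrays and hypothetical names; it is an illustration of the idea, not code from this changeset.)

    import java.util.Arrays;

    class ForwardLookupSketch {
        private final long[] sortedDocIds; // stands in for the mmapped ids LongArray

        ForwardLookupSketch(long[] sortedDocIds) { this.sortedDocIds = sortedDocIds; }

        // Binary search replaces the old TLongIntHashMap: O(log n) per lookup,
        // but no heap-resident table; the array index doubles as the entry offset.
        int idxForDoc(long docId) {
            int idx = Arrays.binarySearch(sortedDocIds, docId);
            return idx >= 0 ? idx : -1; // negative means "not in the index"
        }

        // Features and document size share one 64-bit slot:
        // low 32 bits = features, high 32 bits = size.
        static long packFeaturesAndSize(int features, int size) {
            return (features & 0xFFFF_FFFFL) | ((long) size << 32);
        }

        static int unpackFeatures(long slot) { return (int) (slot & 0xFFFF_FFFFL); }
        static int unpackSize(long slot)     { return (int) (slot >>> 32); }
    }
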
+public class ForwardIndexConverter { + + private final ProcessHeartbeat heartbeat; + + private final Logger logger = LoggerFactory.getLogger(getClass()); + + private final Path outputFileDocsId; + private final Path outputFileDocsData; + private final DomainRankings domainRankings; + + private final Path outputFileSpansData; + private final IndexJournal journal; + + public ForwardIndexConverter(ProcessHeartbeat heartbeat, + Path outputFileDocsId, + Path outputFileDocsData, + Path outputFileSpansData, + IndexJournal journal, + DomainRankings domainRankings + ) { + this.heartbeat = heartbeat; + this.outputFileDocsId = outputFileDocsId; + this.outputFileDocsData = outputFileDocsData; + this.outputFileSpansData = outputFileSpansData; + this.journal = journal; + this.domainRankings = domainRankings; + } + + public enum TaskSteps { + GET_DOC_IDS, + GATHER_OFFSETS, + SUPPLEMENTAL_INDEXES, + FORCE, + FINISHED + } + + public void convert() throws IOException { + deleteOldFiles(); + + logger.info("Domain Rankings size = {}", domainRankings.size()); + + try (var progress = heartbeat.createProcessTaskHeartbeat(TaskSteps.class, "forwardIndexConverter"); + var spansWriter = new ForwardIndexSpansWriter(outputFileSpansData) + ) { + progress.progress(TaskSteps.GET_DOC_IDS); + + LongArray docsFileId = getDocIds(outputFileDocsId, journal); + + progress.progress(TaskSteps.GATHER_OFFSETS); + + // doc ids -> sorted list of ids + + Long2IntOpenHashMap docIdToIdx = new Long2IntOpenHashMap((int) docsFileId.size()); + docsFileId.forEach(0, docsFileId.size(), (pos, val) -> docIdToIdx.put(val, (int) pos)); + + progress.progress(TaskSteps.SUPPLEMENTAL_INDEXES); + + // docIdToIdx -> file offset for id + + LongArray docFileData = LongArrayFactory.mmapForWritingConfined(outputFileDocsData, ForwardIndexParameters.ENTRY_SIZE * docsFileId.size()); + + ByteBuffer workArea = ByteBuffer.allocate(65536); + for (var instance : journal.pages()) { + try (var slopTable = new SlopTable(instance.baseDir(), instance.page())) + { + var docIdReader = instance.openCombinedId(slopTable); + var metaReader = instance.openDocumentMeta(slopTable); + var featuresReader = instance.openFeatures(slopTable); + var sizeReader = instance.openSize(slopTable); + + var spansCodesReader = instance.openSpanCodes(slopTable); + var spansSeqReader = instance.openSpans(slopTable); + + while (docIdReader.hasRemaining()) { + long docId = docIdReader.get(); + int domainId = UrlIdCodec.getDomainId(docId); + + long entryOffset = (long) ForwardIndexParameters.ENTRY_SIZE * docIdToIdx.get(docId); + + int ranking = domainRankings.getRanking(domainId); + long meta = DocumentMetadata.encodeRank(metaReader.get(), ranking); + + final int docFeatures = featuresReader.get(); + final int docSize = sizeReader.get(); + + long features = docFeatures | ((long) docSize << 32L); + + // Write spans data + byte[] spansCodes = spansCodesReader.get(); + + spansWriter.beginRecord(spansCodes.length); + workArea.clear(); + List spans = spansSeqReader.getData(workArea); + + for (int i = 0; i < spansCodes.length; i++) { + spansWriter.writeSpan(spansCodes[i], spans.get(i)); + } + long encodedSpansOffset = spansWriter.endRecord(); + + + // Write the principal forward documents file + docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta); + docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, features); + docFileData.set(entryOffset + ForwardIndexParameters.SPANS_OFFSET, encodedSpansOffset); + + } + } + } + + progress.progress(TaskSteps.FORCE); + + 
docFileData.force(); + docsFileId.force(); + + docFileData.close(); + docsFileId.close(); + + progress.progress(TaskSteps.FINISHED); + } catch (IOException ex) { + logger.error("Failed to convert", ex); + throw ex; + } + } + + private LongArray getDocIds(Path outputFileDocs, IndexJournal journalReader) throws IOException { + Roaring64Bitmap rbm = new Roaring64Bitmap(); + + for (var instance : journalReader.pages()) { + try (var slopTable = new SlopTable(instance.baseDir(), instance.page())) { + LongColumn.Reader idReader = instance.openCombinedId(slopTable); + + while (idReader.hasRemaining()) { + rbm.add(idReader.get()); + } + } + } + + LongArray ret = LongArrayFactory.mmapForWritingConfined(outputFileDocs, rbm.getIntCardinality()); + rbm.forEach(new LongConsumer() { + int offset; + @Override + public void accept(long value) { + ret.set(offset++, value); + } + }); + + return ret; + } + + private void deleteOldFiles() throws IOException { + Files.deleteIfExists(outputFileDocsId); + Files.deleteIfExists(outputFileDocsData); + } + +} + diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java new file mode 100644 index 00000000..f1b32135 --- /dev/null +++ b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java @@ -0,0 +1,215 @@ +package nu.marginalia.index.forward.spans; + +import it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.ints.IntIterator; +import it.unimi.dsi.fastutil.ints.IntList; +import nu.marginalia.sequence.CodedSequence; + +import java.util.Arrays; + +/** A list of the interlaced start and end positions of each span in the document of this type */ +public class DocumentSpan { + + /** A list of the interlaced start and end positions of each span in the document of this type */ + private final IntList startsEnds; + + public DocumentSpan(CodedSequence startsEnds) { + this.startsEnds = startsEnds.values(); + } + + public DocumentSpan() { + this.startsEnds = null; + } + + /** Counts the number of intersections between the spans in the document of this type and the given list of positions */ + public int countIntersections(int[] positions) { + if (null == startsEnds || startsEnds.isEmpty() || positions.length == 0) { + return 0; + } + + int cnt = 0; + + if (positions.length < 8) { // for small arrays we can do a linear search + int seis = 0; + + for (int pi = 0; pi < positions.length; pi++) { + int position = positions[pi]; + + // search through the spans until we find an item that is greater than the given position + for (int sei = seis; sei < startsEnds.size(); sei ++) { + if (startsEnds.getInt(sei) > position) { + cnt += sei % 2; // if sei is odd, we are between a start and end position in the spans list + seis = Math.max(seis, sei - 1); + break; + } + } + } + } + else { // for large arrays we use a binary search + int searchStart = 0; + + for (int sei = 0; sei < startsEnds.size() && searchStart < positions.length; ) { + int start = startsEnds.getInt(sei++); + int end = startsEnds.getInt(sei++); + + // find the first position that is greater or equal to the start position + int i = Arrays.binarySearch(positions, searchStart, positions.length, start); + if (i < 0) i = -i - 1; // if the position is not found, we get the insertion point + + // ... 
from that point, count the number of positions that are smaller than the end position + while (i < positions.length && positions[i] < end) { + cnt++; + i++; + } + searchStart = i; + } + } + + return cnt; + } + + public boolean containsPosition(int position) { + if (startsEnds == null) { + return false; + } + + var iter = startsEnds.iterator(); + while (iter.hasNext()) { + int start = iter.nextInt(); + if (start > position) { + return false; + } + int end = iter.nextInt(); + if (end > position) { + return true; + } + } + + return false; + } + + /** Returns true if for any position in the list, there exists a range + * (position[i], position[i]+len] that is overlapped by a span */ + public boolean containsRange(IntList positions, int len) { + if (null == startsEnds || startsEnds.size() < 2 || positions.isEmpty()) { + return false; + } + + int sei = 0; + + + int start = startsEnds.getInt(sei++); + int end = startsEnds.getInt(sei++); + + for (int pi = 0; pi < positions.size(); pi++) { + int position = positions.getInt(pi); + if (position < start) { + continue; + } + + if (position + len < end) { + return true; + } else if (sei + 2 < startsEnds.size()) { + start = startsEnds.getInt(sei++); + end = startsEnds.getInt(sei++); + } + else { + return false; + } + } + + return false; + } + + /** Returns an iterator over each position between the start and end positions of each span in the document of this type */ + public IntIterator iterator() { + if (null == startsEnds) { + return IntList.of().iterator(); + } + + return new DocumentSpanPositionsIterator(); + } + + /** Returns a list with all values between the start and end positions of each span in the document of this type + * This is an expensive operation and should not be used in the main execution path, but only for debugging + * and testing + * */ + public IntList positionValues() { + if (null == startsEnds) + return IntList.of(); + + IntList ret = new IntArrayList(); + var iter = startsEnds.iterator(); + while (iter.hasNext()) { + ret.add(iter.nextInt()); + } + return ret; + } + + /** Iterator over the values between the start and end positions of each span in the document of this type + * */ + class DocumentSpanPositionsIterator implements IntIterator { + private final IntIterator startStopIterator; + + private int value = -1; + private int current = -1; + private int end = -1; + + public DocumentSpanPositionsIterator() { + this.startStopIterator = startsEnds.iterator(); + } + + @Override + public int nextInt() { + if (hasNext()) { + int ret = value; + value = -1; + return ret; + } + throw new IllegalStateException(); + } + + @Override + public boolean hasNext() { + if (value >= 0) { + return true; + } + else if (current >= 0 && current < end) { + value = ++current; + return true; + } + else if (startStopIterator.hasNext()) { + current = startStopIterator.nextInt(); + end = startStopIterator.nextInt(); + value = current; + return true; + } + + return false; + } + } + + public int length() { + if (null == startsEnds) { + return 0; + } + + int len = 0; + var iter = startsEnds.iterator(); + + while (iter.hasNext()) { + len -= iter.nextInt(); + len += iter.nextInt(); + } + + return len; + } + + public int size() { + if (null == startsEnds) { + return 0; + } + + return startsEnds.size() / 2; + } +}
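
(The intersection counting above is easier to see with concrete numbers. Below is a simplified restatement with plain arrays: positions are assumed sorted, spans are interlaced half-open [start, end) pairs. It mirrors the contract of countIntersections, not its two search strategies.)

    public class SpanIntersectionDemo {
        public static void main(String[] args) {
            int[] startsEnds = {5, 10, 20, 30}; // spans [5,10) and [20,30)
            int[] positions  = {3, 7, 25, 40};  // sorted query positions

            int cnt = 0;
            for (int p : positions) {
                for (int i = 0; i + 1 < startsEnds.length; i += 2) {
                    if (p >= startsEnds[i] && p < startsEnds[i + 1]) {
                        cnt++;
                        break;
                    }
                }
            }

            System.out.println(cnt); // 2: position 7 is in [5,10), 25 in [20,30)
        }
    }
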
diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpans.java b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpans.java new file mode 100644 index 00000000..2db9dfeb --- /dev/null +++ b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpans.java @@ -0,0 +1,59 @@ +package nu.marginalia.index.forward.spans; + +import nu.marginalia.language.sentence.tag.HtmlTag; +import nu.marginalia.sequence.CodedSequence; + +/** All spans associated with a document + * <p>
+ * A span is a list of document positions that are associated with a particular tag in the document. + * */ +public class DocumentSpans { + private static final DocumentSpan EMPTY_SPAN = new DocumentSpan(); + + public DocumentSpan title = EMPTY_SPAN; + public DocumentSpan heading = EMPTY_SPAN; + public DocumentSpan body = EMPTY_SPAN; + + public DocumentSpan nav = EMPTY_SPAN; + public DocumentSpan code = EMPTY_SPAN; + public DocumentSpan anchor = EMPTY_SPAN; + + public DocumentSpan externalLinkText = EMPTY_SPAN; + + public DocumentSpan getSpan(HtmlTag tag) { + if (tag == HtmlTag.HEADING) + return heading; + else if (tag == HtmlTag.TITLE) + return title; + else if (tag == HtmlTag.NAV) + return nav; + else if (tag == HtmlTag.CODE) + return code; + else if (tag == HtmlTag.ANCHOR) + return anchor; + else if (tag == HtmlTag.EXTERNAL_LINKTEXT) + return externalLinkText; + else if (tag == HtmlTag.BODY) + return body; + + return EMPTY_SPAN; + } + + void accept(byte code, CodedSequence positions) { + if (code == HtmlTag.HEADING.code) + this.heading = new DocumentSpan(positions); + else if (code == HtmlTag.TITLE.code) + this.title = new DocumentSpan(positions); + else if (code == HtmlTag.NAV.code) + this.nav = new DocumentSpan(positions); + else if (code == HtmlTag.CODE.code) + this.code = new DocumentSpan(positions); + else if (code == HtmlTag.ANCHOR.code) + this.anchor = new DocumentSpan(positions); + else if (code == HtmlTag.EXTERNAL_LINKTEXT.code) + this.externalLinkText = new DocumentSpan(positions); + else if (code == HtmlTag.BODY.code) + this.body = new DocumentSpan(positions); + } + +} diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/spans/ForwardIndexSpansReader.java b/code/index/index-forward/java/nu/marginalia/index/forward/spans/ForwardIndexSpansReader.java new file mode 100644 index 00000000..b99742c5 --- /dev/null +++ b/code/index/index-forward/java/nu/marginalia/index/forward/spans/ForwardIndexSpansReader.java @@ -0,0 +1,59 @@ +package nu.marginalia.index.forward.spans; + +import nu.marginalia.sequence.VarintCodedSequence; + +import java.io.IOException; +import java.lang.foreign.Arena; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; + +@SuppressWarnings("preview") +public class ForwardIndexSpansReader implements AutoCloseable { + private final FileChannel spansFileChannel; + + public ForwardIndexSpansReader(Path spansFile) throws IOException { + this.spansFileChannel = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ); + } + + public DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException { + // Decode the size and offset from the encoded offset + long size = SpansCodec.decodeSize(encodedOffset); + long offset = SpansCodec.decodeStartOffset(encodedOffset); + + // Allocate a buffer from the arena + var buffer = arena.allocate(size).asByteBuffer(); + buffer.clear(); + while (buffer.hasRemaining()) { + spansFileChannel.read(buffer, offset + buffer.position()); + } + buffer.flip(); + + // Read the number of spans in the document + int count = buffer.get(); + + DocumentSpans ret = new DocumentSpans(); + + // Decode each span + while (count-- > 0) { + byte code = buffer.get(); + short len = buffer.getShort(); + + ByteBuffer data = buffer.slice(buffer.position(), len); + ret.accept(code, new VarintCodedSequence(data)); + + // Reset the buffer position to the end of the span + buffer.position(buffer.position() + 
len); + } + + return ret; + } + + @Override + public void close() throws IOException { + spansFileChannel.close(); + } + +} diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/spans/ForwardIndexSpansWriter.java b/code/index/index-forward/java/nu/marginalia/index/forward/spans/ForwardIndexSpansWriter.java new file mode 100644 index 00000000..4bdebd59 --- /dev/null +++ b/code/index/index-forward/java/nu/marginalia/index/forward/spans/ForwardIndexSpansWriter.java @@ -0,0 +1,52 @@ +package nu.marginalia.index.forward.spans; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; + +public class ForwardIndexSpansWriter implements AutoCloseable { + private final FileChannel outputChannel; + private final ByteBuffer work = ByteBuffer.allocate(32); + + private long stateStartOffset = -1; + private int stateLength = -1; + + public ForwardIndexSpansWriter(Path outputFileSpansData) throws IOException { + this.outputChannel = (FileChannel) Files.newByteChannel(outputFileSpansData, StandardOpenOption.READ, StandardOpenOption.WRITE, StandardOpenOption.CREATE); + } + + public void beginRecord(int count) throws IOException { + stateStartOffset = outputChannel.position(); + stateLength = 0; + + work.clear(); + work.put((byte) count); + work.flip(); + + while (work.hasRemaining()) + stateLength += outputChannel.write(work); + } + + public void writeSpan(byte spanCode, ByteBuffer sequenceData) throws IOException { + work.clear(); + work.put(spanCode); + work.putShort((short) sequenceData.remaining()); + work.flip(); + + while (work.hasRemaining() || sequenceData.hasRemaining()) { + stateLength += (int) outputChannel.write(new ByteBuffer[]{work, sequenceData}); + } + } + + public long endRecord() { + return SpansCodec.encode(stateStartOffset, stateLength); + } + + @Override + public void close() throws IOException { + outputChannel.close(); + } +} diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/spans/SpansCodec.java b/code/index/index-forward/java/nu/marginalia/index/forward/spans/SpansCodec.java new file mode 100644 index 00000000..7330f593 --- /dev/null +++ b/code/index/index-forward/java/nu/marginalia/index/forward/spans/SpansCodec.java @@ -0,0 +1,17 @@ +package nu.marginalia.index.forward.spans; + +public class SpansCodec { + public static long encode(long startOffset, long size) { + assert size < 0x1000_0000L : "Size must be less than 2^28"; + + return startOffset << 28 | (size & 0xFFF_FFFFL); + } + + public static long decodeStartOffset(long encoded) { + return encoded >>> 28; + } + + public static long decodeSize(long encoded) { + return encoded & 0x0FFF_FFFFL; + } +} diff --git a/code/index/index-forward/readme.md b/code/index/index-forward/readme.md index 39e272e5..58dadfc3 100644 --- a/code/index/index-forward/readme.md +++ b/code/index/index-forward/readme.md @@ -17,5 +17,5 @@ so it's relatively easy to construct. ## Central Classes -* [ForwardIndexConverter](java/nu/marginalia/index/forward/ForwardIndexConverter.java) constructs the index. +* [ForwardIndexConverter](java/nu/marginalia/index/forward/construction/ForwardIndexConverter.java) constructs the index. * [ForwardIndexReader](java/nu/marginalia/index/forward/ForwardIndexReader.java) interrogates the index. 
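
(For orientation, the spans file written above has a simple framing: per record, one count byte, then for each span a one-byte tag code, a two-byte length, and the varint-coded payload. SpansCodec packs each record's address into a single long, offset in the top 36 bits and size in the bottom 28, hence the 2^28 assert. A quick round-trip sanity check, not part of the diff:)

    public class SpansCodecCheck {
        public static void main(String[] args) {
            long offset = 1_234_567L; // byte offset of a record in fwd-spans.dat
            long size = 4_096L;       // record length; must stay below 1 << 28

            long encoded = offset << 28 | (size & 0xFFF_FFFFL);

            // mirrors SpansCodec.decodeStartOffset / decodeSize
            System.out.println((encoded >>> 28) == offset);       // true
            System.out.println((encoded & 0x0FFF_FFFFL) == size); // true
        }
    }
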
\ No newline at end of file diff --git a/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexConverterTest.java b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexConverterTest.java index de571664..59026876 100644 --- a/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexConverterTest.java +++ b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexConverterTest.java @@ -2,11 +2,11 @@ package nu.marginalia.index.forward; import lombok.SneakyThrows; import nu.marginalia.index.domainrankings.DomainRankings; -import nu.marginalia.index.journal.model.IndexJournalEntry; -import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile; -import nu.marginalia.index.journal.writer.IndexJournalWriter; -import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl; +import nu.marginalia.index.forward.construction.ForwardIndexConverter; +import nu.marginalia.index.journal.IndexJournal; +import nu.marginalia.index.journal.IndexJournalSlopWriter; import nu.marginalia.model.id.UrlIdCodec; +import nu.marginalia.model.processed.SlopDocumentRecord; import nu.marginalia.process.control.FakeProcessHeartbeat; import nu.marginalia.test.TestUtil; import org.junit.jupiter.api.AfterEach; @@ -18,91 +18,100 @@ import org.slf4j.LoggerFactory; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.util.stream.IntStream; +import java.util.List; import static org.junit.jupiter.api.Assertions.assertEquals; class ForwardIndexConverterTest { - IndexJournalWriter writer; + IndexJournalSlopWriter writer; - Path indexFile; Path wordsFile1; Path urlsFile1; Path dictionaryFile; + Path workDir; + private final Logger logger = LoggerFactory.getLogger(getClass()); Path dataDir; private Path docsFileId; private Path docsFileData; + private Path docsSpanData; int workSetSize = 512; @BeforeEach @SneakyThrows void setUp() { + + workDir = Files.createTempDirectory(getClass().getSimpleName()); + dictionaryFile = Files.createTempFile("tmp", ".dict"); dictionaryFile.toFile().deleteOnExit(); - indexFile = Files.createTempFile("tmp", ".idx"); - indexFile.toFile().deleteOnExit(); - writer = new IndexJournalWriterSingleFileImpl(indexFile); - wordsFile1 = Files.createTempFile("words1", ".idx"); urlsFile1 = Files.createTempFile("urls1", ".idx"); dataDir = Files.createTempDirectory(getClass().getSimpleName()); - for (int i = 1; i < workSetSize; i++) { - createEntry(writer, i); + try (var writer = new IndexJournalSlopWriter(IndexJournal.allocateName(workDir), 0)) { + for (int i = 1; i < workSetSize; i++) { + createEntry(writer, i); + } } - writer.close(); - - docsFileId = dataDir.resolve("docs-i.dat"); docsFileData = dataDir.resolve("docs-d.dat"); + docsSpanData = dataDir.resolve("docs-s.dat"); } @AfterEach public void tearDown() { TestUtil.clearTempDir(dataDir); - } - - public int[] getFactorsI(int id) { - return IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray(); + TestUtil.clearTempDir(workDir); } long createId(long url, long domain) { return UrlIdCodec.encodeId((int) domain, (int) url); } - public void createEntry(IndexJournalWriter writer, int id) { - int[] factors = getFactorsI(id); + public void createEntry(IndexJournalSlopWriter writer, int id) { + writer.put( + createId(id, id/20), + new SlopDocumentRecord.KeywordsProjection( + "", + -1, + id%3, + id%5, + 15, + List.of(), + new byte[0], + List.of(), + new byte[0], + List.of() + ) + ); - var entryBuilder = 
IndexJournalEntry.builder(createId(id, id/20), id%5); - for (int i = 0; i+1 < factors.length; i+=2) { - entryBuilder.add(factors[i], -factors[i+1]); - } - - writer.put(entryBuilder.build()); } @Test void testForwardIndex() throws IOException { - new ForwardIndexConverter(new FakeProcessHeartbeat(), new IndexJournalReaderSingleFile(indexFile), docsFileId, docsFileData, new DomainRankings()).convert(); + new ForwardIndexConverter(new FakeProcessHeartbeat(), + docsFileId, + docsFileData, + docsSpanData, + IndexJournal.findJournal(workDir).orElseThrow(), + new DomainRankings()).convert(); - var forwardReader = new ForwardIndexReader(docsFileId, docsFileData); + var forwardReader = new ForwardIndexReader(docsFileId, docsFileData, docsSpanData); for (int i = 36; i < workSetSize; i++) { long docId = createId(i, i/20); assertEquals(0x00FF000000000000L | (i % 5), forwardReader.getDocMeta(docId)); + assertEquals((i % 3), forwardReader.getHtmlFeatures(docId)); assertEquals(i/20, UrlIdCodec.getDomainId(docId)); } - } - - } \ No newline at end of file diff --git a/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java new file mode 100644 index 00000000..b668d1c7 --- /dev/null +++ b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java @@ -0,0 +1,69 @@ +package nu.marginalia.index.forward; + +import nu.marginalia.index.forward.spans.ForwardIndexSpansReader; +import nu.marginalia.index.forward.spans.ForwardIndexSpansWriter; +import nu.marginalia.language.sentence.tag.HtmlTag; +import nu.marginalia.sequence.VarintCodedSequence; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.lang.foreign.Arena; +import java.nio.ByteBuffer; +import java.nio.file.Files; +import java.nio.file.Path; + +import static org.junit.jupiter.api.Assertions.*; + +class ForwardIndexSpansReaderTest { + Path testFile = Files.createTempFile("test", ".idx"); + + ForwardIndexSpansReaderTest() throws IOException { + } + + @AfterEach + public void tearDown() throws IOException { + Files.deleteIfExists(testFile); + } + + @Test + void testSunnyDay() throws IOException { + ByteBuffer wa = ByteBuffer.allocate(32); + + long offset1; + long offset2; + try (var writer = new ForwardIndexSpansWriter(testFile)) { + writer.beginRecord(1); + writer.writeSpan(HtmlTag.HEADING.code, VarintCodedSequence.generate(1, 3, 5, 8).buffer()); + offset1 = writer.endRecord(); + + writer.beginRecord(2); + writer.writeSpan(HtmlTag.CODE.code, VarintCodedSequence.generate(2, 4, 6, 7).buffer()); + writer.writeSpan(HtmlTag.ANCHOR.code, VarintCodedSequence.generate(3, 5).buffer()); + offset2 = writer.endRecord(); + } + + try (var reader = new ForwardIndexSpansReader(testFile); + var arena = Arena.ofConfined() + ) { + var spans1 = reader.readSpans(arena, offset1); + var spans2 = reader.readSpans(arena, offset2); + + assertEquals(2, spans1.heading.size()); + + assertEquals(2, spans2.code.size()); + + assertFalse(spans2.code.containsPosition(1)); + assertTrue(spans2.code.containsPosition(3)); + assertFalse(spans2.code.containsPosition(5)); + assertTrue(spans2.code.containsPosition(6)); + assertFalse(spans2.code.containsPosition(7)); + assertFalse(spans2.code.containsPosition(8)); + + assertEquals(1, spans2.anchor.size()); + + assertEquals(0, spans2.title.size()); + assertFalse(spans2.title.containsPosition(8)); + } + } +} \ No newline at end 
of file diff --git a/code/index/index-forward/test/nu/marginalia/test/TestUtil.java b/code/index/index-forward/test/nu/marginalia/test/TestUtil.java deleted file mode 100644 index 8fbf6b54..00000000 --- a/code/index/index-forward/test/nu/marginalia/test/TestUtil.java +++ /dev/null @@ -1,43 +0,0 @@ -package nu.marginalia.test; - - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Arrays; - -public class TestUtil { - public static void clearTempDir(Path dir) { - if (Files.isDirectory(dir)) { - for (File f : dir.toFile().listFiles()) { - File[] files = f.listFiles(); - if (files != null) { - Arrays.stream(files).map(File::toPath).forEach(TestUtil::clearTempDir); - } - System.out.println("Deleting " + f + " (" + fileSize(f.toPath()) + ")"); - f.delete(); - } - } - System.out.println("Deleting " + dir); - dir.toFile().delete(); - } - - private static String fileSize(Path path) { - try { - long sizeBytes = Files.size(path); - - if (sizeBytes > 1024 * 1024 * 1024) return round(sizeBytes / 1073741824.) + "Gb"; - if (sizeBytes > 1024 * 1024) return round(sizeBytes / 1048576.) + "Mb"; - if (sizeBytes > 1024) return round(sizeBytes / 1024.) + "Kb"; - return sizeBytes + "b"; - } - catch (IOException ex) { - throw new RuntimeException(ex); - } - } - - private static String round(double d) { - return String.format("%.2f", d); - } -} diff --git a/code/index/index-journal/build.gradle b/code/index/index-journal/build.gradle index 5380c0be..012f027f 100644 --- a/code/index/index-journal/build.gradle +++ b/code/index/index-journal/build.gradle @@ -13,16 +13,22 @@ java { apply from: "$rootProject.projectDir/srcsets.gradle" dependencies { + implementation project(':code:libraries:coded-sequence') implementation project(':code:libraries:array') implementation project(':code:common:model') + implementation project(':code:processes:converting-process:model') + implementation project(':third-party:parquet-floor') + implementation project(':third-party:commons-codec') implementation libs.bundles.slf4j + implementation libs.slop implementation libs.prometheus implementation libs.notnull implementation libs.guava implementation libs.trove implementation libs.zstd + implementation libs.fastutil implementation libs.commons.lang3 implementation libs.roaringbitmap diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournal.java b/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournal.java new file mode 100644 index 00000000..2f3294e2 --- /dev/null +++ b/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournal.java @@ -0,0 +1,50 @@ +package nu.marginalia.index.journal; + +import nu.marginalia.slop.SlopTable; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; + +public record IndexJournal(Path journalDir) { + + public static final String JOURNAL_FILE_NAME = "index-journal"; + + public static Path allocateName(Path base) { + return base.resolve(JOURNAL_FILE_NAME); + } + + /** Returns the journal file in the base directory. */ + public static Optional findJournal(Path baseDirectory) { + Path journal = baseDirectory.resolve(JOURNAL_FILE_NAME); + if (Files.isDirectory(journal)) { + return Optional.of(new IndexJournal(journal)); + } + return Optional.empty(); + } + + /** Returns the number of versions of the journal file in the base directory. 
*/ + public static int numPages(Path baseDirectory) { + return SlopTable.getNumPages(baseDirectory, IndexJournalPage.combinedId); + } + + public IndexJournal { + if (!journalDir.toFile().isDirectory()) { + throw new IllegalArgumentException("Invalid journal directory: " + journalDir); + } + } + + public List pages() { + int pages = numPages(journalDir); + + List instances = new ArrayList<>(pages); + + for (int version = 0; version < pages; version++) { + instances.add(new IndexJournalPage(journalDir, version)); + } + + return instances; + } +} diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalFileNames.java b/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalFileNames.java deleted file mode 100644 index 8702be34..00000000 --- a/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalFileNames.java +++ /dev/null @@ -1,30 +0,0 @@ -package nu.marginalia.index.journal; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.ArrayList; -import java.util.List; - -public class IndexJournalFileNames { - public static Path allocateName(Path base, int idx) { - return base.resolve(String.format("page-index-%04d.dat", idx)); - } - - public static List findJournalFiles(Path baseDirectory) throws IOException { - List ret = new ArrayList<>(); - - try (var listStream = Files.list(baseDirectory)) { - listStream - .filter(IndexJournalFileNames::isJournalFile) - .sorted() - .forEach(ret::add); - } - - return ret; - } - - public static boolean isJournalFile(Path file) { - return file.toFile().getName().matches("page-index-\\d{4}.dat"); - } -} diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalPage.java b/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalPage.java new file mode 100644 index 00000000..5732d1c0 --- /dev/null +++ b/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalPage.java @@ -0,0 +1,69 @@ +package nu.marginalia.index.journal; + +import nu.marginalia.sequence.slop.VarintCodedSequenceArrayColumn; +import nu.marginalia.slop.SlopTable; +import nu.marginalia.slop.column.array.ByteArrayColumn; +import nu.marginalia.slop.column.array.LongArrayColumn; +import nu.marginalia.slop.column.primitive.IntColumn; +import nu.marginalia.slop.column.primitive.LongColumn; +import nu.marginalia.slop.desc.StorageType; + +import java.io.IOException; +import java.nio.file.Path; + +public record IndexJournalPage(Path baseDir, int page) { + public static IntColumn features = new IntColumn("features", StorageType.PLAIN); + public static IntColumn size = new IntColumn("size", StorageType.PLAIN); + public static LongColumn combinedId = new LongColumn("combinedId", StorageType.PLAIN); + public static LongColumn documentMeta = new LongColumn("documentMeta", StorageType.PLAIN); + + public static LongArrayColumn termIds = new LongArrayColumn("termIds", StorageType.ZSTD); + public static ByteArrayColumn termMeta = new ByteArrayColumn("termMetadata", StorageType.ZSTD); + public static VarintCodedSequenceArrayColumn positions = new VarintCodedSequenceArrayColumn("termPositions", StorageType.ZSTD); + + public static ByteArrayColumn spanCodes = new ByteArrayColumn("spanCodes", StorageType.ZSTD); + public static VarintCodedSequenceArrayColumn spans = new VarintCodedSequenceArrayColumn("spans", StorageType.ZSTD); + + public IndexJournalPage { + if (!baseDir.toFile().isDirectory()) { + throw new IllegalArgumentException("Invalid base directory: " + 
baseDir); + } + } + + public LongColumn.Reader openCombinedId(SlopTable table) throws IOException { + return combinedId.open(table); + } + + public LongColumn.Reader openDocumentMeta(SlopTable table) throws IOException { + return documentMeta.open(table); + } + + public IntColumn.Reader openFeatures(SlopTable table) throws IOException { + return features.open(table); + } + + public IntColumn.Reader openSize(SlopTable table) throws IOException { + return size.open(table); + } + + + public LongArrayColumn.Reader openTermIds(SlopTable table) throws IOException { + return termIds.open(table); + } + + public ByteArrayColumn.Reader openTermMetadata(SlopTable table) throws IOException { + return termMeta.open(table); + } + + public VarintCodedSequenceArrayColumn.Reader openTermPositions(SlopTable table) throws IOException { + return positions.open(table); + } + + public VarintCodedSequenceArrayColumn.Reader openSpans(SlopTable table) throws IOException { + return spans.open(table); + } + + public ByteArrayColumn.Reader openSpanCodes(SlopTable table) throws IOException { + return spanCodes.open(table); + } +} diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalSlopWriter.java b/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalSlopWriter.java new file mode 100644 index 00000000..44d68979 --- /dev/null +++ b/code/index/index-journal/java/nu/marginalia/index/journal/IndexJournalSlopWriter.java @@ -0,0 +1,95 @@ +package nu.marginalia.index.journal; + +import lombok.SneakyThrows; +import nu.marginalia.hash.MurmurHash3_128; +import nu.marginalia.model.processed.SlopDocumentRecord; +import nu.marginalia.sequence.slop.VarintCodedSequenceArrayColumn; +import nu.marginalia.slop.SlopTable; +import nu.marginalia.slop.column.array.ByteArrayColumn; +import nu.marginalia.slop.column.array.LongArrayColumn; +import nu.marginalia.slop.column.primitive.IntColumn; +import nu.marginalia.slop.column.primitive.LongColumn; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.List; + +public class IndexJournalSlopWriter extends SlopTable { + + private final IntColumn.Writer featuresWriter; + private final IntColumn.Writer sizeWriter; + private final LongColumn.Writer combinedIdWriter; + private final LongColumn.Writer documentMetaWriter; + + private final LongArrayColumn.Writer termIdsWriter; + private final ByteArrayColumn.Writer termMetadataWriter; + private final VarintCodedSequenceArrayColumn.Writer termPositionsWriter; + + private final VarintCodedSequenceArrayColumn.Writer spansWriter; + private final ByteArrayColumn.Writer spanCodesWriter; + + private static final MurmurHash3_128 hash = new MurmurHash3_128(); + + public IndexJournalSlopWriter(Path dir, int page) throws IOException { + + super(dir, page); + + if (!Files.exists(dir)) { + Files.createDirectory(dir); + } + + featuresWriter = IndexJournalPage.features.create(this); + sizeWriter = IndexJournalPage.size.create(this); + + combinedIdWriter = IndexJournalPage.combinedId.create(this); + documentMetaWriter = IndexJournalPage.documentMeta.create(this); + + termIdsWriter = IndexJournalPage.termIds.create(this); + termMetadataWriter = IndexJournalPage.termMeta.create(this); + termPositionsWriter = IndexJournalPage.positions.create(this); + + spanCodesWriter = IndexJournalPage.spanCodes.create(this); + spansWriter = IndexJournalPage.spans.create(this); + } + + @SneakyThrows + public void put(long combinedId, SlopDocumentRecord.KeywordsProjection 
keywordsProjection) { + + combinedIdWriter.put(combinedId); + featuresWriter.put(keywordsProjection.htmlFeatures()); + sizeWriter.put(keywordsProjection.length()); + documentMetaWriter.put(keywordsProjection.documentMetadata()); + + // -- write keyword data -- + + final List keywords = keywordsProjection.words(); + + // termIds are the special hashes of the keywords + long[] termIds = new long[keywordsProjection.words().size()]; + for (int i = 0; i < termIds.length; i++) { + termIds[i] = hash.hashKeyword(keywords.get(i)); + } + + termIdsWriter.put(termIds); + termPositionsWriter.put(keywordsProjection.positions()); + termMetadataWriter.put(keywordsProjection.metas()); + + // -- write spans -- + + spanCodesWriter.put(keywordsProjection.spanCodes()); + spansWriter.put(keywordsProjection.spans()); + } + + public void close() throws IOException { + featuresWriter.close(); + sizeWriter.close(); + combinedIdWriter.close(); + documentMetaWriter.close(); + termIdsWriter.close(); + termMetadataWriter.close(); + termPositionsWriter.close(); + spansWriter.close(); + spanCodesWriter.close(); + } +} diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntry.java b/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntry.java deleted file mode 100644 index 7d4944ac..00000000 --- a/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntry.java +++ /dev/null @@ -1,27 +0,0 @@ -package nu.marginalia.index.journal.model; - -import nu.marginalia.model.id.UrlIdCodec; - -/** An entry in the index journal. - * - * @param header the header of the entry, containing document level data - * @param data the data of the entry, containing keyword level data - * - * @see IndexJournalEntryHeader - * @see IndexJournalEntryData - */ -public record IndexJournalEntry(IndexJournalEntryHeader header, IndexJournalEntryData data) { - - public static IndexJournalEntryBuilder builder(long documentId, long documentMeta) { - return new IndexJournalEntryBuilder(0, documentId, documentMeta); - } - - public static IndexJournalEntryBuilder builder(int domainId, - int urlId, - long documentMeta) { - - - return builder(UrlIdCodec.encodeId(domainId, urlId), documentMeta); - } - -} diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryBuilder.java b/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryBuilder.java deleted file mode 100644 index 6bfa19ea..00000000 --- a/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryBuilder.java +++ /dev/null @@ -1,37 +0,0 @@ -package nu.marginalia.index.journal.model; - -import gnu.trove.list.array.TLongArrayList; - -public class IndexJournalEntryBuilder { - private final long documentId; - private final int documentFeatures; - private final long documentMeta; - private final TLongArrayList items = new TLongArrayList(); - - public IndexJournalEntryBuilder( - int documentFeatures, - long documentId, - long documentMeta) { - this.documentFeatures = documentFeatures; - this.documentId = documentId; - this.documentMeta = documentMeta; - } - - public IndexJournalEntryBuilder add(long wordId, long metadata) { - - items.add(wordId); - items.add(metadata); - - return this; - } - - public IndexJournalEntry build() { - return new IndexJournalEntry( - new IndexJournalEntryHeader(items.size(), - documentFeatures, - documentId, - documentMeta), - new IndexJournalEntryData(items.toArray()) - ); - } -} diff --git 
a/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryData.java b/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryData.java deleted file mode 100644 index 26c10c2a..00000000 --- a/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryData.java +++ /dev/null @@ -1,77 +0,0 @@ -package nu.marginalia.index.journal.model; - -import nu.marginalia.index.journal.reader.IndexJournalReader; -import nu.marginalia.model.idx.WordMetadata; - -import java.util.Arrays; -import java.util.Iterator; - -/** The keyword data of an index journal entry. - * The data itself is an interleaved array of - * word ids and metadata. - *
<p> - * Odd entries are term ids, even entries are encoded WordMetadata records. - * </p> - * <p>The civilized way of reading the journal data is to use an IndexJournalReader</p>
- * - * @see WordMetadata - * @see IndexJournalReader - */ -public class IndexJournalEntryData implements Iterable { - private final int size; - public final long[] underlyingArray; - - public static final int MAX_LENGTH = 1000; - public static final int ENTRY_SIZE = 2; - - public IndexJournalEntryData(long[] underlyingArray) { - this.size = underlyingArray.length; - this.underlyingArray = underlyingArray; - } - - public IndexJournalEntryData(int size, long[] underlyingArray) { - this.size = size; - this.underlyingArray = underlyingArray; - } - - public long get(int idx) { - if (idx >= size) - throw new ArrayIndexOutOfBoundsException(idx + " vs " + size); - return underlyingArray[idx]; - } - - public int size() { - return size; - } - public long[] toArray() { - if (size == underlyingArray.length) - return underlyingArray; - else - return Arrays.copyOf(underlyingArray, size); - } - - public String toString() { - return String.format("%s[%s]", getClass().getSimpleName(), Arrays.toString(toArray())); - } - - public Iterator iterator() { - return new EntryIterator(); - } - - private class EntryIterator implements Iterator { - int pos = -ENTRY_SIZE; - - public boolean hasNext() { - return pos + 2*ENTRY_SIZE - 1 < size; - } - - @Override - public Record next() { - pos+=ENTRY_SIZE; - - return new Record(underlyingArray[pos], underlyingArray[pos+1]); - } - } - - public record Record(long wordId, long metadata) {} -} diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryHeader.java b/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryHeader.java deleted file mode 100644 index b0f3d41e..00000000 --- a/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalEntryHeader.java +++ /dev/null @@ -1,32 +0,0 @@ -package nu.marginalia.index.journal.model; - -import nu.marginalia.model.crawl.HtmlFeature; -import nu.marginalia.model.id.UrlIdCodec; -import nu.marginalia.model.idx.DocumentMetadata; - -/** The header of an index journal entry. - * - * @param entrySize the size of the entry - * @param documentFeatures the features of the document, as an encoded HtmlFeature - * @param combinedId the combined document id, encoded with UrlIdCodec - * @param documentMeta the metadata of the document, as an encoded DocumentMetadata - * - * @see DocumentMetadata - * @see HtmlFeature - * @see UrlIdCodec - */ -public record IndexJournalEntryHeader(int entrySize, - int documentFeatures, - long combinedId, - long documentMeta) { - - public IndexJournalEntryHeader(long combinedId, - int documentFeatures, - long documentMeta) { - this(-1, - documentFeatures, - combinedId, - documentMeta); - } - -} diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalFileHeader.java b/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalFileHeader.java deleted file mode 100644 index 7a4ca7e0..00000000 --- a/code/index/index-journal/java/nu/marginalia/index/journal/model/IndexJournalFileHeader.java +++ /dev/null @@ -1,10 +0,0 @@ -package nu.marginalia.index.journal.model; - -/** The header of an index journal file. This is the first 16 bytes of the file, - * and is not compressed. 
- * - * @param fileSizeRecords the size of the file in number of records - * @param reserved should be 0 - */ -public record IndexJournalFileHeader(long fileSizeRecords, long reserved) { -} diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReadEntry.java b/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReadEntry.java deleted file mode 100644 index 625267d1..00000000 --- a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReadEntry.java +++ /dev/null @@ -1,72 +0,0 @@ -package nu.marginalia.index.journal.reader; - -import nu.marginalia.index.journal.model.IndexJournalEntryData; -import nu.marginalia.index.journal.model.IndexJournalEntryHeader; -import nu.marginalia.model.id.UrlIdCodec; - -import java.io.DataInputStream; -import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.LongBuffer; - -public class IndexJournalReadEntry { - public final IndexJournalEntryHeader header; - - private final long[] buffer; - - public IndexJournalReadEntry(IndexJournalEntryHeader header, long[] buffer) { - this.header = header; - this.buffer = buffer; - } - - - record WorkArea(byte[] bytes, LongBuffer buffer) { - WorkArea(byte[] bytes) { - this(bytes, ByteBuffer.wrap(bytes).asLongBuffer()); - } - WorkArea() { - this(new byte[8*65536]); - } - } - - static ThreadLocal pool = ThreadLocal.withInitial(WorkArea::new); - - public static IndexJournalReadEntry read(DataInputStream inputStream) throws IOException { - - final long sizeBlock = inputStream.readLong(); - final long docId = inputStream.readLong(); - final long meta = inputStream.readLong(); - - var header = new IndexJournalEntryHeader( - (int) (sizeBlock >>> 32L), - (int) (sizeBlock & 0xFFFF_FFFFL), - docId, - meta); - - var workArea = pool.get(); - inputStream.readFully(workArea.bytes, 0, 8 * header.entrySize()); - - long[] out = new long[header.entrySize()]; - workArea.buffer.get(0, out, 0, out.length); - - return new IndexJournalReadEntry(header, out); - - } - - public long docId() { - return header.combinedId(); - } - - public long docMeta() { - return header.documentMeta(); - } - - public int domainId() { - return UrlIdCodec.getDomainId(docId()); - } - - public IndexJournalEntryData readEntry() { - return new IndexJournalEntryData(header.entrySize(), buffer); - } - -} diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReader.java b/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReader.java deleted file mode 100644 index 14e686b3..00000000 --- a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReader.java +++ /dev/null @@ -1,69 +0,0 @@ -package nu.marginalia.index.journal.reader; - -import nu.marginalia.index.journal.reader.pointer.IndexJournalPointer; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.function.LongConsumer; -import java.util.function.LongPredicate; - -/** Tools for reading the index journal. */ -public interface IndexJournalReader { - int FILE_HEADER_SIZE_LONGS = 2; - int FILE_HEADER_SIZE_BYTES = 8 * FILE_HEADER_SIZE_LONGS; - - /** Create a reader for a single file. */ - static IndexJournalReader singleFile(Path fileName) throws IOException { - return new IndexJournalReaderSingleFile(fileName); - } - - /** Create a reader for a set of files. 
*/ - static IndexJournalReader paging(Path baseDir) throws IOException { - return new IndexJournalReaderPagingImpl(baseDir); - } - - default void forEachWordId(LongConsumer consumer) { - var ptr = this.newPointer(); - while (ptr.nextDocument()) { - while (ptr.nextRecord()) { - consumer.accept(ptr.wordId()); - } - } - } - - default void forEachDocId(LongConsumer consumer) { - var ptr = this.newPointer(); - while (ptr.nextDocument()) { - consumer.accept(ptr.documentId()); - } - } - - /** Create a new pointer to the journal. The IndexJournalPointer is - * a two-tiered iterator that allows both iteration over document records - * and their keywords - */ - IndexJournalPointer newPointer(); - - /** Reader that filters the entries based on the term metadata. */ - default IndexJournalReader filtering(LongPredicate termMetaFilter) { - return new FilteringIndexJournalReader(this, termMetaFilter); - } - -} - -class FilteringIndexJournalReader implements IndexJournalReader { - private final IndexJournalReader base; - private final LongPredicate termMetaFilter; - - FilteringIndexJournalReader(IndexJournalReader base, LongPredicate termMetaFilter) { - this.base = base; - this.termMetaFilter = termMetaFilter; - } - - @Override - public IndexJournalPointer newPointer() { - return base - .newPointer() - .filterWordMeta(termMetaFilter); - } -} \ No newline at end of file diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReaderPagingImpl.java b/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReaderPagingImpl.java deleted file mode 100644 index d5ba23b8..00000000 --- a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReaderPagingImpl.java +++ /dev/null @@ -1,40 +0,0 @@ -package nu.marginalia.index.journal.reader; - -import nu.marginalia.index.journal.reader.pointer.IndexJournalPointer; -import nu.marginalia.index.journal.IndexJournalFileNames; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.ArrayList; -import java.util.List; - -public class IndexJournalReaderPagingImpl implements IndexJournalReader { - - private static final Logger logger = LoggerFactory.getLogger(IndexJournalReaderPagingImpl.class); - private final List readers; - - public IndexJournalReaderPagingImpl(Path baseDir) throws IOException { - var inputFiles = IndexJournalFileNames.findJournalFiles(baseDir); - if (inputFiles.isEmpty()) - logger.warn("Creating paging index journal file in {}, found no inputs!", baseDir); - else - logger.info("Creating paging index journal reader for {} inputs", inputFiles.size()); - - this.readers = new ArrayList<>(inputFiles.size()); - - for (var inputFile : inputFiles) { - readers.add(new IndexJournalReaderSingleFile(inputFile)); - } - } - - @Override - public IndexJournalPointer newPointer() { - return IndexJournalPointer.concatenate( - readers.stream() - .map(IndexJournalReader::newPointer) - .toArray(IndexJournalPointer[]::new) - ); - } -} diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReaderSingleFile.java b/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReaderSingleFile.java deleted file mode 100644 index a131a788..00000000 --- a/code/index/index-journal/java/nu/marginalia/index/journal/reader/IndexJournalReaderSingleFile.java +++ /dev/null @@ -1,130 +0,0 @@ -package nu.marginalia.index.journal.reader; - -import com.github.luben.zstd.ZstdInputStream; 
-import lombok.SneakyThrows; -import nu.marginalia.index.journal.model.IndexJournalEntryData; -import nu.marginalia.index.journal.model.IndexJournalFileHeader; -import nu.marginalia.index.journal.reader.pointer.IndexJournalPointer; - -import java.io.*; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.StandardOpenOption; - -public class IndexJournalReaderSingleFile implements IndexJournalReader { - - private Path journalFile; - public final IndexJournalFileHeader fileHeader; - - @Override - public String toString() { - return "IndexJournalReaderSingleCompressedFile{" + journalFile + " }"; - } - - public IndexJournalReaderSingleFile(Path file) throws IOException { - this.journalFile = file; - - fileHeader = readHeader(file); - } - - private static IndexJournalFileHeader readHeader(Path file) throws IOException { - try (var raf = new RandomAccessFile(file.toFile(), "r")) { - long unused = raf.readLong(); - long wordCount = raf.readLong(); - - return new IndexJournalFileHeader(unused, wordCount); - } - } - - private static DataInputStream createInputStream(Path file) throws IOException { - var fileInputStream = Files.newInputStream(file, StandardOpenOption.READ); - - // skip the header - fileInputStream.skipNBytes(16); - - return new DataInputStream(new ZstdInputStream(new BufferedInputStream(fileInputStream))); - } - - @SneakyThrows - @Override - public IndexJournalPointer newPointer() { - return new SingleFileJournalPointer(fileHeader, createInputStream(journalFile)); - } - -} - -class SingleFileJournalPointer implements IndexJournalPointer { - - private final IndexJournalFileHeader fileHeader; - private final DataInputStream dataInputStream; - private IndexJournalReadEntry entry; - private IndexJournalEntryData entryData; - private int recordIdx = -2; - private int docIdx = -1; - - public SingleFileJournalPointer( - IndexJournalFileHeader fileHeader, - DataInputStream dataInputStream) - { - this.fileHeader = fileHeader; - this.dataInputStream = dataInputStream; - } - - @SneakyThrows - @Override - public boolean nextDocument() { - recordIdx = -2; - entryData = null; - - if (++docIdx < fileHeader.fileSizeRecords()) { - entry = IndexJournalReadEntry.read(dataInputStream); - return true; - } - - dataInputStream.close(); - - return false; - } - - @Override - public boolean nextRecord() { - if (entryData == null) { - entryData = entry.readEntry(); - } - - recordIdx += 2; - if (recordIdx < entryData.size()) { - return true; - } - return false; - } - - @Override - public long documentId() { - return entry.docId(); - } - - @Override - public long documentMeta() { - return entry.docMeta(); - } - - @Override - public long wordId() { - return entryData.get(recordIdx); - } - - @Override - public long wordMeta() { - return entryData.get(recordIdx + 1); - } - - @Override - public int documentFeatures() { - if (entryData == null) { - entryData = entry.readEntry(); - } - - return entry.header.documentFeatures(); - } -} \ No newline at end of file diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/reader/pointer/IndexJournalPointer.java b/code/index/index-journal/java/nu/marginalia/index/journal/reader/pointer/IndexJournalPointer.java deleted file mode 100644 index 37100335..00000000 --- a/code/index/index-journal/java/nu/marginalia/index/journal/reader/pointer/IndexJournalPointer.java +++ /dev/null @@ -1,167 +0,0 @@ -package nu.marginalia.index.journal.reader.pointer; - -import java.util.function.LongPredicate; - -/** - * This is something like a double 
iterator. The Index Journal consists of
- * blocks of words and word-metadata for each document and document metadata.
- * <br>
- * Perhaps best conceptualized as something like
- *
- * <pre>[doc1: word1 word2 word3 word4] [doc2: word1 word2 word3 ]</pre>
- * nextDocument() will move the pointer from doc1 to doc2;
- * nextRecord() will move the pointer from word1 to word2...
- */ -public interface IndexJournalPointer { - /** - * Advance to the next document in the journal, - * returning true if such a document exists. - * Resets the record index to before the first - * record (if it exists). - */ - boolean nextDocument(); - - /** - * Advance to the next record in the journal - */ - boolean nextRecord(); - - /** - * Get the id associated with the current document - */ - long documentId(); - - /** - * Get the metadata associated with the current document - */ - long documentMeta(); - - /** - * Get the wordId associated with the current record - */ - long wordId(); - - /** - * Get the termMeta associated with the current record - */ - long wordMeta(); - - /** - * Get the documentFeatures associated with the current record - */ - int documentFeatures(); - - /** Concatenate a number of journal pointers */ - static IndexJournalPointer concatenate(IndexJournalPointer... pointers) { - if (pointers.length == 1) - return pointers[0]; - - return new JoiningJournalPointer(pointers); - } - - /** Add a filter on word metadata to the pointer */ - default IndexJournalPointer filterWordMeta(LongPredicate filter) { - return new FilteringJournalPointer(this, filter); - } -} - -class JoiningJournalPointer implements IndexJournalPointer { - private final IndexJournalPointer[] pointers; - private int pIndex = 0; - - JoiningJournalPointer(IndexJournalPointer[] pointers) { - this.pointers = pointers; - } - - @Override - public boolean nextDocument() { - - while (pIndex < pointers.length) { - if (pointers[pIndex].nextDocument()) - return true; - else pIndex++; - } - - return false; - } - - @Override - public boolean nextRecord() { - return pointers[pIndex].nextRecord(); - } - - @Override - public long documentId() { - return pointers[pIndex].documentId(); - } - - @Override - public long documentMeta() { - return pointers[pIndex].documentMeta(); - } - - @Override - public long wordId() { - return pointers[pIndex].wordId(); - } - - @Override - public long wordMeta() { - return pointers[pIndex].wordMeta(); - } - - @Override - public int documentFeatures() { - return pointers[pIndex].documentFeatures(); - } -} - -class FilteringJournalPointer implements IndexJournalPointer { - private final IndexJournalPointer base; - private final LongPredicate filter; - - FilteringJournalPointer(IndexJournalPointer base, LongPredicate filter) { - this.base = base; - this.filter = filter; - } - - @Override - public boolean nextDocument() { - return base.nextDocument(); - } - - @Override - public boolean nextRecord() { - while (base.nextRecord()) { - if (filter.test(wordMeta())) - return true; - } - return false; - } - - @Override - public long documentId() { - return base.documentId(); - } - - @Override - public long documentMeta() { - return base.documentMeta(); - } - - @Override - public long wordId() { - return base.wordId(); - } - - @Override - public long wordMeta() { - return base.wordMeta(); - } - - @Override - public int documentFeatures() { - return base.documentFeatures(); - } -} \ No newline at end of file diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriter.java b/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriter.java deleted file mode 100644 index 9d6966ef..00000000 --- a/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriter.java +++ /dev/null @@ -1,29 +0,0 @@ -package nu.marginalia.index.journal.writer; - -import nu.marginalia.index.journal.model.IndexJournalEntry; -import 
nu.marginalia.index.journal.model.IndexJournalEntryData; -import nu.marginalia.index.journal.model.IndexJournalEntryHeader; - -import java.io.IOException; - -/** Responsible for writing to the index journal. - *
- * <p></p>
- * @see IndexJournalWriterSingleFileImpl - * @see IndexJournalWriterPagingImpl - */ -public interface IndexJournalWriter extends AutoCloseable { - /** Write an entry to the journal. - * - * @param header the header of the entry - * @param entry the data of the entry - * - * @return the number of bytes written - */ - int put(IndexJournalEntryHeader header, IndexJournalEntryData entry); - default int put(IndexJournalEntry entry) { - return put(entry.header(), entry.data()); - } - - void close() throws IOException; - -} diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriterPagingImpl.java b/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriterPagingImpl.java deleted file mode 100644 index 81d9de1e..00000000 --- a/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriterPagingImpl.java +++ /dev/null @@ -1,67 +0,0 @@ -package nu.marginalia.index.journal.writer; - -import lombok.SneakyThrows; -import nu.marginalia.index.journal.model.IndexJournalEntryData; -import nu.marginalia.index.journal.model.IndexJournalEntryHeader; -import nu.marginalia.index.journal.IndexJournalFileNames; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.nio.file.Path; - -/** IndexJournalWriter implementation that creates a sequence of journal files, - * delegating to IndexJournalWriterSingleFileImpl to write the individual files. - * - */ -public class IndexJournalWriterPagingImpl implements IndexJournalWriter { - private final Path outputDir; - private int fileNumber = 0; - - /** The maximum size of a journal file, in uncompressed bytes. - * This should be safely below 2 GB, since we assume in the construction - * of the index that this is the case! The smaller these files are, the - * slower the index construction will be, but at the same time, if 2 GB - * is exceeded, the index construction will *quietly* fail. - * - * Flap flap, Icarus! 
- */ - private static final long sizeLimitBytes = 1_000_000_000; // 1 GB - - - private final Logger logger = LoggerFactory.getLogger(getClass()); - private IndexJournalWriter currentWriter = null; - private long bytesWritten = 0; - - public IndexJournalWriterPagingImpl(Path outputDir) throws IOException { - this.outputDir = outputDir; - switchToNextWriter(); - - logger.info("Creating Journal Writer {}", outputDir); - } - - private void switchToNextWriter() throws IOException { - if (currentWriter != null) - currentWriter.close(); - - currentWriter = new IndexJournalWriterSingleFileImpl(IndexJournalFileNames.allocateName(outputDir, fileNumber++)); - } - - @Override - @SneakyThrows - public int put(IndexJournalEntryHeader header, IndexJournalEntryData entry) { - if (bytesWritten >= sizeLimitBytes) { - bytesWritten = 0; - switchToNextWriter(); - } - - int writtenNow = currentWriter.put(header, entry); - bytesWritten += writtenNow; - - return writtenNow; - } - - public void close() throws IOException { - currentWriter.close(); - } -} diff --git a/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriterSingleFileImpl.java b/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriterSingleFileImpl.java deleted file mode 100644 index beadb30a..00000000 --- a/code/index/index-journal/java/nu/marginalia/index/journal/writer/IndexJournalWriterSingleFileImpl.java +++ /dev/null @@ -1,135 +0,0 @@ -package nu.marginalia.index.journal.writer; - -import com.github.luben.zstd.ZstdDirectBufferCompressingStream; -import lombok.SneakyThrows; -import nu.marginalia.index.journal.model.IndexJournalEntryData; -import nu.marginalia.index.journal.model.IndexJournalEntryHeader; -import nu.marginalia.index.journal.reader.IndexJournalReader; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.*; -import java.nio.ByteBuffer; -import java.nio.channels.FileChannel; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.StandardOpenOption; -import java.nio.file.attribute.PosixFilePermissions; - -/** IndexJournalWriter implementation that creates a single journal file */ -public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{ - - private static final int ZSTD_BUFFER_SIZE = 8192; - private static final int DATA_BUFFER_SIZE = 8192; - - private final ByteBuffer dataBuffer = ByteBuffer.allocateDirect(DATA_BUFFER_SIZE); - - private final ZstdDirectBufferCompressingStream compressingStream; - private final FileChannel fileChannel; - - private int numEntries = 0; - private boolean closed = false; - - private final Logger logger = LoggerFactory.getLogger(getClass()); - - public IndexJournalWriterSingleFileImpl(Path outputFile) throws IOException { - - logger.info("Creating Journal Writer {}", outputFile); - - Files.deleteIfExists(outputFile); - Files.createFile(outputFile, PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--"))); - - fileChannel = FileChannel.open(outputFile, StandardOpenOption.CREATE, - StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING); - - writeHeaderPlaceholder(fileChannel); - - compressingStream = new ZstdDirectBufferCompressingStream(ByteBuffer.allocateDirect(ZSTD_BUFFER_SIZE), 3) { - protected ByteBuffer flushBuffer(ByteBuffer toFlush) throws IOException { - toFlush.flip(); - while (toFlush.hasRemaining()) { - fileChannel.write(toFlush); - } - toFlush.clear(); - - return toFlush; - } - }; - } - - /** The file has a non-compressed header at the 
beginning of the file.
-     * Write a placeholder first to reserve the bytes, and position the
-     * channel after the header
-     */
-    private static void writeHeaderPlaceholder(FileChannel fileStream) throws IOException {
-        var buffer = ByteBuffer.allocate(IndexJournalReader.FILE_HEADER_SIZE_BYTES);
-
-        buffer.position(0);
-        buffer.limit(buffer.capacity());
-
-        while (buffer.hasRemaining())
-            fileStream.write(buffer, buffer.position());
-
-        fileStream.position(IndexJournalReader.FILE_HEADER_SIZE_BYTES);
-    }
-
-    @Override
-    @SneakyThrows
-    public int put(IndexJournalEntryHeader header, IndexJournalEntryData entry) {
-        if (dataBuffer.capacity() - dataBuffer.position() < 3*8) {
-            dataBuffer.flip();
-            compressingStream.compress(dataBuffer);
-            dataBuffer.clear();
-        }
-
-        dataBuffer.putInt(entry.size());
-        dataBuffer.putInt(header.documentFeatures());
-        dataBuffer.putLong(header.combinedId());
-        dataBuffer.putLong(header.documentMeta());
-
-        for (int i = 0; i < entry.size(); ) {
-            int remaining = (dataBuffer.capacity() - dataBuffer.position()) / 8;
-            if (remaining <= 0) {
-                dataBuffer.flip();
-                compressingStream.compress(dataBuffer);
-                dataBuffer.clear();
-            }
-            else while (remaining-- > 0 && i < entry.size()) {
-
-                dataBuffer.putLong(entry.underlyingArray[i++]);
-            }
-        }
-
-        numEntries++;
-
-        final int bytesWritten = 8 * ( /*header = 3 longs */ 3 + entry.size());
-
-        return bytesWritten;
-    }
-
-    public void close() throws IOException {
-        if (closed)
-            return;
-        else
-            closed = true;
-
-        dataBuffer.flip();
-        compressingStream.compress(dataBuffer);
-        dataBuffer.clear();
-        compressingStream.flush();
-        compressingStream.close();
-
-
-        // Finalize the file by writing a header in the beginning
-        ByteBuffer header = ByteBuffer.allocate(16);
-        header.putLong(numEntries);
-        header.putLong(0); // reserved for future use
-        header.flip();
-
-        while (header.position() < header.limit()) {
-            fileChannel.write(header, header.position());
-        }
-
-        fileChannel.close();
-    }
-}
diff --git a/code/index/index-journal/readme.md b/code/index/index-journal/readme.md
index af7059b3..4f6b3360 100644
--- a/code/index/index-journal/readme.md
+++ b/code/index/index-journal/readme.md
@@ -6,19 +6,13 @@
 This journal is written by [processes/loading-process](../../processes/loading-p
 when constructing the [forward](../index-forward)
 and [reverse](../index-reverse) indices.
 
-The journal format is a file header, followed by a zstd-compressed list of entries,
-each containing a header with document-level data, and a data section
-with keyword-level data.
+The journal uses the [Slop library](https://github.com/MarginaliaSearch/SlopData) to store data
+in a columnar fashion.
 
-The journal data may be split into multiple files, and the journal writers and readers
-are designed to handle this transparently via their *Paging* implementation.
+The journal may be split into multiple files to help index
+construction, as a merge strategy is used to reduce the amount
+of RAM required during index construction.
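As a rough sketch of what consuming the split journal looks like after this change, based only on the `IndexJournal.findJournal(...)` and `pages()` calls that appear in `FullIndexConstructor` further down in this diff (the directory path and the listing loop are illustrative, not part of the change):

```java
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.journal.IndexJournalPage;

import java.nio.file.Path;

class JournalPageListing {
    public static void main(String[] args) throws Exception {
        Path sourceBaseDir = Path.of("/tmp/index-data"); // hypothetical location

        // findJournal() yields an empty Optional when no journal
        // has been written under the directory
        var journal = IndexJournal.findJournal(sourceBaseDir);
        if (journal.isEmpty()) {
            System.err.println("No journal files in base dir " + sourceBaseDir);
            return;
        }

        // Each page corresponds to one file of the split journal; index
        // construction pre-indexes the pages in parallel and merges the
        // results pairwise to keep RAM usage bounded
        for (IndexJournalPage page : journal.get().pages()) {
            System.out.println("Found journal page: " + page);
        }
    }
}
```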
-
-## Central Classes
-
-### Model
-* [IndexJournalEntry](java/nu/marginalia/index/journal/model/IndexJournalEntry.java)
-* [IndexJournalEntryHeader](java/nu/marginalia/index/journal/model/IndexJournalEntryHeader.java)
-* [IndexJournalEntryData](java/nu/marginalia/index/journal/model/IndexJournalEntryData.java)
-### I/O
-* [IndexJournalReader](java/nu/marginalia/index/journal/reader/IndexJournalReader.java)
-* [IndexJournalWriter](java/nu/marginalia/index/journal/writer/IndexJournalWriter.java)
\ No newline at end of file
+Unlike most slop data stores, the index journal allows direct access
+to the underlying columns, as the needs of the index construction processes
+are fairly varied.
\ No newline at end of file
diff --git a/code/index/index-journal/test/nu/marginalia/index/journal/IndexJournalTest.java b/code/index/index-journal/test/nu/marginalia/index/journal/IndexJournalTest.java
deleted file mode 100644
index 47e8ac7f..00000000
--- a/code/index/index-journal/test/nu/marginalia/index/journal/IndexJournalTest.java
+++ /dev/null
@@ -1,69 +0,0 @@
-package nu.marginalia.index.journal;
-
-import nu.marginalia.index.journal.model.IndexJournalEntry;
-import nu.marginalia.index.journal.reader.IndexJournalReader;
-import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile;
-import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
-import nu.marginalia.model.id.UrlIdCodec;
-import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
-
-import java.io.IOException;
-import java.nio.file.Files;
-import java.nio.file.Path;
-import java.util.ArrayList;
-import java.util.List;
-
-import static org.junit.jupiter.api.Assertions.assertEquals;
-
-public class IndexJournalTest {
-    Path tempFile;
-    IndexJournalReader reader;
-
-    long firstDocId = UrlIdCodec.encodeId(44, 10);
-    long secondDocId = UrlIdCodec.encodeId(43, 15);
-
-    @BeforeEach
-    public void setUp() throws IOException {
-        tempFile = Files.createTempFile(getClass().getSimpleName(), ".dat");
-
-        var journalWriter = new IndexJournalWriterSingleFileImpl( tempFile);
-        journalWriter.put(IndexJournalEntry.builder(44, 10, 55)
-                .add(1, 2)
-                .add(2, 3)
-                .add(3, 4)
-                .add(5, 6).build());
-
-        journalWriter.put(IndexJournalEntry.builder(43, 15, 10)
-                .add(5, 5)
-                .add(6, 6)
-                .build());
-        journalWriter.close();
-
-        reader = new IndexJournalReaderSingleFile(tempFile);
-    }
-    @AfterEach
-    public void tearDown() throws IOException {
-        Files.delete(tempFile);
-    }
-
-    @Test
-    public void forEachDocId() {
-        List<Long> expected = List.of(firstDocId, secondDocId);
-        List<Long> actual = new ArrayList<>();
-
-        reader.forEachDocId(actual::add);
-        assertEquals(expected, actual);
-    }
-
-    @Test
-    public void forEachWordId() {
-        List<Integer> expected = List.of(1, 2, 3, 5, 5 ,6);
-        List<Integer> actual = new ArrayList<>();
-
-        reader.forEachWordId(i -> actual.add((int) i));
-        assertEquals(expected, actual);
-    }
-
-}
diff --git a/code/index/index-journal/test/nu/marginalia/index/journal/reader/pointer/IndexJournalPointerTest.java b/code/index/index-journal/test/nu/marginalia/index/journal/reader/pointer/IndexJournalPointerTest.java
deleted file mode 100644
index 202a229c..00000000
--- a/code/index/index-journal/test/nu/marginalia/index/journal/reader/pointer/IndexJournalPointerTest.java
+++ /dev/null
@@ -1,133 +0,0 @@
-package nu.marginalia.index.journal.reader.pointer;
-
-import org.junit.jupiter.api.Test;
-
-import java.util.Collection;
-import java.util.List;
-import java.util.ArrayList;
-
-import static org.junit.jupiter.api.Assertions.assertEquals;
-
-class IndexJournalPointerTest {
-
-    @Test
-    public void concatenate() {
-        MockPointer left = new MockPointer(
-                List.of(new MockDocument(1, 2, 3, List.of(
-                        new MockRecord(4, 5),
-                        new MockRecord(6, 7))
-                ))
-        );
-
-        MockPointer right = new MockPointer(
-                List.of(new MockDocument(8, 9, 10, List.of(
-                        new MockRecord(11, 12),
-                        new MockRecord(13, 14))
-                ))
-        );
-
-        IndexJournalPointer concatenated = IndexJournalPointer.concatenate(left, right);
-        List<Long> docIdsSeq = new ArrayList<>();
-        List<Long> wordIdsSeq = new ArrayList<>();
-        while (concatenated.nextDocument()) {
-            docIdsSeq.add(concatenated.documentId());
-            while (concatenated.nextRecord()) {
-                wordIdsSeq.add(concatenated.wordId());
-            }
-        }
-
-        assertEquals(docIdsSeq, List.of(1L, 8L));
-        assertEquals(wordIdsSeq, List.of(4L, 6L, 11L, 13L));
-    }
-
-    @Test
-    public void filter() {
-        MockPointer left = new MockPointer(
-                List.of(new MockDocument(1, 2, 3, List.of(
-                                new MockRecord(1, 1),
-                                new MockRecord(2, 2),
-                                new MockRecord(3, 3),
-                                new MockRecord(4, 4),
-                                new MockRecord(5, 5)
-                        )
-                ), new MockDocument(2, 2, 3, List.of(
-                                new MockRecord(1, 1),
-                                new MockRecord(3, 3),
-                                new MockRecord(5, 5)
-                        )
-                ))
-
-        );
-        var filtered = left.filterWordMeta(meta -> (meta % 2) == 0);
-
-        List<Long> docIdsSeq = new ArrayList<>();
-        List<Long> wordIdsSeq = new ArrayList<>();
-        while (filtered.nextDocument()) {
-            docIdsSeq.add(filtered.documentId());
-            while (filtered.nextRecord()) {
-                wordIdsSeq.add(filtered.wordId());
-            }
-        }
-
-        assertEquals(docIdsSeq, List.of(1L, 2L));
-        assertEquals(wordIdsSeq, List.of(2L, 4L));
-    }
-
-    class MockPointer implements IndexJournalPointer {
-        private final List<MockDocument> documents;
-
-        int di = -1;
-        int ri;
-
-        public MockPointer(Collection<MockDocument> documents) {
-            this.documents = new ArrayList<>(documents);
-        }
-
-        @Override
-        public boolean nextDocument() {
-            if (++di < documents.size()) {
-                ri = -1;
-                return true;
-            }
-
-            return false;
-        }
-
-        @Override
-        public boolean nextRecord() {
-            if (++ri < documents.get(di).records.size()) {
-                return true;
-            }
-
-            return false;
-        }
-
-        @Override
-        public long documentId() {
-            return documents.get(di).docId;
-        }
-
-        @Override
-        public long documentMeta() {
-            return documents.get(di).docMeta;
-        }
-
-        @Override
-        public long wordId() {
-            return documents.get(di).records.get(ri).wordId;
-        }
-
-        @Override
-        public long wordMeta() {
-            return documents.get(di).records.get(ri).wordMeta;
-        }
-
-        @Override
-        public int documentFeatures() {
-            return documents.get(di).docFeatures;
-        }
-    }
-
-    record MockDocument(long docId, long docMeta, int docFeatures, List<MockRecord> records) {}
-    record MockRecord(long wordId, long wordMeta) {}
-}
\ No newline at end of file
diff --git a/code/index/index-reverse/build.gradle b/code/index/index-reverse/build.gradle
index bd46b3a0..bd0831ba 100644
--- a/code/index/index-reverse/build.gradle
+++ b/code/index/index-reverse/build.gradle
@@ -16,19 +16,26 @@ apply from: "$rootProject.projectDir/srcsets.gradle"
 dependencies {
     implementation project(':code:libraries:array')
     implementation project(':code:libraries:btree')
+    implementation project(':code:libraries:coded-sequence')
     implementation project(':code:libraries:random-write-funnel')
     implementation project(':code:index:query')
     implementation project(':code:index:index-journal')
     implementation project(':code:common:model')
+    implementation project(':code:processes:converting-process:model')
     implementation project(':code:common:process')
+    implementation project(':third-party:parquet-floor')
+
implementation project(':third-party:commons-codec') + implementation libs.bundles.slf4j + implementation libs.slop implementation libs.fastutil testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit testImplementation libs.mockito + testImplementation project(':code:libraries:test-helpers') } diff --git a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexEntrySource.java b/code/index/index-reverse/java/nu/marginalia/index/FullIndexEntrySource.java similarity index 83% rename from code/index/index-reverse/java/nu/marginalia/index/ReverseIndexEntrySource.java rename to code/index/index-reverse/java/nu/marginalia/index/FullIndexEntrySource.java index f10ddb1c..3f572f15 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexEntrySource.java +++ b/code/index/index-reverse/java/nu/marginalia/index/FullIndexEntrySource.java @@ -6,7 +6,7 @@ import nu.marginalia.index.query.EntrySource; import static java.lang.Math.min; -public class ReverseIndexEntrySource implements EntrySource { +public class FullIndexEntrySource implements EntrySource { private final String name; private final BTreeReader reader; @@ -16,10 +16,10 @@ public class ReverseIndexEntrySource implements EntrySource { final int entrySize; private final long wordId; - public ReverseIndexEntrySource(String name, - BTreeReader reader, - int entrySize, - long wordId) { + public FullIndexEntrySource(String name, + BTreeReader reader, + int entrySize, + long wordId) { this.name = name; this.reader = reader; this.entrySize = entrySize; @@ -36,6 +36,7 @@ public class ReverseIndexEntrySource implements EntrySource { @Override public void read(LongQueryBuffer buffer) { + buffer.reset(); buffer.end = min(buffer.end, endOffset - pos); reader.readData(buffer.data, buffer.end, pos); pos += buffer.end; diff --git a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java b/code/index/index-reverse/java/nu/marginalia/index/FullReverseIndexReader.java similarity index 77% rename from code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java rename to code/index/index-reverse/java/nu/marginalia/index/FullReverseIndexReader.java index 72feb7fd..15b7b7ce 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexReader.java +++ b/code/index/index-reverse/java/nu/marginalia/index/FullReverseIndexReader.java @@ -3,6 +3,8 @@ package nu.marginalia.index; import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArrayFactory; import nu.marginalia.btree.BTreeReader; +import nu.marginalia.index.positions.TermData; +import nu.marginalia.index.positions.PositionsFileReader; import nu.marginalia.index.query.EmptyEntrySource; import nu.marginalia.index.query.EntrySource; import nu.marginalia.index.query.ReverseIndexRejectFilter; @@ -14,12 +16,12 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; +import java.lang.foreign.Arena; import java.nio.file.Files; import java.nio.file.Path; -import java.util.Arrays; import java.util.concurrent.Executors; -public class ReverseIndexReader { +public class FullReverseIndexReader { private final LongArray words; private final LongArray documents; private final long wordsDataOffset; @@ -27,9 +29,16 @@ public class ReverseIndexReader { private final BTreeReader wordsBTreeReader; private final String name; - public ReverseIndexReader(String name, Path words, Path documents) throws IOException { + private final PositionsFileReader positionsFileReader; + + public 
FullReverseIndexReader(String name, + Path words, + Path documents, + PositionsFileReader positionsFileReader) throws IOException { this.name = name; + this.positionsFileReader = positionsFileReader; + if (!Files.exists(words) || !Files.exists(documents)) { this.words = null; this.documents = null; @@ -92,7 +101,7 @@ public class ReverseIndexReader { if (offset < 0) // No documents return new EmptyEntrySource(); - return new ReverseIndexEntrySource(name, createReaderNew(offset), 2, termId); + return new FullIndexEntrySource(name, createReaderNew(offset), 2, termId); } /** Create a filter step requiring the specified termId to exist in the documents */ @@ -129,35 +138,35 @@ public class ReverseIndexReader { private BTreeReader createReaderNew(long offset) { return new BTreeReader( documents, - ReverseIndexParameters.docsBTreeContext, + ReverseIndexParameters.fullDocsBTreeContext, offset); } - public long[] getTermMeta(long termId, long[] docIds) { + public TermData[] getTermData(Arena arena, + long termId, + long[] docIds) + { + var ret = new TermData[docIds.length]; + long offset = wordOffset(termId); if (offset < 0) { // This is likely a bug in the code, but we can't throw an exception here logger.debug("Missing offset for word {}", termId); - return new long[docIds.length]; + return ret; } - assert isUniqueAndSorted(docIds) : "The input array docIds is assumed to be unique and sorted, was " + Arrays.toString(docIds); - var reader = createReaderNew(offset); - return reader.queryData(docIds, 1); - } - private boolean isUniqueAndSorted(long[] ids) { - if (ids.length == 0) - return true; + // Read the size and offset of the position data + var offsets = reader.queryData(docIds, 1); - for (int i = 1; i < ids.length; i++) { - if(ids[i] <= ids[i-1]) - return false; + for (int i = 0; i < docIds.length; i++) { + if (offsets[i] == 0) + continue; + ret[i] = positionsFileReader.getTermData(arena, offsets[i]); } - - return true; + return ret; } public void close() { @@ -166,5 +175,14 @@ public class ReverseIndexReader { if (words != null) words.close(); + + if (positionsFileReader != null) { + try { + positionsFileReader.close(); + } catch (IOException e) { + logger.error("Failed to close positions file reader", e); + } + } } + } diff --git a/code/index/index-reverse/java/nu/marginalia/index/PrioIndexEntrySource.java b/code/index/index-reverse/java/nu/marginalia/index/PrioIndexEntrySource.java new file mode 100644 index 00000000..e3b93d44 --- /dev/null +++ b/code/index/index-reverse/java/nu/marginalia/index/PrioIndexEntrySource.java @@ -0,0 +1,144 @@ +package nu.marginalia.index; + +import lombok.SneakyThrows; +import nu.marginalia.array.page.LongQueryBuffer; +import nu.marginalia.index.query.EntrySource; +import nu.marginalia.sequence.io.BitReader; +import nu.marginalia.model.id.UrlIdCodec; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.channels.FileChannel; + +public class PrioIndexEntrySource implements EntrySource { + private final String name; + + private final ByteBuffer readData = ByteBuffer.allocate(1024); + private final BitReader bitReader = new BitReader(readData, this::fillReadBuffer); + + private final FileChannel docsFileChannel; + private long dataOffsetStartB; + private final long wordId; + + private final int numItems; + private int readItems = 0; + + int prevRank = -1; + int prevDomainId = -1; + int prevDocOrd = -1; + + public PrioIndexEntrySource(String name, + FileChannel docsFileChannel, + long dataOffsetStartB, + long 
wordId) + { + this.name = name; + this.docsFileChannel = docsFileChannel; + this.dataOffsetStartB = dataOffsetStartB; + this.wordId = wordId; + + // sneaky read of the header to get item count upfront + + try { + readData.limit(4); + + int rb = docsFileChannel.read(readData, dataOffsetStartB); + assert rb == 4; + readData.flip(); + numItems = readData.getInt() & 0x3FFF_FFFF; + + readData.position(0); + readData.limit(0); + } + catch (IOException ex) { + throw new IllegalStateException("Failed to read index data.", ex); + } + } + + @Override + public void skip(int n) { + throw new UnsupportedOperationException("Not implemented"); + } + + @Override + @SneakyThrows + @SuppressWarnings("preview") + public void read(LongQueryBuffer buffer) { + var outputBuffer = buffer.asByteBuffer().order(ByteOrder.LITTLE_ENDIAN); + outputBuffer.clear(); + + while (outputBuffer.hasRemaining() && readItems++ < numItems) { + int rank; + int domainId; + int docOrd; + + int code = bitReader.get(2); + if (code == 0b11) { + // header + bitReader.get(30); // skip 30 bits for the size header + + rank = bitReader.get(7); + domainId = bitReader.get(31); + docOrd = bitReader.get(26); + } + else if (code == 0b10) { + rank = prevRank + bitReader.getGamma(); + domainId = bitReader.get(31); + docOrd = bitReader.get(26); + } + else if (code == 0b01) { + rank = prevRank; + domainId = bitReader.getDelta() + prevDomainId; + docOrd = bitReader.getDelta() - 1; + } + else if (code == 0b00) { + rank = prevRank; + domainId = prevDomainId; + docOrd = prevDocOrd + bitReader.getGamma(); + } + else { + throw new IllegalStateException("??? found code " + code); + } + + long encodedId = UrlIdCodec.encodeId(rank, domainId, docOrd); + + outputBuffer.putLong( + encodedId + ); + + prevRank = rank; + prevDomainId = domainId; + prevDocOrd = docOrd; + } + + buffer.end = outputBuffer.position() / 8; + + buffer.uniq(); + } + + private void fillReadBuffer() { + try { + readData.compact(); + int rb = docsFileChannel.read(readData, dataOffsetStartB); + if (rb > 0) { + dataOffsetStartB += rb; + } + readData.flip(); + } + catch (IOException ex) { + throw new IllegalStateException("Failed to read index data.", ex); + } + } + + @Override + public boolean hasMore() { + return readItems < numItems; + } + + + @Override + public String indexName() { + return name + ":" + Long.toHexString(wordId); + } +} diff --git a/code/index/index-reverse/java/nu/marginalia/index/PrioReverseIndexReader.java b/code/index/index-reverse/java/nu/marginalia/index/PrioReverseIndexReader.java new file mode 100644 index 00000000..bf1214e5 --- /dev/null +++ b/code/index/index-reverse/java/nu/marginalia/index/PrioReverseIndexReader.java @@ -0,0 +1,113 @@ +package nu.marginalia.index; + +import nu.marginalia.array.LongArray; +import nu.marginalia.array.LongArrayFactory; +import nu.marginalia.btree.BTreeReader; +import nu.marginalia.index.query.EmptyEntrySource; +import nu.marginalia.index.query.EntrySource; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.Path; + +public class PrioReverseIndexReader { + private final LongArray words; + private final long wordsDataOffset; + private final Logger logger = LoggerFactory.getLogger(getClass()); + private final BTreeReader wordsBTreeReader; + private final String name; + + private final FileChannel documentsChannel; + + public PrioReverseIndexReader(String name, + Path words, + Path 
documents) throws IOException { + this.name = name; + + if (!Files.exists(words) || !Files.exists(documents)) { + this.words = null; + this.wordsBTreeReader = null; + this.documentsChannel = null; + this.wordsDataOffset = -1; + return; + } + + logger.info("Switching reverse index"); + + this.words = LongArrayFactory.mmapForReadingShared(words); + + wordsBTreeReader = new BTreeReader(this.words, ReverseIndexParameters.wordsBTreeContext, 0); + wordsDataOffset = wordsBTreeReader.getHeader().dataOffsetLongs(); + + documentsChannel = (FileChannel) Files.newByteChannel(documents); + } + + /** Calculate the offset of the word in the documents. + * If the return-value is negative, the term does not exist + * in the index. + */ + long wordOffset(long termId) { + long idx = wordsBTreeReader.findEntry(termId); + + if (idx < 0) + return -1L; + + return words.get(wordsDataOffset + idx + 1); + } + + public EntrySource documents(long termId) { + if (null == words) { + logger.warn("Reverse index is not ready, dropping query"); + return new EmptyEntrySource(); + } + + long offset = wordOffset(termId); + + if (offset < 0) // No documents + return new EmptyEntrySource(); + + return new PrioIndexEntrySource(name, + documentsChannel, + offset, + termId); + } + + /** Return the number of documents with the termId in the index */ + public int numDocuments(long termId) { + + long offset = wordOffset(termId); + + if (offset < 0) // No documents + return 0; + + ByteBuffer buffer = ByteBuffer.allocate(4); + try { + documentsChannel.read(buffer, offset); + } + catch (IOException e) { + logger.error("Failed to read documents channel", e); + return 0; + } + + return buffer.getInt(0) & 0x3FFF_FFFF; + + } + + + public void close() { + try { + documentsChannel.close(); + } + catch (IOException e) { + logger.error("Failed to close documents channel", e); + } + + if (words != null) + words.close(); + } + +} diff --git a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexFullFileNames.java b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexFullFileNames.java index 3d0f2499..f7daff13 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexFullFileNames.java +++ b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexFullFileNames.java @@ -13,16 +13,21 @@ public class ReverseIndexFullFileNames { case NEXT -> basePath.resolve("rev-docs.dat.next"); case CURRENT -> basePath.resolve("rev-docs.dat"); }; + case POSITIONS -> switch (version) { + case NEXT -> basePath.resolve("rev-positions.dat.next"); + case CURRENT -> basePath.resolve("rev-positions.dat"); + }; }; } public enum FileVersion { CURRENT, - NEXT + NEXT, } public enum FileIdentifier { WORDS, - DOCS + DOCS, + POSITIONS, } } diff --git a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexParameters.java b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexParameters.java index a6df15d3..6de56e0c 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexParameters.java +++ b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexParameters.java @@ -5,6 +5,7 @@ import nu.marginalia.btree.model.BTreeContext; public class ReverseIndexParameters { - public static final BTreeContext docsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048); + public static final BTreeContext prioDocsBTreeContext = new BTreeContext(5, 1, BTreeBlockSize.BS_2048); + public static final BTreeContext fullDocsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048); public static final 
BTreeContext wordsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048); } diff --git a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexPrioFileNames.java b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexPrioFileNames.java index e99841d4..ff924cf1 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexPrioFileNames.java +++ b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexPrioFileNames.java @@ -23,6 +23,6 @@ public class ReverseIndexPrioFileNames { public enum FileIdentifier { WORDS, - DOCS + DOCS, } } diff --git a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexSelfTest.java b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexSelfTest.java index 61dee824..06251aca 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexSelfTest.java +++ b/code/index/index-reverse/java/nu/marginalia/index/ReverseIndexSelfTest.java @@ -22,7 +22,7 @@ public class ReverseIndexSelfTest { public static void runSelfTest2(LongArray wordsDataRange, LongArray documents) { logger.info("Starting test 2"); for (long i = 1; i < wordsDataRange.size(); i+=2) { - var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.docsBTreeContext, wordsDataRange.get(i)); + var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i)); var header = docsBTreeReader.getHeader(); var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L); @@ -49,7 +49,7 @@ public class ReverseIndexSelfTest { public static void runSelfTest4(LongArray wordsDataRange, LongArray documents) { logger.info("Starting test 4"); for (long i = 1; i < wordsDataRange.size(); i+=2) { - var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.docsBTreeContext, wordsDataRange.get(i)); + var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i)); var header = docsBTreeReader.getHeader(); var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L); for (int j = 0; j < docRange.size(); j+=2) { @@ -84,7 +84,7 @@ public class ReverseIndexSelfTest { public static void runSelfTest6(LongArray wordsDataRange, LongArray documents) { logger.info("Starting test 6"); for (long i = 1; i < wordsDataRange.size(); i+=2) { - var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.docsBTreeContext, wordsDataRange.get(i)); + var docsBTreeReader = new BTreeReader(documents, ReverseIndexParameters.fullDocsBTreeContext, wordsDataRange.get(i)); var header = docsBTreeReader.getHeader(); var docRange = documents.range(header.dataOffsetLongs(), header.dataOffsetLongs() + header.numEntries() * 2L); Long prev = null; diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/JournalReaderSource.java b/code/index/index-reverse/java/nu/marginalia/index/construction/JournalReaderSource.java deleted file mode 100644 index b565206d..00000000 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/JournalReaderSource.java +++ /dev/null @@ -1,10 +0,0 @@ -package nu.marginalia.index.construction; - -import nu.marginalia.index.journal.reader.IndexJournalReader; - -import java.io.IOException; -import java.nio.file.Path; - -public interface JournalReaderSource { - IndexJournalReader construct(Path sourceFile) throws IOException; -} diff --git 
a/code/index/index-reverse/java/nu/marginalia/index/construction/PositionsFileConstructor.java b/code/index/index-reverse/java/nu/marginalia/index/construction/PositionsFileConstructor.java new file mode 100644 index 00000000..808e03fd --- /dev/null +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/PositionsFileConstructor.java @@ -0,0 +1,76 @@ +package nu.marginalia.index.construction; + +import nu.marginalia.index.positions.PositionCodec; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; + +/** A class for constructing a positions file. This class is thread-safe. + * + *
+ * <p></p>
+ * + * The positions data is concatenated in the file, with each term's metadata + * followed by its positions. The metadata is a single byte, and the positions + * are encoded using the Elias Gamma code, with zero padded bits at the end to + * get octet alignment. + * + *
+ * <p></p>
+ * + * It is the responsibility of the caller to keep track of the byte offset of + * each posting in the file. + */ +public class PositionsFileConstructor implements AutoCloseable { + private final ByteBuffer workBuffer = ByteBuffer.allocate(65536); + + private final Path file; + private final FileChannel channel; + + private long offset; + + public PositionsFileConstructor(Path file) throws IOException { + this.file = file; + + channel = FileChannel.open(file, StandardOpenOption.CREATE, StandardOpenOption.WRITE); + } + + /** Add a term to the positions file + * @param termMeta the term metadata + * @param positionsBuffer the positions of the term + * @return the offset of the term in the file, with the size of the data in the highest byte + */ + public long add(byte termMeta, ByteBuffer positionsBuffer) throws IOException { + synchronized (file) { + int size = 1 + positionsBuffer.remaining(); + + if (workBuffer.remaining() < size) { + workBuffer.flip(); + channel.write(workBuffer); + workBuffer.clear(); + } + + workBuffer.put(termMeta); + workBuffer.put(positionsBuffer); + + long ret = PositionCodec.encode(size, offset); + + offset += size; + + return ret; + } + } + + public void close() throws IOException { + if (workBuffer.hasRemaining()) { + workBuffer.flip(); + + while (workBuffer.hasRemaining()) + channel.write(workBuffer); + } + + channel.force(false); + channel.close(); + } +} diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/ReverseIndexConstructor.java b/code/index/index-reverse/java/nu/marginalia/index/construction/ReverseIndexConstructor.java deleted file mode 100644 index 7a925679..00000000 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/ReverseIndexConstructor.java +++ /dev/null @@ -1,107 +0,0 @@ -package nu.marginalia.index.construction; - -import lombok.SneakyThrows; -import nu.marginalia.process.control.ProcessHeartbeat; -import nu.marginalia.index.journal.IndexJournalFileNames; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.concurrent.atomic.AtomicInteger; - -public class ReverseIndexConstructor { - - private static final Logger logger = LoggerFactory.getLogger(ReverseIndexConstructor.class); - - public enum CreateReverseIndexSteps { - CONSTRUCT, - FINALIZE, - FINISHED - } - - private final Path outputFileDocs; - private final Path outputFileWords; - private final JournalReaderSource readerSource; - private final DocIdRewriter docIdRewriter; - private final Path tmpDir; - - public ReverseIndexConstructor(Path outputFileDocs, - Path outputFileWords, - JournalReaderSource readerSource, - DocIdRewriter docIdRewriter, - Path tmpDir) { - this.outputFileDocs = outputFileDocs; - this.outputFileWords = outputFileWords; - this.readerSource = readerSource; - this.docIdRewriter = docIdRewriter; - this.tmpDir = tmpDir; - } - - public void createReverseIndex(ProcessHeartbeat processHeartbeat, - String processName, - Path sourceBaseDir) throws IOException - { - var inputs = IndexJournalFileNames.findJournalFiles(sourceBaseDir); - if (inputs.isEmpty()) { - logger.error("No journal files in base dir {}", sourceBaseDir); - return; - } - - try (var heartbeat = processHeartbeat.createProcessTaskHeartbeat(CreateReverseIndexSteps.class, processName)) { - - heartbeat.progress(CreateReverseIndexSteps.CONSTRUCT); - - try (var preindexHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("constructPreindexes")) { - - AtomicInteger progress = new 
AtomicInteger(0); - inputs - .parallelStream() - .map(in -> { - preindexHeartbeat.progress("PREINDEX/MERGE", progress.incrementAndGet(), inputs.size()); - return construct(in); - }) - .reduce(this::merge) - .ifPresent((index) -> { - heartbeat.progress(CreateReverseIndexSteps.FINALIZE); - finalizeIndex(index); - heartbeat.progress(CreateReverseIndexSteps.FINISHED); - }); - } - heartbeat.progress(CreateReverseIndexSteps.FINISHED); - } - } - - @SneakyThrows - private ReversePreindexReference construct(Path input) { - return ReversePreindex - .constructPreindex(readerSource.construct(input), docIdRewriter, tmpDir) - .closeToReference(); - } - - @SneakyThrows - private ReversePreindexReference merge(ReversePreindexReference leftR, ReversePreindexReference rightR) { - - var left = leftR.open(); - var right = rightR.open(); - - try { - return ReversePreindex.merge(tmpDir, left, right).closeToReference(); - } - finally { - left.delete(); - right.delete(); - } - - - } - - @SneakyThrows - private void finalizeIndex(ReversePreindexReference finalPR) { - var finalP = finalPR.open(); - finalP.finalizeIndex(outputFileDocs, outputFileWords); - finalP.delete(); - } - - -} diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexDocuments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexDocuments.java deleted file mode 100644 index 0f232577..00000000 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexDocuments.java +++ /dev/null @@ -1,131 +0,0 @@ -package nu.marginalia.index.construction; - -import lombok.SneakyThrows; -import nu.marginalia.array.LongArray; -import nu.marginalia.array.LongArrayFactory; -import nu.marginalia.index.journal.reader.IndexJournalReader; -import nu.marginalia.rwf.RandomFileAssembler; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.IOException; -import java.nio.channels.FileChannel; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.StandardOpenOption; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.TimeUnit; - -/** A LongArray with document data, segmented according to - * the associated ReversePreindexWordSegments data - */ -public class ReversePreindexDocuments { - final Path file; - public final LongArray documents; - private static final int RECORD_SIZE_LONGS = 2; - private static final Logger logger = LoggerFactory.getLogger(ReversePreindexDocuments.class); - - public ReversePreindexDocuments(LongArray documents, Path file) { - this.documents = documents; - this.file = file; - } - - public static ReversePreindexDocuments construct( - Path docsFile, - Path workDir, - IndexJournalReader reader, - DocIdRewriter docIdRewriter, - ReversePreindexWordSegments segments) throws IOException { - - createUnsortedDocsFile(docsFile, workDir, reader, segments, docIdRewriter); - - LongArray docsFileMap = LongArrayFactory.mmapForModifyingShared(docsFile); - sortDocsFile(docsFileMap, segments); - - return new ReversePreindexDocuments(docsFileMap, docsFile); - } - - public FileChannel createDocumentsFileChannel() throws IOException { - return (FileChannel) Files.newByteChannel(file, StandardOpenOption.READ); - } - - - public LongArray slice(long start, long end) { - return documents.range(start, end); - } - - public long size() { - return documents.size(); - } - - private static void createUnsortedDocsFile(Path docsFile, - Path workDir, - IndexJournalReader reader, - 
ReversePreindexWordSegments segments, - DocIdRewriter docIdRewriter) throws IOException { - - long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize(); - - try (RandomFileAssembler assembly = RandomFileAssembler.create(workDir, fileSizeLongs)) { - - var offsetMap = segments.asMap(RECORD_SIZE_LONGS); - offsetMap.defaultReturnValue(0); - - var pointer = reader.newPointer(); - while (pointer.nextDocument()) { - long rankEncodedId = docIdRewriter.rewriteDocId(pointer.documentId()); - while (pointer.nextRecord()) { - long wordId = pointer.wordId(); - long wordMeta = pointer.wordMeta(); - - long offset = offsetMap.addTo(wordId, RECORD_SIZE_LONGS); - - assembly.put(offset + 0, rankEncodedId); - assembly.put(offset + 1, wordMeta); - } - } - - assembly.write(docsFile); - } - } - - @SneakyThrows - private static void sortDocsFile(LongArray docsFileMap, ReversePreindexWordSegments segments) throws IOException { - - var iter = segments.iterator(RECORD_SIZE_LONGS); - - ExecutorService sortingWorkers = Executors.newWorkStealingPool(Runtime.getRuntime().availableProcessors()); - - while (iter.next()) { - long iterStart = iter.startOffset; - long iterEnd = iter.endOffset; - - if (iter.size() < 1024) { - docsFileMap.quickSortN(RECORD_SIZE_LONGS, iterStart, iterEnd); - } - else { - sortingWorkers.execute(() -> - docsFileMap.quickSortN(RECORD_SIZE_LONGS, iterStart, iterEnd)); - } - } - - sortingWorkers.shutdown(); - while (!sortingWorkers.awaitTermination(1, TimeUnit.HOURS)); - - sortingWorkers.close(); - } - - public void delete() throws IOException { - Files.delete(this.file); - documents.close(); - } - - public void close() { - documents.close(); - } - - public void force() { - documents.force(); - } -} diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/ReverseIndexBTreeTransformer.java b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullIndexBTreeTransformer.java similarity index 57% rename from code/index/index-reverse/java/nu/marginalia/index/construction/ReverseIndexBTreeTransformer.java rename to code/index/index-reverse/java/nu/marginalia/index/construction/full/FullIndexBTreeTransformer.java index dd5499bf..0af6165e 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/ReverseIndexBTreeTransformer.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullIndexBTreeTransformer.java @@ -1,4 +1,4 @@ -package nu.marginalia.index.construction; +package nu.marginalia.index.construction.full; import nu.marginalia.array.LongArray; import nu.marginalia.array.algo.LongArrayTransformations; @@ -6,25 +6,23 @@ import nu.marginalia.btree.BTreeWriter; import nu.marginalia.btree.model.BTreeContext; import java.io.IOException; -import java.nio.channels.FileChannel; /** Constructs the BTrees in a reverse index */ -public class ReverseIndexBTreeTransformer implements LongArrayTransformations.LongIOTransformer { +public class FullIndexBTreeTransformer implements LongArrayTransformations.LongIOTransformer { private final BTreeWriter writer; - private final FileChannel intermediateChannel; - private final int entrySize; + private final LongArray documentsArray; long start = 0; long writeOffset = 0; - public ReverseIndexBTreeTransformer(LongArray urlsFileMap, - int entrySize, - BTreeContext bTreeContext, - FileChannel intermediateChannel) { + public FullIndexBTreeTransformer(LongArray urlsFileMap, + int entrySize, + BTreeContext bTreeContext, + LongArray documentsArray) { + this.documentsArray = documentsArray; this.writer 
= new BTreeWriter(urlsFileMap, bTreeContext); this.entrySize = entrySize; - this.intermediateChannel = intermediateChannel; } @Override @@ -39,7 +37,7 @@ public class ReverseIndexBTreeTransformer implements LongArrayTransformations.Lo final long offsetForBlock = writeOffset; writeOffset += writer.write(writeOffset, size, - mapRegion -> mapRegion.transferFrom(intermediateChannel, start, 0, end - start) + mapRegion -> mapRegion.transferFrom(documentsArray, start, 0, end - start) ); start = end; diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullIndexConstructor.java b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullIndexConstructor.java new file mode 100644 index 00000000..a3b25669 --- /dev/null +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullIndexConstructor.java @@ -0,0 +1,113 @@ +package nu.marginalia.index.construction.full; + +import lombok.SneakyThrows; +import nu.marginalia.index.construction.DocIdRewriter; +import nu.marginalia.index.construction.PositionsFileConstructor; +import nu.marginalia.index.journal.IndexJournal; +import nu.marginalia.index.journal.IndexJournalPage; +import nu.marginalia.process.control.ProcessHeartbeat; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.concurrent.atomic.AtomicInteger; + +public class FullIndexConstructor { + + private static final Logger logger = LoggerFactory.getLogger(FullIndexConstructor.class); + + public enum CreateReverseIndexSteps { + CONSTRUCT, + FINALIZE, + FINISHED + } + + private final Path outputFileDocs; + private final Path outputFileWords; + private final Path outputFilePositions; + private final DocIdRewriter docIdRewriter; + private final Path tmpDir; + + public FullIndexConstructor(Path outputFileDocs, + Path outputFileWords, + Path outputFilePositions, + DocIdRewriter docIdRewriter, + Path tmpDir) { + this.outputFileDocs = outputFileDocs; + this.outputFileWords = outputFileWords; + this.outputFilePositions = outputFilePositions; + this.docIdRewriter = docIdRewriter; + this.tmpDir = tmpDir; + } + + public void createReverseIndex(ProcessHeartbeat processHeartbeat, + String processName, + Path sourceBaseDir) throws IOException + { + var journal = IndexJournal.findJournal(sourceBaseDir); + if (journal.isEmpty()) { + logger.error("No journal files in base dir {}", sourceBaseDir); + return; + } + + try (var heartbeat = processHeartbeat.createProcessTaskHeartbeat(CreateReverseIndexSteps.class, processName); + var preindexHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("constructPreindexes"); + var posConstructor = new PositionsFileConstructor(outputFilePositions) + ) { + heartbeat.progress(CreateReverseIndexSteps.CONSTRUCT); + + AtomicInteger progress = new AtomicInteger(0); + + var journalVersions = journal.get().pages(); + + journalVersions + .parallelStream() + .map(in -> { + preindexHeartbeat.progress("PREINDEX/MERGE", progress.incrementAndGet(), journalVersions.size()); + return construct(in, posConstructor); + }) + .reduce(this::merge) + .ifPresent((index) -> { + heartbeat.progress(CreateReverseIndexSteps.FINALIZE); + finalizeIndex(index); + heartbeat.progress(CreateReverseIndexSteps.FINISHED); + }); + + heartbeat.progress(CreateReverseIndexSteps.FINISHED); + } + } + + @SneakyThrows + private FullPreindexReference construct(IndexJournalPage journalInstance, PositionsFileConstructor positionsFileConstructor) { + return FullPreindex + 
.constructPreindex(journalInstance, positionsFileConstructor, docIdRewriter, tmpDir) + .closeToReference(); + } + + @SneakyThrows + private FullPreindexReference merge(FullPreindexReference leftR, FullPreindexReference rightR) { + + var left = leftR.open(); + var right = rightR.open(); + + try { + return FullPreindex.merge(tmpDir, left, right).closeToReference(); + } + finally { + left.delete(); + right.delete(); + } + + + } + + @SneakyThrows + private void finalizeIndex(FullPreindexReference finalPR) { + var finalP = finalPR.open(); + finalP.finalizeIndex(outputFileDocs, outputFileWords); + finalP.delete(); + } + + +} diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindex.java b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindex.java similarity index 64% rename from code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindex.java rename to code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindex.java index ac39e817..57100fa9 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindex.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindex.java @@ -1,15 +1,18 @@ -package nu.marginalia.index.construction; +package nu.marginalia.index.construction.full; import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArrayFactory; import nu.marginalia.btree.BTreeWriter; import nu.marginalia.index.ReverseIndexParameters; -import nu.marginalia.index.journal.reader.IndexJournalReader; +import nu.marginalia.index.construction.CountToOffsetTransformer; +import nu.marginalia.index.construction.DocIdRewriter; +import nu.marginalia.index.construction.IndexSizeEstimator; +import nu.marginalia.index.construction.PositionsFileConstructor; +import nu.marginalia.index.journal.IndexJournalPage; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; -import java.nio.channels.FileChannel; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardOpenOption; @@ -25,13 +28,13 @@ import static nu.marginalia.array.algo.TwoArrayOperations.*; * the union of their data. This operation requires no additional * RAM. */ -public class ReversePreindex { - final ReversePreindexWordSegments segments; - final ReversePreindexDocuments documents; +public class FullPreindex { + final FullPreindexWordSegments segments; + final FullPreindexDocuments documents; - private static final Logger logger = LoggerFactory.getLogger(ReversePreindex.class); + private static final Logger logger = LoggerFactory.getLogger(FullPreindex.class); - public ReversePreindex(ReversePreindexWordSegments segments, ReversePreindexDocuments documents) { + public FullPreindex(FullPreindexWordSegments segments, FullPreindexDocuments documents) { this.segments = segments; this.documents = documents; } @@ -39,26 +42,27 @@ public class ReversePreindex { /** Constructs a new preindex with the data associated with reader. The backing files * will have randomly assigned names. 
 */
-    public static ReversePreindex constructPreindex(IndexJournalReader reader,
-                                                    DocIdRewriter docIdRewriter,
-                                                    Path workDir) throws IOException
+    public static FullPreindex constructPreindex(IndexJournalPage journalInstance,
+                                                 PositionsFileConstructor positionsFileConstructor,
+                                                 DocIdRewriter docIdRewriter,
+                                                 Path workDir) throws IOException
     {
         Path segmentWordsFile = Files.createTempFile(workDir, "segment_words", ".dat");
         Path segmentCountsFile = Files.createTempFile(workDir, "segment_counts", ".dat");
         Path docsFile = Files.createTempFile(workDir, "docs", ".dat");

-        var segments = ReversePreindexWordSegments.construct(reader, segmentWordsFile, segmentCountsFile);
-        var docs = ReversePreindexDocuments.construct(docsFile, workDir, reader, docIdRewriter, segments);
-        return new ReversePreindex(segments, docs);
+        var segments = FullPreindexWordSegments.construct(journalInstance, segmentWordsFile, segmentCountsFile);
+        var docs = FullPreindexDocuments.construct(docsFile, workDir, journalInstance, docIdRewriter, positionsFileConstructor, segments);
+        return new FullPreindex(segments, docs);
     }

     /** Close the associated memory mapped areas and return
-     * a dehydrated version of this object that can be re-opened
+     * a dehydrated reference to this object that can be re-opened
      * later.
      */
-    public ReversePreindexReference closeToReference() {
+    public FullPreindexReference closeToReference() {
         try {
-            return new ReversePreindexReference(segments, documents);
+            return new FullPreindexReference(segments, documents);
         }
         finally {
             segments.force();
@@ -77,18 +81,15 @@ public class ReversePreindex {

         // Estimate the size of the docs index data
         offsets.transformEach(0, offsets.size(), new CountToOffsetTransformer(2));
-        IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(ReverseIndexParameters.docsBTreeContext, 2);
+        IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(ReverseIndexParameters.fullDocsBTreeContext, 2);
         offsets.fold(0, 0, offsets.size(), sizeEstimator);

         // Write the docs file
         LongArray finalDocs = LongArrayFactory.mmapForWritingConfined(outputFileDocs, sizeEstimator.size);
-        try (var intermediateDocChannel = documents.createDocumentsFileChannel()) {
-            offsets.transformEachIO(0, offsets.size(),
-                    new ReverseIndexBTreeTransformer(finalDocs, 2,
-                            ReverseIndexParameters.docsBTreeContext,
-                            intermediateDocChannel));
-            intermediateDocChannel.force(false);
-        }
+        offsets.transformEachIO(0, offsets.size(),
+                new FullIndexBTreeTransformer(finalDocs, 2,
+                        ReverseIndexParameters.fullDocsBTreeContext,
+                        documents.documents));

         LongArray wordIds = segments.wordIds;

@@ -125,11 +126,11 @@ public class ReversePreindex {
         documents.delete();
     }

-    public static ReversePreindex merge(Path destDir,
-                                        ReversePreindex left,
-                                        ReversePreindex right) throws IOException {
+    public static FullPreindex merge(Path destDir,
+                                     FullPreindex left,
+                                     FullPreindex right) throws IOException {

-        ReversePreindexWordSegments mergingSegment =
+        FullPreindexWordSegments mergingSegment =
                 createMergedSegmentWordFile(destDir, left.segments, right.segments);

         var mergingIter = mergingSegment.constructionIterator(2);
@@ -143,42 +144,36 @@ public class ReversePreindex {
         leftIter.next();
         rightIter.next();

-        try (FileChannel leftChannel = left.documents.createDocumentsFileChannel();
-             FileChannel rightChannel = right.documents.createDocumentsFileChannel())
+        while (mergingIter.canPutMore()
+                && leftIter.isPositionBeforeEnd()
+                && rightIter.isPositionBeforeEnd())
         {
+            final long currentWord = mergingIter.wordId;

-            while 
(mergingIter.canPutMore() - && leftIter.isPositionBeforeEnd() - && rightIter.isPositionBeforeEnd()) + if (leftIter.wordId == currentWord && rightIter.wordId == currentWord) { - final long currentWord = mergingIter.wordId; - - if (leftIter.wordId == currentWord && rightIter.wordId == currentWord) - { - // both inputs have documents for the current word - mergeSegments(leftIter, rightIter, - left.documents, right.documents, - mergedDocuments, mergingIter); - } - else if (leftIter.wordId == currentWord) { - if (!copySegment(leftIter, mergedDocuments, leftChannel, mergingIter)) - break; - } - else if (rightIter.wordId == currentWord) { - if (!copySegment(rightIter, mergedDocuments, rightChannel, mergingIter)) - break; - } - else assert false : "This should never happen"; // the helvetica scenario + // both inputs have documents for the current word + mergeSegments(leftIter, rightIter, + left.documents, right.documents, + mergedDocuments, mergingIter); } - - if (leftIter.isPositionBeforeEnd()) { - while (copySegment(leftIter, mergedDocuments, leftChannel, mergingIter)); + else if (leftIter.wordId == currentWord) { + if (!copySegment(leftIter, left.documents, mergingIter, mergedDocuments)) + break; } - - if (rightIter.isPositionBeforeEnd()) { - while (copySegment(rightIter, mergedDocuments, rightChannel, mergingIter)); + else if (rightIter.wordId == currentWord) { + if (!copySegment(rightIter, right.documents, mergingIter, mergedDocuments)) + break; } + else assert false : "This should never happen"; // the helvetica scenario + } + if (leftIter.isPositionBeforeEnd()) { + while (copySegment(leftIter, left.documents, mergingIter, mergedDocuments)); + } + + if (rightIter.isPositionBeforeEnd()) { + while (copySegment(rightIter, right.documents, mergingIter, mergedDocuments)); } if (leftIter.isPositionBeforeEnd()) @@ -197,18 +192,18 @@ public class ReversePreindex { mergedDocuments = shrinkMergedDocuments(mergedDocuments, docsFile, 2 * mergingSegment.totalSize()); - return new ReversePreindex( + return new FullPreindex( mergingSegment, - new ReversePreindexDocuments(mergedDocuments, docsFile) + new FullPreindexDocuments(mergedDocuments, docsFile) ); } /** Create a segment word file with each word from both inputs, with zero counts for all the data. * This is an intermediate product in merging. 
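+     * <p>
+     * For illustration (hypothetical ids): merging word ids {1, 2, 3} and
+     * {2, 3, 4} yields the distinct union {1, 2, 3, 4}, with all counts
+     * zero until the document merge fills them in.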
*/ - static ReversePreindexWordSegments createMergedSegmentWordFile(Path destDir, - ReversePreindexWordSegments left, - ReversePreindexWordSegments right) throws IOException { + static FullPreindexWordSegments createMergedSegmentWordFile(Path destDir, + FullPreindexWordSegments left, + FullPreindexWordSegments right) throws IOException { Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat"); Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat"); @@ -227,7 +222,7 @@ public class ReversePreindex { LongArray counts = LongArrayFactory.mmapForWritingConfined(segmentCountsFile, segmentsSize); - return new ReversePreindexWordSegments(wordIdsFile, counts, segmentWordsFile, segmentCountsFile); + return new FullPreindexWordSegments(wordIdsFile, counts, segmentWordsFile, segmentCountsFile); } /** It's possible we overestimated the necessary size of the documents file, @@ -238,7 +233,7 @@ public class ReversePreindex { mergedDocuments.force(); long beforeSize = mergedDocuments.size(); - long afterSize = sizeLongs * 8; + long afterSize = sizeLongs; if (beforeSize != afterSize) { mergedDocuments.close(); try (var bc = Files.newByteChannel(docsFile, StandardOpenOption.WRITE)) { @@ -255,12 +250,12 @@ public class ReversePreindex { /** Merge contents of the segments indicated by leftIter and rightIter into the destionation * segment, and advance the construction iterator with the appropriate size. */ - private static void mergeSegments(ReversePreindexWordSegments.SegmentIterator leftIter, - ReversePreindexWordSegments.SegmentIterator rightIter, - ReversePreindexDocuments left, - ReversePreindexDocuments right, + private static void mergeSegments(FullPreindexWordSegments.SegmentIterator leftIter, + FullPreindexWordSegments.SegmentIterator rightIter, + FullPreindexDocuments left, + FullPreindexDocuments right, LongArray dest, - ReversePreindexWordSegments.SegmentConstructionIterator destIter) + FullPreindexWordSegments.SegmentConstructionIterator destIter) { long segSize = mergeArrays2(dest, left.documents, @@ -278,16 +273,16 @@ public class ReversePreindex { /** Copy the data from the source segment at the position and length indicated by sourceIter, * into the destination segment, and advance the construction iterator. 
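+     * <p>
+     * In the full index each record is two longs, a rank-encoded document id
+     * followed by an encoded offset into the positions file, so a segment of
+     * n entries spans 2n longs in the documents array.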
*/ - private static boolean copySegment(ReversePreindexWordSegments.SegmentIterator sourceIter, - LongArray dest, - FileChannel sourceChannel, - ReversePreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException { + private static boolean copySegment(FullPreindexWordSegments.SegmentIterator sourceIter, + FullPreindexDocuments srcDocuments, + FullPreindexWordSegments.SegmentConstructionIterator mergingIter, + LongArray dest) throws IOException { long size = sourceIter.endOffset - sourceIter.startOffset; long start = mergingIter.startOffset; long end = start + size; - dest.transferFrom(sourceChannel, + dest.transferFrom(srcDocuments.documents, sourceIter.startOffset, mergingIter.startOffset, end); diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexDocuments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexDocuments.java new file mode 100644 index 00000000..02055c7f --- /dev/null +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexDocuments.java @@ -0,0 +1,141 @@ +package nu.marginalia.index.construction.full; + +import lombok.SneakyThrows; +import nu.marginalia.array.LongArray; +import nu.marginalia.array.LongArrayFactory; +import nu.marginalia.index.construction.DocIdRewriter; +import nu.marginalia.index.construction.PositionsFileConstructor; +import nu.marginalia.index.journal.IndexJournalPage; +import nu.marginalia.rwf.RandomFileAssembler; +import nu.marginalia.slop.SlopTable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.List; + +/** A LongArray with document data, segmented according to + * the associated FullPreindexWordSegments data + */ +public class FullPreindexDocuments { + public final LongArray documents; + + private static PositionsFileConstructor positionsFileConstructor; + private static final int RECORD_SIZE_LONGS = 2; + private static final Logger logger = LoggerFactory.getLogger(FullPreindexDocuments.class); + + public final Path file; + + public FullPreindexDocuments(LongArray documents, Path file) { + this.documents = documents; + this.file = file; + } + + public static FullPreindexDocuments construct( + Path docsFile, + Path workDir, + IndexJournalPage journalInstance, + DocIdRewriter docIdRewriter, + PositionsFileConstructor positionsFileConstructor, + FullPreindexWordSegments segments) throws IOException { + FullPreindexDocuments.positionsFileConstructor = positionsFileConstructor; + + createUnsortedDocsFile(docsFile, workDir, journalInstance, segments, docIdRewriter); + + LongArray docsFileMap = LongArrayFactory.mmapForModifyingShared(docsFile); + sortDocsFile(docsFileMap, segments); + + return new FullPreindexDocuments(docsFileMap, docsFile); + } + + public FileChannel createDocumentsFileChannel() throws IOException { + return (FileChannel) Files.newByteChannel(file, StandardOpenOption.READ); + } + + + public LongArray slice(long start, long end) { + return documents.range(start, end); + } + + public long size() { + return documents.size(); + } + + private static void createUnsortedDocsFile(Path docsFile, + Path workDir, + IndexJournalPage instance, + FullPreindexWordSegments segments, + DocIdRewriter docIdRewriter) throws IOException { + + long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize(); + + 
final ByteBuffer tempBuffer = ByteBuffer.allocate(1024*1024*100);
+
+        try (var assembly = RandomFileAssembler.create(workDir, fileSizeLongs);
+             var slopTable = new SlopTable(instance.baseDir(), instance.page()))
+        {
+            var docIds = instance.openCombinedId(slopTable);
+            var termIds = instance.openTermIds(slopTable);
+            var termMeta = instance.openTermMetadata(slopTable);
+            var positions = instance.openTermPositions(slopTable);
+
+            var offsetMap = segments.asMap(RECORD_SIZE_LONGS);
+            offsetMap.defaultReturnValue(0);
+
+            while (docIds.hasRemaining()) {
+                long docId = docIds.get();
+                long rankEncodedId = docIdRewriter.rewriteDocId(docId);
+
+                long[] tIds = termIds.get();
+                byte[] tMeta = termMeta.get();
+                tempBuffer.clear();
+                List<ByteBuffer> tPos = positions.getData(tempBuffer);
+
+                for (int i = 0; i < tIds.length; i++) {
+                    long termId = tIds[i];
+                    byte meta = tMeta[i];
+                    ByteBuffer pos = tPos.get(i);
+
+                    long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS);
+                    long encodedPosOffset = positionsFileConstructor.add(meta, pos);
+
+                    assembly.put(offset + 0, rankEncodedId);
+                    assembly.put(offset + 1, encodedPosOffset);
+                }
+            }
+
+            assembly.write(docsFile);
+        }
+    }
+
+    @SneakyThrows
+    private static void sortDocsFile(LongArray docsFileMap, FullPreindexWordSegments segments) {
+
+        var iter = segments.iterator(RECORD_SIZE_LONGS);
+
+        while (iter.next()) {
+            long iterStart = iter.startOffset;
+            long iterEnd = iter.endOffset;
+
+            docsFileMap.quickSortN(RECORD_SIZE_LONGS, iterStart, iterEnd);
+        }
+    }
+
+    public void delete() throws IOException {
+        Files.delete(this.file);
+        documents.close();
+    }
+
+    public void close() {
+        documents.close();
+    }
+
+    public void force() {
+        documents.force();
+    }
+}
diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexReference.java b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexReference.java
similarity index 62%
rename from code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexReference.java
rename to code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexReference.java
index 16c542d5..73bd03b2 100644
--- a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexReference.java
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexReference.java
@@ -1,33 +1,33 @@
-package nu.marginalia.index.construction;
+package nu.marginalia.index.construction.full;

 import nu.marginalia.array.LongArrayFactory;

 import java.io.IOException;
 import java.nio.file.Path;

-/** This is a dehydrated version of a ReversePreIndex, that only
+/** This is a dehydrated reference to a FullPreindex, that only
 * keeps references to its location on disk but does not hold associated
 * memory maps. 
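+ * <p>
+ * Minimal usage sketch (illustrative; {@code preindex} stands for an open
+ * FullPreindex):
+ * <pre>
+ * FullPreindexReference ref = preindex.closeToReference();
+ * // the memory maps are now closed; reopen later when needed
+ * FullPreindex reopened = ref.open();
+ * </pre>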
*/ -public record ReversePreindexReference( +public record FullPreindexReference( Path wordsFile, Path countsFile, Path documentsFile ) { - public ReversePreindexReference(ReversePreindexWordSegments segments, ReversePreindexDocuments documents) { + public FullPreindexReference(FullPreindexWordSegments segments, FullPreindexDocuments documents) { this(segments.wordsFile, segments.countsFile, documents.file); } - public ReversePreindex open() throws IOException { - return new ReversePreindex( - new ReversePreindexWordSegments( + public FullPreindex open() throws IOException { + return new FullPreindex( + new FullPreindexWordSegments( LongArrayFactory.mmapForModifyingShared(wordsFile), LongArrayFactory.mmapForModifyingShared(countsFile), wordsFile, countsFile ), - new ReversePreindexDocuments( + new FullPreindexDocuments( LongArrayFactory.mmapForModifyingShared(documentsFile), documentsFile ) diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexWordSegments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexWordSegments.java similarity index 80% rename from code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexWordSegments.java rename to code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexWordSegments.java index 0e6c32fb..0a4e39a7 100644 --- a/code/index/index-reverse/java/nu/marginalia/index/construction/ReversePreindexWordSegments.java +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/full/FullPreindexWordSegments.java @@ -1,30 +1,31 @@ -package nu.marginalia.index.construction; +package nu.marginalia.index.construction.full; import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap; import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap; import it.unimi.dsi.fastutil.longs.LongIterator; import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArrayFactory; -import nu.marginalia.index.journal.reader.IndexJournalReader; +import nu.marginalia.index.journal.IndexJournalPage; +import nu.marginalia.slop.SlopTable; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; /** A pair of file-backed arrays of sorted wordIds - * and the count of documents associated with each wordId. + * and the count of documents associated with each termId. */ -public class ReversePreindexWordSegments { +public class FullPreindexWordSegments { public final LongArray wordIds; public final LongArray counts; final Path wordsFile; final Path countsFile; - public ReversePreindexWordSegments(LongArray wordIds, - LongArray counts, - Path wordsFile, - Path countsFile) + public FullPreindexWordSegments(LongArray wordIds, + LongArray counts, + Path wordsFile, + Path countsFile) { assert wordIds.size() == counts.size(); @@ -34,7 +35,7 @@ public class ReversePreindexWordSegments { this.countsFile = countsFile; } - /** Returns a long-long hash map where each key is a wordId, + /** Returns a long-long hash map where each key is a termId, * and each value is the start offset of the data. 
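+     * <p>
+     * For example (hypothetical numbers): with recordSize = 2 and counts {5, 3},
+     * the returned start offsets are {0, 10}, measured in longs.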
*/ public Long2LongOpenHashMap asMap(int recordSize) { @@ -51,14 +52,24 @@ public class ReversePreindexWordSegments { return ret; } - public static ReversePreindexWordSegments construct(IndexJournalReader reader, - Path wordIdsFile, - Path countsFile) + public static FullPreindexWordSegments construct(IndexJournalPage instance, + Path wordIdsFile, + Path countsFile) throws IOException { Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f); countsMap.defaultReturnValue(0); - reader.forEachWordId(wordId -> countsMap.addTo(wordId, 1)); + + try (var slopTable = new SlopTable(instance.baseDir(), instance.page())) { + var termIds = instance.openTermIds(slopTable); + while (termIds.hasRemaining()) { + long[] tids = termIds.get(); + for (long termId : tids) { + countsMap.addTo(termId, 1); + } + } + } + LongArray words = LongArrayFactory.mmapForWritingConfined(wordIdsFile, countsMap.size()); LongArray counts = LongArrayFactory.mmapForWritingConfined(countsFile, countsMap.size()); @@ -79,7 +90,7 @@ public class ReversePreindexWordSegments { counts.set(i, countsMap.get(words.get(i))); } - return new ReversePreindexWordSegments(words, counts, wordIdsFile, countsFile); + return new FullPreindexWordSegments(words, counts, wordIdsFile, countsFile); } public SegmentIterator iterator(int recordSize) { @@ -188,7 +199,7 @@ public class ReversePreindexWordSegments { if (i == fileSize) { // We've reached the end of the iteration and there is no - // "next" wordId to fetch + // "next" termId to fetch wordId = Long.MIN_VALUE; return false; } diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioDocIdsTransformer.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioDocIdsTransformer.java new file mode 100644 index 00000000..3072ffb8 --- /dev/null +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioDocIdsTransformer.java @@ -0,0 +1,142 @@ +package nu.marginalia.index.construction.prio; + +import nu.marginalia.array.algo.LongArrayTransformations; +import nu.marginalia.model.id.UrlIdCodec; +import nu.marginalia.sequence.io.BitWriter; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.channels.FileChannel; + +/** Constructs document ids list priority reverse index */ +public class PrioDocIdsTransformer implements LongArrayTransformations.LongIOTransformer, AutoCloseable { + + private static final Logger logger = LoggerFactory.getLogger(PrioDocIdsTransformer.class); + + private final FileChannel writeChannel; + private final FileChannel readChannel; + + private final ByteBuffer readBuffer = ByteBuffer.allocate(65536).order(ByteOrder.LITTLE_ENDIAN); + private final ByteBuffer writeBuffer = ByteBuffer.allocate(65536); + + long startL = 0; + long writeOffsetB = 0; + + public PrioDocIdsTransformer(FileChannel writeChannel, + FileChannel readChannel) { + this.writeChannel = writeChannel; + this.readChannel = readChannel; + } + + @Override + public long transform(long pos, long endL) throws IOException { + + final int sizeL = (int) ((endL - startL)); + final long startOffsetB = writeOffsetB; + + if (sizeL == 0) + throw new IllegalStateException("Empty range"); + + readChannel.position(startL * 8); + readBuffer.clear(); + + int toBeRead = 8 * (sizeL); + + var bitWriter = new BitWriter(writeBuffer); + + int prevRank = -1; + int prevDomainId = -1; + int prevDocOrd = -1; + boolean wroteHeader = false; + + do { + 
readBuffer.limit(Math.min(readBuffer.capacity(), toBeRead)); + readChannel.read(readBuffer); + readBuffer.flip(); + + if (!wroteHeader) { + // write 11b header + bitWriter.putBits(3, 2); + // encode number of items + bitWriter.putBits(sizeL, 30); + + + long firstItem = readBuffer.getLong(); + + prevRank = UrlIdCodec.getRank(firstItem); + prevDomainId = UrlIdCodec.getDomainId(firstItem); + prevDocOrd = UrlIdCodec.getDocumentOrdinal(firstItem); + + bitWriter.putBits(prevRank, 7); + bitWriter.putBits(prevDomainId, 31); + bitWriter.putBits(prevDocOrd, 26); + + wroteHeader = true; + } + + while (readBuffer.hasRemaining()) { + if (writeBuffer.remaining() < 16) { + writeBuffer.flip(); + int written = writeChannel.write(writeBuffer, writeOffsetB); + writeOffsetB += written; + writeBuffer.clear(); + } + + long nextId = readBuffer.getLong(); + + // break down id components + int rank = UrlIdCodec.getRank(nextId); + int domainId = UrlIdCodec.getDomainId(nextId); + int docOrd = UrlIdCodec.getDocumentOrdinal(nextId); + + // encode components + if (rank != prevRank) { + bitWriter.putBits(0b10, 2); + bitWriter.putGamma(rank - prevRank); + bitWriter.putBits(domainId, 31); + bitWriter.putBits(docOrd, 26); + } + else if (domainId != prevDomainId) { + bitWriter.putBits(0b01, 2); + bitWriter.putDelta(domainId - prevDomainId); + bitWriter.putDelta(1 + docOrd); + } + else if (docOrd != prevDocOrd) { + bitWriter.putBits(0b00, 2); + bitWriter.putGamma(docOrd - prevDocOrd); + } + else { + logger.warn("Unexpected duplicate document id: {}", nextId); + } + + prevDocOrd = docOrd; + prevDomainId = domainId; + prevRank = rank; + + } + + toBeRead -= readBuffer.limit(); + readBuffer.clear(); + } while (toBeRead > 0); + + // write lingering data + + // ensure any half-written data is flushed to the buffer + bitWriter.finishLastByte(); + + // update the start input pointer + startL = endL; + return startOffsetB; + } + + @Override + public void close() throws IOException { + writeBuffer.flip(); + int written = writeChannel.write(writeBuffer, writeOffsetB); + writeOffsetB += written; + writeBuffer.clear(); + } +} diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioIndexConstructor.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioIndexConstructor.java new file mode 100644 index 00000000..cddad7a4 --- /dev/null +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioIndexConstructor.java @@ -0,0 +1,108 @@ +package nu.marginalia.index.construction.prio; + +import lombok.SneakyThrows; +import nu.marginalia.index.construction.DocIdRewriter; +import nu.marginalia.index.journal.IndexJournal; +import nu.marginalia.index.journal.IndexJournalPage; +import nu.marginalia.process.control.ProcessHeartbeat; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.concurrent.atomic.AtomicInteger; + +public class PrioIndexConstructor { + + private static final Logger logger = LoggerFactory.getLogger(PrioIndexConstructor.class); + + public enum CreateReverseIndexSteps { + CONSTRUCT, + FINALIZE, + FINISHED + } + + private final Path outputFileDocs; + private final Path outputFileWords; + private final DocIdRewriter docIdRewriter; + private final Path tmpDir; + + public PrioIndexConstructor(Path outputFileDocs, + Path outputFileWords, + DocIdRewriter docIdRewriter, + Path tmpDir) { + this.outputFileDocs = outputFileDocs; + this.outputFileWords = outputFileWords; + this.docIdRewriter 
= docIdRewriter; + this.tmpDir = tmpDir; + } + + public void createReverseIndex(ProcessHeartbeat processHeartbeat, + String processName, + Path sourceBaseDir) throws IOException + { + var journal = IndexJournal.findJournal(sourceBaseDir); + if (journal.isEmpty()) { + logger.error("No journal files in base dir {}", sourceBaseDir); + return; + } + + try (var heartbeat = processHeartbeat.createProcessTaskHeartbeat(CreateReverseIndexSteps.class, processName); + var preindexHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("constructPreindexes") + ) { + heartbeat.progress(CreateReverseIndexSteps.CONSTRUCT); + + AtomicInteger progress = new AtomicInteger(0); + + var journalVersions = journal.get().pages(); + + journalVersions + .parallelStream() + .map(in -> { + preindexHeartbeat.progress("PREINDEX/MERGE", progress.incrementAndGet(), journalVersions.size()); + return construct(in); + }) + .reduce(this::merge) + .ifPresent((index) -> { + heartbeat.progress(CreateReverseIndexSteps.FINALIZE); + finalizeIndex(index); + heartbeat.progress(CreateReverseIndexSteps.FINISHED); + }); + + heartbeat.progress(CreateReverseIndexSteps.FINISHED); + } + } + + @SneakyThrows + private PrioPreindexReference construct(IndexJournalPage journalInstance) { + return PrioPreindex + .constructPreindex(journalInstance, docIdRewriter, tmpDir) + .closeToReference(); + } + + @SneakyThrows + private PrioPreindexReference merge(PrioPreindexReference leftR, PrioPreindexReference rightR) { + + var left = leftR.open(); + var right = rightR.open(); + + try { + return PrioPreindex.merge(tmpDir, left, right).closeToReference(); + } + finally { + left.delete(); + right.delete(); + } + + + } + + @SneakyThrows + private void finalizeIndex(PrioPreindexReference finalPR) { + var finalP = finalPR.open(); + finalP.finalizeIndex(outputFileDocs, outputFileWords); + finalP.delete(); + } + + +} diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindex.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindex.java new file mode 100644 index 00000000..3b971288 --- /dev/null +++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindex.java @@ -0,0 +1,298 @@ +package nu.marginalia.index.construction.prio; + +import nu.marginalia.array.LongArray; +import nu.marginalia.array.LongArrayFactory; +import nu.marginalia.btree.BTreeWriter; +import nu.marginalia.index.ReverseIndexParameters; +import nu.marginalia.index.construction.CountToOffsetTransformer; +import nu.marginalia.index.construction.DocIdRewriter; +import nu.marginalia.index.journal.IndexJournalPage; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.nio.channels.FileChannel; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; + +import static nu.marginalia.array.algo.TwoArrayOperations.countDistinctElements; +import static nu.marginalia.array.algo.TwoArrayOperations.mergeArrays; + +/** Contains the data that would go into a reverse index, + * that is, a mapping from words to documents, minus the actual + * index structure that makes the data quick to access while + * searching. + *
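+ * In the priority index each record is a single long, the rank-encoded
+ * document id; term positions and metadata are not retained here.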

+ * Two preindexes can be merged into a third preindex containing
+ * the union of their data. This operation requires no additional
+ * RAM.
+ */
+public class PrioPreindex {
+    final PrioPreindexWordSegments segments;
+    final PrioPreindexDocuments documents;
+
+    private static final Logger logger = LoggerFactory.getLogger(PrioPreindex.class);
+
+    public PrioPreindex(PrioPreindexWordSegments segments, PrioPreindexDocuments documents) {
+        this.segments = segments;
+        this.documents = documents;
+    }
+
+    /** Constructs a new preindex with the data associated with the given journal page. The backing files
+     * will have randomly assigned names.
+     */
+    public static PrioPreindex constructPreindex(IndexJournalPage indexJournalPage,
+                                                 DocIdRewriter docIdRewriter,
+                                                 Path workDir) throws IOException
+    {
+        Path segmentWordsFile = Files.createTempFile(workDir, "segment_words", ".dat");
+        Path segmentCountsFile = Files.createTempFile(workDir, "segment_counts", ".dat");
+        Path docsFile = Files.createTempFile(workDir, "docs", ".dat");
+
+        var segments = PrioPreindexWordSegments.construct(indexJournalPage, segmentWordsFile, segmentCountsFile);
+        var docs = PrioPreindexDocuments.construct(docsFile, workDir, indexJournalPage, docIdRewriter, segments);
+        return new PrioPreindex(segments, docs);
+    }
+
+    /** Close the associated memory mapped areas and return
+     * a dehydrated reference to this object that can be re-opened
+     * later.
+     */
+    public PrioPreindexReference closeToReference() {
+        try {
+            return new PrioPreindexReference(segments, documents);
+        }
+        finally {
+            segments.force();
+            documents.force();
+            segments.close();
+            documents.close();
+        }
+    }
+
+    /** Transform the preindex into a reverse index */
+    public void finalizeIndex(Path outputFileDocs, Path outputFileWords) throws IOException {
+        var offsets = segments.counts;
+
+        Files.deleteIfExists(outputFileDocs);
+        Files.deleteIfExists(outputFileWords);
+
+        // Convert the document counts to offsets into the docs file
+        offsets.transformEach(0, offsets.size(), new CountToOffsetTransformer(1));
+
+        // Write the docs file
+        try (var intermediateDocChannel = documents.createDocumentsFileChannel();
+             var destFileChannel = (FileChannel) Files.newByteChannel(outputFileDocs, StandardOpenOption.CREATE_NEW, StandardOpenOption.WRITE);
+             var transformer = new PrioDocIdsTransformer(destFileChannel, intermediateDocChannel)
+        ) {
+            offsets.transformEachIO(0, offsets.size(), transformer);
+        }
+
+        LongArray wordIds = segments.wordIds;
+
+        if (offsets.size() != wordIds.size())
+            throw new IllegalStateException("Offsets and word-ids of different size");
+        if (offsets.size() > Integer.MAX_VALUE) {
+            throw new IllegalStateException("offsets.size() too big!");
+        }
+
+        // Estimate the size of the words index data
+        long wordsSize = ReverseIndexParameters.wordsBTreeContext.calculateSize((int) offsets.size());
+
+        // Construct the tree
+        LongArray wordsArray = LongArrayFactory.mmapForWritingConfined(outputFileWords, wordsSize);
+
+        new BTreeWriter(wordsArray, ReverseIndexParameters.wordsBTreeContext)
+                .write(0, (int) offsets.size(), mapRegion -> {
+                    for (long i = 0; i < offsets.size(); i++) {
+                        mapRegion.set(2*i, wordIds.get(i));
+                        mapRegion.set(2*i + 1, offsets.get(i));
+                    }
+                });
+
+        wordsArray.force();
+        wordsArray.close();
+    }
+
+    /** Delete all files associated with this pre-index */
+    public void delete() throws IOException {
+        segments.delete();
+        documents.delete();
+    }
+
+    public static PrioPreindex merge(Path destDir,
+                                     PrioPreindex left,
+                                     PrioPreindex right) throws IOException {
+
+        
PrioPreindexWordSegments mergingSegment = + createMergedSegmentWordFile(destDir, left.segments, right.segments); + + var mergingIter = mergingSegment.constructionIterator(1); + var leftIter = left.segments.iterator(1); + var rightIter = right.segments.iterator(1); + + Path docsFile = Files.createTempFile(destDir, "docs", ".dat"); + + LongArray mergedDocuments = LongArrayFactory.mmapForWritingConfined(docsFile, left.documents.size() + right.documents.size()); + + leftIter.next(); + rightIter.next(); + + while (mergingIter.canPutMore() + && leftIter.isPositionBeforeEnd() + && rightIter.isPositionBeforeEnd()) + { + final long currentWord = mergingIter.wordId; + + if (leftIter.wordId == currentWord && rightIter.wordId == currentWord) + { + // both inputs have documents for the current word + mergeSegments(leftIter, rightIter, + left.documents, right.documents, + mergedDocuments, mergingIter); + } + else if (leftIter.wordId == currentWord) { + if (!copySegment(leftIter, left.documents, mergingIter, mergedDocuments)) + break; + } + else if (rightIter.wordId == currentWord) { + if (!copySegment(rightIter, right.documents, mergingIter, mergedDocuments)) + break; + } + else assert false : "This should never happen"; // the helvetica scenario + } + + if (leftIter.isPositionBeforeEnd()) { + while (copySegment(leftIter, left.documents, mergingIter, mergedDocuments)); + } + + if (rightIter.isPositionBeforeEnd()) { + while (copySegment(rightIter, right.documents, mergingIter, mergedDocuments)); + } + + + if (leftIter.isPositionBeforeEnd()) + throw new IllegalStateException("Left has more to go"); + if (rightIter.isPositionBeforeEnd()) + throw new IllegalStateException("Right has more to go"); + if (mergingIter.canPutMore()) + throw new IllegalStateException("Source iters ran dry before merging iter"); + + + mergingSegment.force(); + + // We may have overestimated the size of the merged docs size in the case there were + // duplicates in the data, so we need to shrink it to the actual size we wrote. + + mergedDocuments = shrinkMergedDocuments(mergedDocuments, + docsFile, mergingSegment.totalSize()); + + return new PrioPreindex( + mergingSegment, + new PrioPreindexDocuments(mergedDocuments, docsFile) + ); + } + + /** Create a segment word file with each word from both inputs, with zero counts for all the data. + * This is an intermediate product in merging. + */ + static PrioPreindexWordSegments createMergedSegmentWordFile(Path destDir, + PrioPreindexWordSegments left, + PrioPreindexWordSegments right) throws IOException { + Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat"); + Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat"); + + // We need total size to request a direct LongArray range. Seems slower, but is faster. + // ... 
see LongArray.directRangeIfPossible(long start, long end)
+        long segmentsSize = countDistinctElements(left.wordIds, right.wordIds,
+                0, left.wordIds.size(),
+                0, right.wordIds.size());
+
+        LongArray wordIdsFile = LongArrayFactory.mmapForWritingConfined(segmentWordsFile, segmentsSize);
+
+        mergeArrays(wordIdsFile, left.wordIds, right.wordIds,
+                0,
+                0, left.wordIds.size(),
+                0, right.wordIds.size());
+
+        LongArray counts = LongArrayFactory.mmapForWritingConfined(segmentCountsFile, segmentsSize);
+
+        return new PrioPreindexWordSegments(wordIdsFile, counts, segmentWordsFile, segmentCountsFile);
+    }
+
+    /** It's possible we overestimated the necessary size of the documents file,
+     * this will permit us to shrink it down to the smallest necessary size.
+     */
+    private static LongArray shrinkMergedDocuments(LongArray mergedDocuments, Path docsFile, long sizeLongs) throws IOException {
+
+        mergedDocuments.force();
+
+        long beforeSize = mergedDocuments.size();
+        long afterSize = sizeLongs;
+        if (beforeSize != afterSize) {
+            mergedDocuments.close();
+            try (var bc = Files.newByteChannel(docsFile, StandardOpenOption.WRITE)) {
+                bc.truncate(sizeLongs * 8);
+            }
+
+            logger.info("Shrunk {} from {}b to {}b", docsFile, beforeSize, afterSize);
+            mergedDocuments = LongArrayFactory.mmapForWritingConfined(docsFile, sizeLongs);
+        }
+
+        return mergedDocuments;
+    }
+
+    /** Merge contents of the segments indicated by leftIter and rightIter into the destination
+     * segment, and advance the construction iterator with the appropriate size.
+     */
+    private static void mergeSegments(PrioPreindexWordSegments.SegmentIterator leftIter,
+                                      PrioPreindexWordSegments.SegmentIterator rightIter,
+                                      PrioPreindexDocuments left,
+                                      PrioPreindexDocuments right,
+                                      LongArray dest,
+                                      PrioPreindexWordSegments.SegmentConstructionIterator destIter)
+    {
+        long segSize = mergeArrays(dest,
+                left.documents,
+                right.documents,
+                destIter.startOffset,
+                leftIter.startOffset, leftIter.endOffset,
+                rightIter.startOffset, rightIter.endOffset);
+
+        destIter.putNext(segSize);
+        leftIter.next();
+        rightIter.next();
+    }
+
+    /** Copy the data from the source segment at the position and length indicated by sourceIter,
+     * into the destination segment, and advance the construction iterator. 
+     */
+    private static boolean copySegment(PrioPreindexWordSegments.SegmentIterator sourceIter,
+                                       PrioPreindexDocuments srcDocuments,
+                                       PrioPreindexWordSegments.SegmentConstructionIterator mergingIter,
+                                       LongArray dest) throws IOException {
+
+        long size = sourceIter.endOffset - sourceIter.startOffset;
+        long start = mergingIter.startOffset;
+        long end = start + size;
+
+        dest.transferFrom(srcDocuments.documents,
+                sourceIter.startOffset,
+                mergingIter.startOffset,
+                end);
+
+        boolean putNext = mergingIter.putNext(size);
+        boolean iterNext = sourceIter.next();
+
+        if (!putNext && iterNext)
+            throw new IllegalStateException("Source iterator ran out before dest iterator?!");
+
+        return iterNext;
+    }
+
+
+}
diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexDocuments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexDocuments.java
new file mode 100644
index 00000000..d9290e14
--- /dev/null
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexDocuments.java
@@ -0,0 +1,125 @@
+package nu.marginalia.index.construction.prio;
+
+import lombok.SneakyThrows;
+import nu.marginalia.array.LongArray;
+import nu.marginalia.array.LongArrayFactory;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.journal.IndexJournalPage;
+import nu.marginalia.rwf.RandomFileAssembler;
+import nu.marginalia.slop.SlopTable;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.channels.FileChannel;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+
+/** A LongArray with document data, segmented according to
+ * the associated PrioPreindexWordSegments data
+ */
+public class PrioPreindexDocuments {
+    public final LongArray documents;
+
+    private static final int RECORD_SIZE_LONGS = 1;
+    private static final Logger logger = LoggerFactory.getLogger(PrioPreindexDocuments.class);
+
+    public final Path file;
+
+    public PrioPreindexDocuments(LongArray documents, Path file) {
+        this.documents = documents;
+        this.file = file;
+    }
+
+    public static PrioPreindexDocuments construct(
+            Path docsFile,
+            Path workDir,
+            IndexJournalPage journalInstance,
+            DocIdRewriter docIdRewriter,
+            PrioPreindexWordSegments segments) throws IOException {
+
+        createUnsortedDocsFile(docsFile, workDir, journalInstance, segments, docIdRewriter);
+
+        LongArray docsFileMap = LongArrayFactory.mmapForModifyingShared(docsFile);
+        sortDocsFile(docsFileMap, segments);
+
+        return new PrioPreindexDocuments(docsFileMap, docsFile);
+    }
+
+    public FileChannel createDocumentsFileChannel() throws IOException {
+        return (FileChannel) Files.newByteChannel(file, StandardOpenOption.READ);
+    }
+
+
+    public long size() {
+        return documents.size();
+    }
+
+    private static void createUnsortedDocsFile(Path docsFile,
+                                               Path workDir,
+                                               IndexJournalPage instance,
+                                               PrioPreindexWordSegments segments,
+                                               DocIdRewriter docIdRewriter) throws IOException {
+
+        long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize();
+
+        try (var assembly = RandomFileAssembler.create(workDir, fileSizeLongs);
+             var slopTable = new SlopTable(instance.baseDir(), instance.page()))
+        {
+            var docIds = instance.openCombinedId(slopTable);
+            var termIds = instance.openTermIds(slopTable);
+            var termMeta = instance.openTermMetadata(slopTable);
+
+            var offsetMap = segments.asMap(RECORD_SIZE_LONGS);
+            offsetMap.defaultReturnValue(0);
+
+
+            while (docIds.hasRemaining()) 
{
+                long docId = docIds.get();
+                long rankEncodedId = docIdRewriter.rewriteDocId(docId);
+
+                long[] tIds = termIds.get();
+                byte[] tMeta = termMeta.get();
+
+                for (int i = 0; i < tIds.length; i++) {
+                    long termId = tIds[i];
+                    byte meta = tMeta[i];
+
+                    if (meta != 0) {
+                        long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS);
+                        assembly.put(offset, rankEncodedId);
+                    }
+                }
+            }
+
+            assembly.write(docsFile);
+        }
+    }
+
+    @SneakyThrows
+    private static void sortDocsFile(LongArray docsFileMap, PrioPreindexWordSegments segments) {
+
+        var iter = segments.iterator(RECORD_SIZE_LONGS);
+
+        while (iter.next()) {
+            long iterStart = iter.startOffset;
+            long iterEnd = iter.endOffset;
+
+            docsFileMap.sort(iterStart, iterEnd);
+        }
+    }
+
+    public void delete() throws IOException {
+        Files.delete(this.file);
+        documents.close();
+    }
+
+    public void close() {
+        documents.close();
+    }
+
+    public void force() {
+        documents.force();
+    }
+}
diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexReference.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexReference.java
new file mode 100644
index 00000000..f2ccd8df
--- /dev/null
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexReference.java
@@ -0,0 +1,36 @@
+package nu.marginalia.index.construction.prio;
+
+import nu.marginalia.array.LongArrayFactory;
+
+import java.io.IOException;
+import java.nio.file.Path;
+
+/** This is a dehydrated reference to a PrioPreindex, that only
+ * keeps references to its location on disk but does not hold associated
+ * memory maps.
+ */
+public record PrioPreindexReference(
+        Path wordsFile,
+        Path countsFile,
+        Path documentsFile
+)
+{
+    public PrioPreindexReference(PrioPreindexWordSegments segments, PrioPreindexDocuments documents) {
+        this(segments.wordsFile, segments.countsFile, documents.file);
+    }
+
+    public PrioPreindex open() throws IOException {
+        return new PrioPreindex(
+                new PrioPreindexWordSegments(
+                        LongArrayFactory.mmapForModifyingShared(wordsFile),
+                        LongArrayFactory.mmapForModifyingShared(countsFile),
+                        wordsFile,
+                        countsFile
+                ),
+                new PrioPreindexDocuments(
+                        LongArrayFactory.mmapForModifyingShared(documentsFile),
+                        documentsFile
+                )
+        );
+    }
+}
diff --git a/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexWordSegments.java b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexWordSegments.java
new file mode 100644
index 00000000..69c5ea61
--- /dev/null
+++ b/code/index/index-reverse/java/nu/marginalia/index/construction/prio/PrioPreindexWordSegments.java
@@ -0,0 +1,221 @@
+package nu.marginalia.index.construction.prio;
+
+import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
+import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
+import it.unimi.dsi.fastutil.longs.LongIterator;
+import nu.marginalia.array.LongArray;
+import nu.marginalia.array.LongArrayFactory;
+import nu.marginalia.index.journal.IndexJournalPage;
+import nu.marginalia.slop.SlopTable;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+/** A pair of file-backed arrays of sorted wordIds
+ * and the count of documents associated with each termId. 
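+ * <p>
+ * For instance (hypothetical numbers): word ids {14, 97} with counts {3, 1}
+ * describe a documents file holding three entries for term 14 followed by a
+ * single entry for term 97.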
+ */ +public class PrioPreindexWordSegments { + public final LongArray wordIds; + public final LongArray counts; + + final Path wordsFile; + final Path countsFile; + + public PrioPreindexWordSegments(LongArray wordIds, + LongArray counts, + Path wordsFile, + Path countsFile) + { + assert wordIds.size() == counts.size(); + + this.wordIds = wordIds; + this.counts = counts; + this.wordsFile = wordsFile; + this.countsFile = countsFile; + } + + /** Returns a long-long hash map where each key is a termId, + * and each value is the start offset of the data. + */ + public Long2LongOpenHashMap asMap(int recordSize) { + if (wordIds.size() > Integer.MAX_VALUE) + throw new IllegalArgumentException("Cannot create a map with more than Integer.MAX_VALUE entries"); + + Long2LongOpenHashMap ret = new Long2LongOpenHashMap((int) wordIds.size(), 0.75f); + var iter = iterator(recordSize); + + while (iter.next()) { + ret.put(iter.wordId, iter.startOffset); + } + + return ret; + } + + public static PrioPreindexWordSegments construct(IndexJournalPage instance, + Path wordIdsFile, + Path countsFile) + throws IOException + { + Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f); + countsMap.defaultReturnValue(0); + + try (var slopTable = new SlopTable(instance.baseDir(), instance.page())) { + var termIds = instance.openTermIds(slopTable); + var termMetas = instance.openTermMetadata(slopTable); + + while (termIds.hasRemaining()) { + long[] data = termIds.get(); + byte[] meta = termMetas.get(); + + for (int i = 0; i < data.length; i++) { + if (meta[i] != 0) { + countsMap.addTo(data[i], 1); + } + } + } + } + + LongArray words = LongArrayFactory.mmapForWritingConfined(wordIdsFile, countsMap.size()); + LongArray counts = LongArrayFactory.mmapForWritingConfined(countsFile, countsMap.size()); + + // Create the words file by iterating over the map and inserting them into + // the words file in whatever bizarro hash table order they appear in + long i = 0; + LongIterator iter = countsMap.keySet().iterator(); + while (iter.hasNext()) { + words.set(i++, iter.nextLong()); + } + + // Sort the words file + words.sort(0, counts.size()); + + // Populate the counts + for (i = 0; i < countsMap.size(); i++) { + counts.set(i, countsMap.get(words.get(i))); + } + + return new PrioPreindexWordSegments(words, counts, wordIdsFile, countsFile); + } + + public SegmentIterator iterator(int recordSize) { + return new SegmentIterator(recordSize); + } + public SegmentConstructionIterator constructionIterator(int recordSize) { + return new SegmentConstructionIterator(recordSize); + } + + public long totalSize() { + return counts.fold(0, 0, counts.size(), Long::sum); + } + + public void delete() throws IOException { + Files.delete(countsFile); + Files.delete(wordsFile); + + counts.close(); + wordIds.close(); + } + + public void force() { + counts.force(); + wordIds.force(); + } + + public void close() { + wordIds.close(); + counts.close(); + } + + public class SegmentIterator { + private final int recordSize; + private final long fileSize; + long wordId; + long startOffset = 0; + long endOffset = 0; + + private SegmentIterator(int recordSize) { + this.recordSize = recordSize; + this.fileSize = wordIds.size(); + } + + private long i = -1; + public long idx() { + return i; + } + public boolean next() { + if (++i >= fileSize) { + wordId = Long.MIN_VALUE; + return false; + } + + wordId = wordIds.get(i); + startOffset = endOffset; + endOffset = startOffset + recordSize * counts.get(i); + + return true; + } + + public boolean 
hasMorePositions() {
+            return i + 1 < wordIds.size();
+        }
+
+        public boolean isPositionBeforeEnd() {
+            return i < wordIds.size();
+        }
+
+        public long size() {
+            return endOffset - startOffset;
+        }
+    }
+
+    class SegmentConstructionIterator {
+        private final int recordSize;
+        private final long fileSize;
+        long wordId;
+        long startOffset = 0;
+        long endOffset = 0;
+
+        private SegmentConstructionIterator(int recordSize) {
+            this.recordSize = recordSize;
+            this.fileSize = wordIds.size();
+            if (fileSize == 0) {
+                throw new IllegalArgumentException("Cannot construct zero-length word segment file");
+            }
+            this.wordId = wordIds.get(0);
+        }
+
+        private long i = 0;
+        public long idx() {
+            return i;
+        }
+
+        public boolean putNext(long size) {
+
+            if (i >= fileSize)
+                return false;
+
+            endOffset = startOffset + recordSize * size;
+            counts.set(i, size);
+            startOffset = endOffset;
+            endOffset = -1;
+
+            i++;
+
+            if (i == fileSize) {
+                // We've reached the end of the iteration and there is no
+                // "next" termId to fetch
+                wordId = Long.MIN_VALUE;
+                return false;
+            }
+            else {
+                wordId = wordIds.get(i);
+                return true;
+            }
+        }
+
+        public boolean canPutMore() {
+            return i < wordIds.size();
+        }
+    }
+}
diff --git a/code/index/index-reverse/java/nu/marginalia/index/positions/PositionCodec.java b/code/index/index-reverse/java/nu/marginalia/index/positions/PositionCodec.java
new file mode 100644
index 00000000..9df63eec
--- /dev/null
+++ b/code/index/index-reverse/java/nu/marginalia/index/positions/PositionCodec.java
@@ -0,0 +1,25 @@
+package nu.marginalia.index.positions;
+
+/** A utility class for encoding and decoding position data offsets.
+ * The data is encoded by using the highest 16 bits to store the size
+ * of the data, and the remaining 48 bits to store the offset.
+ *
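+ * For example (illustrative): encode(5, 1024) packs the length 5 into the
+ * top 16 bits and the offset 1024 into the low 48 bits, yielding
+ * 0x0005_0000_0000_0400L; decodeSize and decodeOffset recover 5 and 1024.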

+ * This lets us address 256 TB of data, with up to 64 KB of position data for each term, + * which is ample headroom for both the size of the data and the number of positions. + * */ +public class PositionCodec { + + public static long encode(int length, long offset) { + assert decodeSize(offset) == 0 : "Offset must be less than 2^48"; + + return (long) length << 48 | offset; + } + + public static int decodeSize(long sizeEncodedOffset) { + return (int) ((sizeEncodedOffset & 0xFFFF_0000_0000_0000L) >>> 48); + } + public static long decodeOffset(long sizeEncodedOffset) { + return sizeEncodedOffset & 0x0000_FFFF_FFFF_FFFFL; + } + +} diff --git a/code/index/index-reverse/java/nu/marginalia/index/positions/PositionsFileReader.java b/code/index/index-reverse/java/nu/marginalia/index/positions/PositionsFileReader.java new file mode 100644 index 00000000..43418155 --- /dev/null +++ b/code/index/index-reverse/java/nu/marginalia/index/positions/PositionsFileReader.java @@ -0,0 +1,43 @@ +package nu.marginalia.index.positions; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.lang.foreign.Arena; +import java.nio.channels.FileChannel; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; + +public class PositionsFileReader implements AutoCloseable { + private final FileChannel positions; + private static final Logger logger = LoggerFactory.getLogger(PositionsFileReader.class); + + public PositionsFileReader(Path positionsFile) throws IOException { + this.positions = FileChannel.open(positionsFile, StandardOpenOption.READ); + } + + /** Get the positions for a term in the index, as pointed out by the encoded offset; + * intermediate buffers are allocated from the provided arena allocator. */ + public TermData getTermData(Arena arena, long sizeEncodedOffset) { + int length = PositionCodec.decodeSize(sizeEncodedOffset); + long offset = PositionCodec.decodeOffset(sizeEncodedOffset); + + var segment = arena.allocate(length); + var buffer = segment.asByteBuffer(); + + try { + positions.read(buffer, offset); + } catch (IOException e) { + throw new RuntimeException(e); + } + + return new TermData(buffer); + } + + @Override + public void close() throws IOException { + positions.close(); + } + +} diff --git a/code/index/index-reverse/java/nu/marginalia/index/positions/TermData.java b/code/index/index-reverse/java/nu/marginalia/index/positions/TermData.java new file mode 100644 index 00000000..737f10f1 --- /dev/null +++ b/code/index/index-reverse/java/nu/marginalia/index/positions/TermData.java @@ -0,0 +1,22 @@ +package nu.marginalia.index.positions; + +import nu.marginalia.sequence.CodedSequence; +import nu.marginalia.sequence.VarintCodedSequence; + +import java.nio.ByteBuffer; + +public class TermData { + private final ByteBuffer buffer; + + public TermData(ByteBuffer buffer) { + this.buffer = buffer; + } + + public byte flags() { + return buffer.get(0); + } + + public CodedSequence positions() { + return new VarintCodedSequence(buffer, 1, buffer.capacity()); + } +} diff --git a/code/index/index-reverse/readme.md b/code/index/index-reverse/readme.md index fcc4fcfc..0874bf8d 100644 --- a/code/index/index-reverse/readme.md +++ b/code/index/index-reverse/readme.md @@ -7,7 +7,10 @@ There are two tiers of this index. * A priority index which only indexes terms that are flagged with priority flags1. * A full index that indexes all terms. 
-The full index also provides access to term-level metadata, while the priority index is a binary index that only offers information about which documents has a specific word.
+The full index also provides access to term-level metadata, while the priority index is
+a binary index that only offers information about which documents have a specific word.
+
+The priority index is also compressed, while the full index at this point is not.

 [1] See WordFlags in [common/model](../../common/model/) and KeywordMetadata in [features-convert/keyword-extraction](../../features-convert/keyword-extraction).

@@ -34,9 +37,16 @@ to form a finalized reverse index.

 ![Illustration of the data layout of the finalized index](index.svg)

 ## Central Classes
-* [ReversePreindex](java/nu/marginalia/index/construction/ReversePreindex.java) intermediate reverse index state.
-* [ReverseIndexConstructor](java/nu/marginalia/index/construction/ReverseIndexConstructor.java) constructs the index.
-* [ReverseIndexReader](java/nu/marginalia/index/ReverseIndexReader.java) interrogates the index.
+Full index:
+* [FullPreindex](java/nu/marginalia/index/construction/full/FullPreindex.java) intermediate reverse index state.
+* [FullIndexConstructor](java/nu/marginalia/index/construction/full/FullIndexConstructor.java) constructs the index.
+* [FullReverseIndexReader](java/nu/marginalia/index/FullReverseIndexReader.java) interrogates the index.
+
+Prio index:
+* [PrioPreindex](java/nu/marginalia/index/construction/prio/PrioPreindex.java) intermediate reverse index state.
+* [PrioIndexConstructor](java/nu/marginalia/index/construction/prio/PrioIndexConstructor.java) constructs the index.
+* [PrioReverseIndexReader](java/nu/marginalia/index/PrioReverseIndexReader.java) interrogates the index.
+

 ## See Also
diff --git a/code/index/index-reverse/test/nu/marginalia/index/FullReverseIndexReaderTest.java b/code/index/index-reverse/test/nu/marginalia/index/FullReverseIndexReaderTest.java
new file mode 100644
index 00000000..d77d2133
--- /dev/null
+++ b/code/index/index-reverse/test/nu/marginalia/index/FullReverseIndexReaderTest.java
@@ -0,0 +1,119 @@
+package nu.marginalia.index;
+
+import it.unimi.dsi.fastutil.ints.IntList;
+import nu.marginalia.array.page.LongQueryBuffer;
+import nu.marginalia.hash.MurmurHash3_128;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.PositionsFileConstructor;
+import nu.marginalia.index.construction.full.FullPreindex;
+import nu.marginalia.index.construction.full.TestJournalFactory;
+import nu.marginalia.index.construction.full.TestJournalFactory.EntryDataWithWordMeta;
+import nu.marginalia.index.positions.PositionsFileReader;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.lang.foreign.Arena;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+
+import static nu.marginalia.index.construction.full.TestJournalFactory.wm;
+import static org.junit.jupiter.api.Assertions.*;
+
+class FullReverseIndexReaderTest {
+    TestJournalFactory journalFactory;
+    Path tempDir;
+
+    @BeforeEach
+    public void setUp() throws IOException {
+        journalFactory = new TestJournalFactory();
+
+        tempDir = Files.createTempDirectory("sort");
+    }
+
+    @AfterEach
+    public void tearDown() throws IOException {
+        journalFactory.clear();
+
+        List<Path> contents = new ArrayList<>();
+        Files.list(tempDir).forEach(contents::add);
+        for (var 
tempFile : contents) { + Files.delete(tempFile); + } + Files.delete(tempDir); + } + + MurmurHash3_128 hash = new MurmurHash3_128(); + long termId(String keyword) { + return hash.hashKeyword(keyword); + } + + @Test + public void testSimple() throws IOException { + + var indexReader = createIndex( + new EntryDataWithWordMeta(100, 101, wm(50, 51, 1, 3, 5)) + ); + + assertEquals(1, indexReader.numDocuments(termId("50"))); + + var positions = indexReader.getTermData(Arena.global(), termId("50"), new long[] { 100 }); + + assertEquals(1, positions.length); + assertNotNull(positions[0]); + assertEquals((byte) 51, positions[0].flags()); + assertEquals(IntList.of(1, 3, 5), positions[0].positions().values()); + + assertArrayEquals(new long[] { 100 }, readEntries(indexReader, termId("50"))); + } + + + @Test + public void test2x2() throws IOException { + + var indexReader = createIndex( + new EntryDataWithWordMeta(100, 101, wm(50, 51), wm(51, 52)), + new EntryDataWithWordMeta(101, 101, wm(51, 53), wm(52, 54)) + ); + + assertEquals(1, indexReader.numDocuments(termId("50"))); + assertEquals(2, indexReader.numDocuments(termId("51"))); + assertEquals(1, indexReader.numDocuments(termId("52"))); + + assertArrayEquals(new long[] { 100 }, readEntries(indexReader, termId("50"))); + assertArrayEquals(new long[] { 100, 101 }, readEntries(indexReader, termId("51"))); + assertArrayEquals(new long[] { 101 }, readEntries(indexReader, termId("52"))); + + } + + private long[] readEntries(FullReverseIndexReader reader, long wordId) { + var es = reader.documents(wordId); + assertTrue(es.hasMore()); + LongQueryBuffer buffer = new LongQueryBuffer(4); + es.read(buffer); + assertFalse(es.hasMore()); + return buffer.copyData(); + } + + private FullReverseIndexReader createIndex(EntryDataWithWordMeta... 
scenario) throws IOException { + var reader = journalFactory.createReader(scenario); + + Path posFile = tempDir.resolve("positions.dat"); + Path docsFile = tempDir.resolve("docs.dat"); + Path wordsFile = tempDir.resolve("words.dat"); + + try (var positionsFileConstructor = new PositionsFileConstructor(posFile)) { + var preindex = FullPreindex.constructPreindex(reader, + positionsFileConstructor, + DocIdRewriter.identity(), tempDir); + preindex.finalizeIndex(docsFile, wordsFile); + preindex.delete(); + } + + return new FullReverseIndexReader("test", wordsFile, docsFile, new PositionsFileReader(posFile)); + + } +} \ No newline at end of file diff --git a/code/index/index-reverse/test/nu/marginalia/index/PositionsFileReaderTest.java b/code/index/index-reverse/test/nu/marginalia/index/PositionsFileReaderTest.java new file mode 100644 index 00000000..6d512333 --- /dev/null +++ b/code/index/index-reverse/test/nu/marginalia/index/PositionsFileReaderTest.java @@ -0,0 +1,63 @@ +package nu.marginalia.index; + +import it.unimi.dsi.fastutil.ints.IntList; +import nu.marginalia.index.construction.PositionsFileConstructor; +import nu.marginalia.index.positions.PositionsFileReader; +import nu.marginalia.index.positions.TermData; +import nu.marginalia.sequence.VarintCodedSequence; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.lang.foreign.Arena; +import java.nio.ByteBuffer; +import java.nio.file.Files; +import java.nio.file.Path; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +class PositionsFileReaderTest { + + Path file; + + @BeforeEach + void setUp() throws IOException { + file = Files.createTempFile("positions", "dat"); + } + @AfterEach + void tearDown() throws IOException { + Files.delete(file); + } + + @Test + void getTermData() throws IOException { + ByteBuffer workArea = ByteBuffer.allocate(8192); + long key1, key2, key3; + try (PositionsFileConstructor constructor = new PositionsFileConstructor(file)) { + key1 = constructor.add((byte) 43, VarintCodedSequence.generate(1, 2, 3).buffer()); + key2 = constructor.add((byte) 51, VarintCodedSequence.generate(2, 3, 5, 1000, 5000, 20241).buffer()); + key3 = constructor.add((byte) 61, VarintCodedSequence.generate(3, 5, 7).buffer()); + } + + System.out.println("key1: " + Long.toHexString(key1)); + System.out.println("key2: " + Long.toHexString(key2)); + System.out.println("key3: " + Long.toHexString(key3)); + + try (Arena arena = Arena.ofConfined(); + PositionsFileReader reader = new PositionsFileReader(file)) + { + TermData data1 = reader.getTermData(arena, key1); + assertEquals(43, data1.flags()); + assertEquals(IntList.of( 1, 2, 3), data1.positions().values()); + + TermData data2 = reader.getTermData(arena, key2); + assertEquals(51, data2.flags()); + assertEquals(IntList.of(2, 3, 5, 1000, 5000, 20241), data2.positions().values()); + + TermData data3 = reader.getTermData(arena, key3); + assertEquals(61, data3.flags()); + assertEquals(IntList.of(3, 5, 7), data3.positions().values()); + } + } +} \ No newline at end of file diff --git a/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexDebugTest.java b/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexDebugTest.java index 6f612a06..359e9396 100644 --- a/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexDebugTest.java +++ b/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexDebugTest.java @@ -26,7 +26,7 @@ public class 
ReverseIndexDebugTest { long wordOffset = wordsBTreeReader.findEntry(problemWord); assertTrue(wordOffset >= 0); - var docsReader = new BTreeReader(documents, ReverseIndexParameters.docsBTreeContext, wordOffset); + var docsReader = new BTreeReader(documents, ReverseIndexParameters.prioDocsBTreeContext, wordOffset); // We find problemDoc even though it doesn't exist in the document range long docOffset = docsReader.findEntry(problemDoc); diff --git a/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java b/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java deleted file mode 100644 index 265864c4..00000000 --- a/code/index/index-reverse/test/nu/marginalia/index/ReverseIndexReaderTest.java +++ /dev/null @@ -1,104 +0,0 @@ -package nu.marginalia.index; - -import nu.marginalia.array.page.LongQueryBuffer; -import nu.marginalia.index.construction.DocIdRewriter; -import nu.marginalia.index.construction.ReversePreindex; -import nu.marginalia.index.construction.TestJournalFactory; -import nu.marginalia.index.construction.TestJournalFactory.EntryDataWithWordMeta; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.ArrayList; -import java.util.List; - -import static nu.marginalia.index.construction.TestJournalFactory.wm; -import static org.junit.jupiter.api.Assertions.*; - -class ReverseIndexReaderTest { - TestJournalFactory journalFactory; - Path tempDir; - - @BeforeEach - public void setUp() throws IOException { - journalFactory = new TestJournalFactory(); - - tempDir = Files.createTempDirectory("sort"); - } - - @AfterEach - public void tearDown() throws IOException { - journalFactory.clear(); - - List contents = new ArrayList<>(); - Files.list(tempDir).forEach(contents::add); - for (var tempFile : contents) { - Files.delete(tempFile); - } - Files.delete(tempDir); - } - - @Test - public void testSimple() throws IOException { - - var indexReader = createIndex( - new EntryDataWithWordMeta(100, 101, wm(50, 51)) - ); - - assertEquals(1, indexReader.numDocuments(50)); - - long[] meta = indexReader.getTermMeta(50, new long[] { 100 }); - assertArrayEquals(new long[] { 51 }, meta); - assertArrayEquals(new long[] { 100 }, readEntries(indexReader, 50)); - } - - @Test - public void test2x2() throws IOException { - - var indexReader = createIndex( - new EntryDataWithWordMeta(100, 101, wm(50, 51), wm(51, 52)), - new EntryDataWithWordMeta(101, 101, wm(51, 53), wm(52, 54)) - ); - - assertEquals(1, indexReader.numDocuments(50)); - assertEquals(2, indexReader.numDocuments(51)); - assertEquals(1, indexReader.numDocuments(52)); - - assertArrayEquals(new long[] { 51 }, indexReader.getTermMeta(50, new long[] { 100 })); - assertArrayEquals(new long[] { 100 }, readEntries(indexReader, 50)); - - assertArrayEquals(new long[] { 52, 53 }, indexReader.getTermMeta(51, new long[] { 100, 101 })); - assertArrayEquals(new long[] { 100, 101 }, readEntries(indexReader, 51)); - - assertArrayEquals(new long[] { 54 }, indexReader.getTermMeta(52, new long[] { 101 })); - assertArrayEquals(new long[] { 101 }, readEntries(indexReader, 52)); - - } - - private long[] readEntries(ReverseIndexReader reader, long wordId) { - var es = reader.documents(wordId); - assertTrue(es.hasMore()); - LongQueryBuffer buffer = new LongQueryBuffer(4); - es.read(buffer); - assertFalse(es.hasMore()); - return buffer.copyData(); - } - - private 
ReverseIndexReader createIndex(EntryDataWithWordMeta... scenario) throws IOException { - var reader = journalFactory.createReader(scenario); - var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir); - - - Path docsFile = tempDir.resolve("docs.dat"); - Path wordsFile = tempDir.resolve("words.dat"); - - preindex.finalizeIndex(docsFile, wordsFile); - preindex.delete(); - - return new ReverseIndexReader("test", wordsFile, docsFile); - - } -} \ No newline at end of file diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexMergeTest.java b/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexMergeTest.java deleted file mode 100644 index 1a173d9a..00000000 --- a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexMergeTest.java +++ /dev/null @@ -1,424 +0,0 @@ - -package nu.marginalia.index.construction; - -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.*; - -import static nu.marginalia.index.construction.TestJournalFactory.*; -import static org.junit.jupiter.api.Assertions.assertEquals; - -class ReversePreindexMergeTest { - TestJournalFactory journalFactory; - Path countsFile; - Path wordsIdFile; - Path docsFile; - Path tempDir; - - @BeforeEach - public void setUp() throws IOException { - journalFactory = new TestJournalFactory(); - - countsFile = Files.createTempFile("counts", ".dat"); - wordsIdFile = Files.createTempFile("words", ".dat"); - docsFile = Files.createTempFile("docs", ".dat"); - tempDir = Files.createTempDirectory("sort"); - } - - @AfterEach - public void tearDown() throws IOException { - journalFactory.clear(); - - Files.deleteIfExists(countsFile); - Files.deleteIfExists(wordsIdFile); - List contents = new ArrayList<>(); - Files.list(tempDir).forEach(contents::add); - for (var tempFile : contents) { - Files.delete(tempFile); - } - Files.delete(tempDir); - } - - public ReversePreindex runMergeScenario( - List leftData, - List rightData - ) throws IOException { - var reader1 = journalFactory.createReader(leftData.toArray(EntryDataWithWordMeta[]::new)); - var reader2 = journalFactory.createReader(rightData.toArray(EntryDataWithWordMeta[]::new)); - - var left = ReversePreindex.constructPreindex(reader1, DocIdRewriter.identity(), tempDir); - var right = ReversePreindex.constructPreindex(reader2, DocIdRewriter.identity(), tempDir); - return ReversePreindex.merge(tempDir, left, right); - } - - private List getData(ReversePreindex merged) { - var iter = merged.segments.iterator(2); - List actual = new ArrayList<>(); - while (iter.next()) { - long[] data = new long[(int) (iter.endOffset - iter.startOffset)]; - merged.documents.slice(iter.startOffset, iter.endOffset).get(0, data); - actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset, - data)); - } - return actual; - } - - @Test - public void testDocsMergeSingleNoOverlap() throws IOException { - - IdSequence docIds = new IdSequence(); - IdSequence docMetas = new IdSequence(); - IdSequence wordMetas = new IdSequence(); - IdSequence wordIds = new IdSequence(); - - var leftSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), wm(wordIds.nextUnique(), wordMetas.nextUnique()))); - var rightSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), 
wm(wordIds.nextUnique(), wordMetas.nextUnique()))); - - var merged = runMergeScenario( - leftSequence, - rightSequence - ); - - var actual = getData(merged); - - var expected = simulateMerge(leftSequence, rightSequence); - - System.out.println(actual); - assertEquals(expected, actual); - } - - @Test - public void testDocsMergeSingleOnlyOverlap() throws IOException { - - IdSequence docIds = new IdSequence(); - IdSequence docMetas = new IdSequence(); - IdSequence wordMetas = new IdSequence(); - IdSequence wordIds = new IdSequence(); - - var leftSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), wm(wordIds.nextUnique(), wordMetas.nextUnique()))); - var rightSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), wm(wordIds.alreadySeenSameSequence(), wordMetas.nextUnique()))); - - var merged = runMergeScenario( - leftSequence, - rightSequence - ); - - var actual = getData(merged); - - var expected = simulateMerge(leftSequence, rightSequence); - - System.out.println(actual); - assertEquals(expected, actual); - } - - @Test - public void testDocsMergeSingleOnlyOverlap2() throws IOException { - - long wid1 = 1; - long wid2 = 2; - IdSequence docIds = new IdSequence(); - IdSequence docMetas = new IdSequence(); - IdSequence wordMetas = new IdSequence(); - - var leftSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), - wm(wid1, wordMetas.nextUnique()), - wm(wid2, wordMetas.nextUnique()) - )); - var rightSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), - wm(wid1, wordMetas.nextUnique()), - wm(wid2, wordMetas.nextUnique()) - )); - - var merged = runMergeScenario( - leftSequence, - rightSequence - ); - - var actual = getData(merged); - - var expected = simulateMerge(leftSequence, rightSequence); - - System.out.println(actual); - assertEquals(expected, actual); - } - - @Test - public void testBadCase1() throws IOException { - long wordId = 0xF00F00BA3L; - - List leftSequence = List.of(new EntryDataWithWordMeta(40, 50, - wm(wordId, 5)) - ); - List rightSequence = List.of(new EntryDataWithWordMeta(41, 51, - wm(wordId, 3), - wm(wordId, 4)) - ); - - var mergedLR = runMergeScenario( - leftSequence, - rightSequence - ); - var mergedRL = runMergeScenario( - rightSequence, - leftSequence - ); - - var actualLR = getData(mergedLR); - var actualRL = getData(mergedRL); - - var expected = simulateMerge(leftSequence, rightSequence); - - assertEquals(actualLR, actualRL); - - if (!expected.equals(actualLR)) { - System.out.println("*fail*"); - System.out.println(leftSequence); - System.out.println(rightSequence); - } - else { - System.out.println("*pass*"); - } - - assertEquals(expected, actualLR); - - } - - @Test - public void testBadCase2() throws IOException { - long wordId = 100; - - List leftSequence = List.of( - new EntryDataWithWordMeta(1, 50, wm(wordId, 5)), - new EntryDataWithWordMeta(2, 50, wm(wordId, 5)) - - ); - List rightSequence = List.of( - new EntryDataWithWordMeta(3, 50, wm(wordId, 5)) - ); - - var mergedLR = runMergeScenario( - leftSequence, - rightSequence - ); - var mergedRL = runMergeScenario( - rightSequence, - leftSequence - ); - - var actualLR = getData(mergedLR); - var actualRL = getData(mergedRL); - - var expected = simulateMerge(leftSequence, rightSequence); - - assertEquals(actualLR, actualRL); - - if (!expected.equals(actualLR)) { - System.out.println("*fail*"); - System.out.println(leftSequence); - System.out.println(rightSequence); - } - else { - 
System.out.println("*pass*"); - } - - assertEquals(expected, actualLR); - - } - - @Test - public void testFuzz() throws IOException { - Random r = new Random(); - int maxDocs = 150; - int maxWords = 160; - int nIters = 1000; - - for (int i = 0; i < nIters; i++) { - int nLeft = 1 + r.nextInt(maxDocs); - int nRight = 1 + r.nextInt(maxDocs); - - IdSequence docIdsLeft = new IdSequence(); - IdSequence docIdsRight = new IdSequence(); - IdSequence docMetas = new IdSequence(); - IdSequence wordMetas = new IdSequence(); - IdSequence wordIds = new IdSequence(); - - List leftSequence = new ArrayList<>(nLeft); - for (int j = 0; j < nLeft; j++) { - WordWithMeta[] words = new WordWithMeta[maxWords == 1 ? 1 : r.nextInt(1, maxWords)]; - Arrays.setAll(words, idx -> { - long wordId = wordIds.seenWithP(1.0); - long wordMeta = wordMetas.nextUniqueAssociatedWithKey(wordId); - return wm(wordId, wordMeta); - }); - - long docId = docIdsLeft.nextUnique(); - long docMeta = docMetas.nextUniqueAssociatedWithKey(docId); - leftSequence.add(new EntryDataWithWordMeta(docId, docMeta, words)); - } - - List rightSequence = new ArrayList<>(nLeft); - for (int j = 0; j < nRight; j++) { - WordWithMeta[] words = new WordWithMeta[maxWords == 1 ? 1 : r.nextInt(1, maxWords)]; - Arrays.setAll(words, idx -> { - long wordId = wordIds.seenWithP(1.0); - long wordMeta = wordMetas.nextUniqueAssociatedWithKey(wordId); - return wm(wordId, wordMeta); - }); - - long docId = docIdsRight.seenWithP(docIdsLeft, 0.1); - long docMeta = docMetas.nextUniqueAssociatedWithKey(docId); - rightSequence.add(new EntryDataWithWordMeta(docId, docMeta, words)); - } - - var mergedLR = runMergeScenario( - leftSequence, - rightSequence - ); - var mergedRL = runMergeScenario( - rightSequence, - leftSequence - ); - - var actualLR = getData(mergedLR); - var actualRL = getData(mergedRL); - - var expected = simulateMerge(leftSequence, rightSequence); - - assertEquals(actualLR, actualRL); - - if (!expected.equals(actualLR)) { - System.out.println("*fail*"); - System.out.println(leftSequence); - System.out.println(rightSequence); - } - else { - System.out.println("*pass*"); - } - - assertEquals(expected, actualLR); - - } - } - - - public List simulateMerge( - Collection leftInputs, - Collection rightInputs - ) { - TreeMap> wordToDocs = new TreeMap<>(); - - for (var entry : leftInputs) { - for (var wm : entry.wordIds()) { - wordToDocs.computeIfAbsent(wm.wordId(), w -> new ArrayList<>()).add( - new DocWithMeta(entry.docId(), wm.meta()) - ); - } - } - for (var entry : rightInputs) { - for (var wm : entry.wordIds()) { - wordToDocs.computeIfAbsent(wm.wordId(), w -> new ArrayList<>()).add( - new DocWithMeta(entry.docId(), wm.meta()) - ); - } - } - - List ret = new ArrayList<>(); - int[] start = new int[1]; - wordToDocs.forEach((wordId, docsList) -> { - docsList.sort(Comparator.naturalOrder()); - var iter = docsList.iterator(); - DocWithMeta prevVal = null; - DocWithMeta currentVal; - while (iter.hasNext()) { - currentVal = iter.next(); - if (prevVal != null) { - if (currentVal.docId == prevVal.docId) { - iter.remove(); - } - } - prevVal = currentVal; - - } - long[] data = new long[docsList.size()*2]; - for (int i = 0; i < docsList.size(); i++) { - data[2*i] = docsList.get(i).docId; - data[2*i + 1] = docsList.get(i).meta; - } - ret.add(new TestSegmentData(wordId, start[0], start[0] + data.length, data)); - - start[0] += data.length; - }); - return ret; - } - - - record DocWithMeta(long docId, long meta) implements Comparable { - - @Override - public int compareTo(DocWithMeta 
o) { - return Long.compare(docId, o.docId); - } - } - - class IdSequence { - Set seen = new HashSet<>(); - Map associatedValues = new HashMap<>(); - private Random random = new Random(); - - /** Return alreadySeen() with probability p, - * else nextUnique() - */ - public long seenWithP(double p) { - if (isEmpty() || random.nextDouble() > p) - return nextUnique(); - - return alreadySeenSameSequence(); - } - - public long seenWithP(IdSequence other, double p) { - if (isEmpty() || random.nextDouble() > p) - return nextUnique(); - - return alreadySeenOtherSequence(other); - } - - public long nextUnique() { - for (;;) { - long val = random.nextLong(); - if (seen.add(val)) { - return val; - } - } - } - - public long nextUniqueAssociatedWithKey(long key) { - return associatedValues.computeIfAbsent(key, k -> nextUnique()); - } - - public long alreadySeenSameSequence() { - long[] values = seen.stream().mapToLong(Long::longValue).toArray(); - int idx = random.nextInt(0, values.length); - return values[idx]; - } - - public long alreadySeenOtherSequence(IdSequence other) { - List values = new ArrayList<>(other.seen); - Collections.shuffle(values); - for (Long maybe : values) { - if (seen.add(maybe)) - return maybe; - } - return nextUnique(); - } - - public boolean isEmpty() { - return seen.isEmpty(); - } - } - -} \ No newline at end of file diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexWordSegmentsTest.java b/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexWordSegmentsTest.java deleted file mode 100644 index 0ad3205a..00000000 --- a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexWordSegmentsTest.java +++ /dev/null @@ -1,231 +0,0 @@ -package nu.marginalia.index.construction; - -import nu.marginalia.array.LongArray; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.ArrayList; -import java.util.List; - -import static nu.marginalia.index.construction.TestJournalFactory.*; -import static org.junit.jupiter.api.Assertions.*; - -class ReversePreindexWordSegmentsTest { - Path countsFile; - Path wordsIdFile; - Path docsFile; - Path tempDir; - - TestJournalFactory journalFactory; - - @BeforeEach - public void setUp() throws IOException { - journalFactory = new TestJournalFactory(); - - countsFile = Files.createTempFile("counts", ".dat"); - wordsIdFile = Files.createTempFile("words", ".dat"); - docsFile = Files.createTempFile("docs", ".dat"); - tempDir = Files.createTempDirectory("sort"); - } - - @AfterEach - public void tearDown() throws IOException { - journalFactory.clear(); - - Files.deleteIfExists(countsFile); - Files.deleteIfExists(wordsIdFile); - List contents = new ArrayList<>(); - Files.list(tempDir).forEach(contents::add); - for (var tempFile : contents) { - Files.delete(tempFile); - } - Files.delete(tempDir); - } - @Test - public void testWordSegmentsLongWordId() throws IOException { - var reader = journalFactory.createReader( - new EntryData(-0xF00BA3L, 0, 1L<<33) - ); - - var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile); - var iter = segments.iterator(1); - - List expected = List.of( - new TestSegmentData(1L<<33, 0, 1) - ); - - List actual = new ArrayList<>(); - - while (iter.next()) { - actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset)); - } - - 
assertEquals(expected, actual); - } - @Test - public void testWordSegmentsRepeatedWordId() throws IOException { - var reader = journalFactory.createReader( - new EntryData(-0xF00BA3L, 0, 5, 5) - ); - - var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile); - var iter = segments.iterator(1); - - List expected = List.of( - new TestSegmentData(5, 0, 2) - ); - - List actual = new ArrayList<>(); - - while (iter.next()) { - actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset)); - } - - assertEquals(expected, actual); - } - - @Test - public void testWordSegments1() throws IOException { - var reader = journalFactory.createReader( - new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33) - ); - - var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile); - var iter = segments.iterator(1); - - List expected = List.of( - new TestSegmentData(-100, 0, 1), - new TestSegmentData(10, 1, 2), - new TestSegmentData(33, 2, 3), - new TestSegmentData(40, 3, 4) - ); - - List actual = new ArrayList<>(); - - while (iter.next()) { - actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset)); - } - - assertEquals(expected, actual); - } - - @Test - public void testWordSegments2() throws IOException { - var reader = journalFactory.createReader( - new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33), - new EntryData(0xF00BA4L, 0, 15, 30, -100, 33) - ); - - var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile); - var iter = segments.iterator(1); - - List expected = List.of( - new TestSegmentData(-100, 0, 2), - new TestSegmentData(10, 2, 3), - new TestSegmentData(15, 3, 4), - new TestSegmentData(30, 4, 5), - new TestSegmentData(33, 5, 7), - new TestSegmentData(40, 7, 8) - ); - - List actual = new ArrayList<>(); - - while (iter.next()) { - actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset)); - } - - assertEquals(expected, actual); - } - - - @Test - public void testWordSegments_ReadIterator() { - LongArray wordsArray = LongArray.allocate(4); - LongArray countsArray = LongArray.allocate(4); - wordsArray.set(0, -1, -2, -3, -4); - countsArray.set(0, 2, 1, 3, 5); - var segments = new ReversePreindexWordSegments(wordsArray, countsArray, null, null); - - var ritr = segments.iterator(1); - assertTrue(ritr.hasMorePositions()); - assertTrue(ritr.next()); - assertTrue(ritr.isPositionBeforeEnd()); - assertEquals(-1, ritr.wordId); - assertEquals(0, ritr.idx()); - assertEquals(0, ritr.startOffset); - assertEquals(2, ritr.endOffset); - - assertTrue(ritr.hasMorePositions()); - assertTrue(ritr.next()); - assertTrue(ritr.isPositionBeforeEnd()); - assertEquals(-2, ritr.wordId); - assertEquals(1, ritr.idx()); - assertEquals(2, ritr.startOffset); - assertEquals(3, ritr.endOffset); - - assertTrue(ritr.hasMorePositions()); - assertTrue(ritr.next()); - assertTrue(ritr.isPositionBeforeEnd()); - assertEquals(-3, ritr.wordId); - assertEquals(2, ritr.idx()); - assertEquals(3, ritr.startOffset); - assertEquals(6, ritr.endOffset); - - assertTrue(ritr.hasMorePositions()); - assertTrue(ritr.next()); - assertTrue(ritr.isPositionBeforeEnd()); - assertEquals(-4, ritr.wordId); - assertEquals(3, ritr.idx()); - assertEquals(6, ritr.startOffset); - assertEquals(11, ritr.endOffset); - - assertFalse(ritr.hasMorePositions()); - assertFalse(ritr.next()); - assertFalse(ritr.isPositionBeforeEnd()); - - assertEquals(Long.MIN_VALUE, ritr.wordId); - } - - - @Test - public void testWordSegments_ConstructionIterator() { - 
LongArray wordsArray = LongArray.allocate(4); - LongArray countsArray = LongArray.allocate(4); - wordsArray.set(0, -1, -2, -3, -4); - var segments = new ReversePreindexWordSegments(wordsArray, countsArray, null, null); - - var citr = segments.constructionIterator(1); - assertEquals(-1, citr.wordId); - assertEquals(0, citr.idx()); - assertTrue(citr.canPutMore()); - assertTrue(citr.putNext(1)); - assertEquals(1, countsArray.get(0)); - - assertEquals(-2, citr.wordId); - assertEquals(1, citr.idx()); - assertTrue(citr.canPutMore()); - assertTrue(citr.putNext(2)); - assertEquals(2, countsArray.get(1)); - - assertEquals(-3, citr.wordId); - assertEquals(2, citr.idx()); - assertTrue(citr.canPutMore()); - assertTrue(citr.putNext(3)); - assertEquals(3, countsArray.get(2)); - - assertEquals(-4, citr.wordId); - assertEquals(3, citr.idx()); - assertTrue(citr.canPutMore()); - assertFalse(citr.putNext(4)); - assertEquals(4, countsArray.get(3)); - - assertEquals(4, citr.idx()); - assertFalse(citr.canPutMore()); - assertEquals(Long.MIN_VALUE, citr.wordId); - } - -} \ No newline at end of file diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/TestJournalFactory.java b/code/index/index-reverse/test/nu/marginalia/index/construction/TestJournalFactory.java deleted file mode 100644 index b122921b..00000000 --- a/code/index/index-reverse/test/nu/marginalia/index/construction/TestJournalFactory.java +++ /dev/null @@ -1,93 +0,0 @@ -package nu.marginalia.index.construction; - -import nu.marginalia.index.journal.model.IndexJournalEntryData; -import nu.marginalia.index.journal.model.IndexJournalEntryHeader; -import nu.marginalia.index.journal.reader.IndexJournalReader; -import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile; -import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -public class TestJournalFactory { - Path tempDir = Files.createTempDirectory("journal"); - - public TestJournalFactory() throws IOException {} - - public void clear() throws IOException { - List toDelete = new ArrayList<>(); - try (var dirStream = Files.list(tempDir)) { - dirStream.forEach(toDelete::add); - } - for (var tempFile : toDelete) { - Files.delete(tempFile); - } - Files.delete(tempDir); - } - - public record EntryData(long docId, long docMeta, long... wordIds) { - @Override - public String toString() { - return "EntryData{" + - "docId=" + docId + - ", docMeta=" + docMeta + - ", wordIds=" + Arrays.toString(wordIds) + - '}'; - } - } - public record EntryDataWithWordMeta(long docId, long docMeta, WordWithMeta... wordIds) { - @Override - public String toString() { - return "EntryDataWithWordMeta{" + - "docId=" + docId + - ", docMeta=" + docMeta + - ", wordIds=" + Arrays.toString(wordIds) + - '}'; - } - } - public record WordWithMeta(long wordId, long meta) {} - - public static WordWithMeta wm(long wordId, long meta) { - return new WordWithMeta(wordId, meta); - } - - IndexJournalReader createReader(EntryData... 
entries) throws IOException { - Path jf = Files.createTempFile(tempDir, "journal", ".dat"); - - var writer = new IndexJournalWriterSingleFileImpl(jf); - for (var entry : entries) { - long[] data = new long[entry.wordIds.length * 2]; - for (int i = 0; i < entry.wordIds.length; i++) - data[i*2] = entry.wordIds[i]; - - writer.put(new IndexJournalEntryHeader(entries.length, 0, entry.docId, entry.docMeta), - new IndexJournalEntryData(data)); - } - writer.close(); - var ret = new IndexJournalReaderSingleFile(jf); - return ret; - } - - public IndexJournalReader createReader(EntryDataWithWordMeta... entries) throws IOException { - Path jf = Files.createTempFile(tempDir, "journal", ".dat"); - - var writer = new IndexJournalWriterSingleFileImpl(jf); - for (var entry : entries) { - long[] data = new long[entry.wordIds.length * 2]; - for (int i = 0; i < entry.wordIds.length; i++) { - data[i * 2] = entry.wordIds[i].wordId; - data[i * 2 + 1] = entry.wordIds[i].meta; - } - - writer.put(new IndexJournalEntryHeader(entries.length, 0, entry.docId, entry.docMeta), - new IndexJournalEntryData(data)); - } - writer.close(); - var ret = new IndexJournalReaderSingleFile(jf); - return ret; - } -} diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexDocsTest.java b/code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexDocsTest.java similarity index 65% rename from code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexDocsTest.java rename to code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexDocsTest.java index d6d81818..8f6e6a14 100644 --- a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexDocsTest.java +++ b/code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexDocsTest.java @@ -1,5 +1,8 @@ -package nu.marginalia.index.construction; +package nu.marginalia.index.construction.full; +import nu.marginalia.hash.MurmurHash3_128; +import nu.marginalia.index.construction.DocIdRewriter; +import nu.marginalia.index.construction.PositionsFileConstructor; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -11,14 +14,15 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import static nu.marginalia.index.construction.TestJournalFactory.EntryData; +import static nu.marginalia.index.construction.full.TestJournalFactory.EntryData; import static org.junit.jupiter.api.Assertions.assertEquals; -class ReversePreindexDocsTest { +class FullPreindexDocsTest { Path countsFile; Path wordsIdFile; Path docsFile; Path tempDir; + Path positionsFile; TestJournalFactory journalFactory; @@ -30,6 +34,7 @@ class ReversePreindexDocsTest { wordsIdFile = Files.createTempFile("words", ".dat"); docsFile = Files.createTempFile("docs", ".dat"); tempDir = Files.createTempDirectory("sort"); + positionsFile = tempDir.resolve("positions.dat"); } @AfterEach @@ -38,6 +43,9 @@ class ReversePreindexDocsTest { Files.deleteIfExists(countsFile); Files.deleteIfExists(wordsIdFile); + Files.deleteIfExists(positionsFile); + Files.deleteIfExists(docsFile); + List contents = new ArrayList<>(); Files.list(tempDir).forEach(contents::add); for (var tempFile : contents) { @@ -46,33 +54,9 @@ class ReversePreindexDocsTest { Files.delete(tempDir); } - @Test - public void testDocs() throws IOException { - var reader = journalFactory.createReader( - new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33) - ); - - var segments 
= ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile); - var docs = ReversePreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), segments); - - List expected = List.of( - new TestSegmentData(-100, 0, 2, new long[] { -0xF00BA3L, 0 }), - new TestSegmentData(10, 2, 4, new long[] { -0xF00BA3L, 0 }), - new TestSegmentData(33, 4, 6, new long[] { -0xF00BA3L, 0 }), - new TestSegmentData(40, 6, 8, new long[] { -0xF00BA3L, 0 }) - ); - - List actual = new ArrayList<>(); - - var iter = segments.iterator(2); - while (iter.next()) { - long[] data = new long[(int) (iter.endOffset - iter.startOffset)]; - docs.slice(iter.startOffset, iter.endOffset).get(0, data); - actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset, - data)); - } - - assertEquals(expected, actual); + MurmurHash3_128 hash = new MurmurHash3_128(); + long termId(String keyword) { + return hash.hashKeyword(keyword); } @Test @@ -81,11 +65,13 @@ class ReversePreindexDocsTest { new EntryData(-0xF00BA3L, 0, 4, 4) ); - var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile); - var docs = ReversePreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), segments); + var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile); + var docs = FullPreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), + new PositionsFileConstructor(positionsFile), + segments); List expected = List.of( - new TestSegmentData(4, 0, 4, new long[] { -0xF00BA3L, 0, -0xF00BA3L, 0 }) + new TestSegmentData(termId("4"), 0, 4, new long[] { -0xF00BA3L, 0, -0xF00BA3L, 0 }) ); List actual = new ArrayList<>(); @@ -100,6 +86,7 @@ class ReversePreindexDocsTest { assertEquals(expected, actual); } + @Test public void testDocs2() throws IOException { var reader = journalFactory.createReader( @@ -107,8 +94,10 @@ class ReversePreindexDocsTest { new EntryData(0xF00BA4L, 0, 15, 30, -100, 33) ); - var segments = ReversePreindexWordSegments.construct(reader, wordsIdFile, countsFile); - var docs = ReversePreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), segments); + var segments = FullPreindexWordSegments.construct(reader, wordsIdFile, countsFile); + var docs = FullPreindexDocuments.construct(docsFile, tempDir, reader, DocIdRewriter.identity(), + new PositionsFileConstructor(positionsFile), + segments); List expected = List.of( new TestSegmentData(-100, 0, 4, new long[] { -0xF00BA3L, 0, 0xF00BA4L, 0 }), @@ -145,15 +134,15 @@ class ReversePreindexDocsTest { if (wordId != that.wordId) return false; if (start != that.start) return false; if (end != that.end) return false; - return Arrays.equals(data, that.data); + return data[0] == that.data[0]; //Arrays.equals(data, that.data); } @Override public int hashCode() { - int result = (int) (wordId ^ (wordId >>> 32)); - result = 31 * result + (int) (start ^ (start >>> 32)); - result = 31 * result + (int) (end ^ (end >>> 32)); - result = 31 * result + Arrays.hashCode(data); + int result = Long.hashCode(wordId); + result = 31 * result + Long.hashCode(start); + result = 31 * result + Long.hashCode(end); + result = 31 * result + Long.hashCode(data[0]); return result; } diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexFinalizeTest.java b/code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexFinalizeTest.java similarity index 75% rename from 
code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexFinalizeTest.java rename to code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexFinalizeTest.java index 1ef2df4e..253e0d52 100644 --- a/code/index/index-reverse/test/nu/marginalia/index/construction/ReversePreindexFinalizeTest.java +++ b/code/index/index-reverse/test/nu/marginalia/index/construction/full/FullPreindexFinalizeTest.java @@ -1,9 +1,11 @@ -package nu.marginalia.index.construction; +package nu.marginalia.index.construction.full; import nu.marginalia.array.LongArrayFactory; -import nu.marginalia.btree.BTreeReader; import nu.marginalia.btree.model.BTreeHeader; +import nu.marginalia.hash.MurmurHash3_128; +import nu.marginalia.index.construction.DocIdRewriter; +import nu.marginalia.index.construction.PositionsFileConstructor; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -11,14 +13,17 @@ import org.junit.jupiter.api.Test; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.util.*; +import java.util.ArrayList; +import java.util.List; -import static nu.marginalia.index.construction.TestJournalFactory.*; +import static nu.marginalia.index.construction.full.TestJournalFactory.EntryDataWithWordMeta; +import static nu.marginalia.index.construction.full.TestJournalFactory.wm; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; -class ReversePreindexFinalizeTest { +class FullPreindexFinalizeTest { TestJournalFactory journalFactory; + Path positionsFile; Path countsFile; Path wordsIdFile; Path docsFile; @@ -28,6 +33,7 @@ class ReversePreindexFinalizeTest { public void setUp() throws IOException { journalFactory = new TestJournalFactory(); + positionsFile = Files.createTempFile("positions", ".dat"); countsFile = Files.createTempFile("counts", ".dat"); wordsIdFile = Files.createTempFile("words", ".dat"); docsFile = Files.createTempFile("docs", ".dat"); @@ -48,10 +54,17 @@ class ReversePreindexFinalizeTest { Files.delete(tempDir); } + MurmurHash3_128 hash = new MurmurHash3_128(); + long termId(String keyword) { + return hash.hashKeyword(keyword); + } + @Test public void testFinalizeSimple() throws IOException { var reader = journalFactory.createReader(new EntryDataWithWordMeta(100, 101, wm(50, 51))); - var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir); + var preindex = FullPreindex.constructPreindex(reader, + new PositionsFileConstructor(positionsFile), + DocIdRewriter.identity(), tempDir); preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat")); @@ -76,9 +89,7 @@ class ReversePreindexFinalizeTest { assertEquals(1, wordsHeader.numEntries()); assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0)); - assertEquals(51, docsArray.get(docsHeader.dataOffsetLongs() + 1)); - assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs())); - assertEquals(0, wordsArray.get(wordsHeader.dataOffsetLongs() + 1)); + assertEquals(termId("50"), wordsArray.get(wordsHeader.dataOffsetLongs())); } @@ -89,7 +100,9 @@ class ReversePreindexFinalizeTest { new EntryDataWithWordMeta(101, 101, wm(51, 52)) ); - var preindex = ReversePreindex.constructPreindex(reader, DocIdRewriter.identity(), tempDir); + var preindex = FullPreindex.constructPreindex(reader, + new PositionsFileConstructor(positionsFile), + DocIdRewriter.identity(), tempDir); 
preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat")); preindex.delete(); @@ -116,10 +129,8 @@ class ReversePreindexFinalizeTest { long offset1 = wordsArray.get(wordsHeader.dataOffsetLongs() + 1); long offset2 = wordsArray.get(wordsHeader.dataOffsetLongs() + 3); - assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs())); - assertEquals(0, wordsArray.get(wordsHeader.dataOffsetLongs() + 1)); - assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs())); - assertEquals(0, wordsArray.get(wordsHeader.dataOffsetLongs() + 1)); + assertEquals(termId("50"), wordsArray.get(wordsHeader.dataOffsetLongs())); + assertEquals(termId("50"), wordsArray.get(wordsHeader.dataOffsetLongs())); BTreeHeader docsHeader; @@ -128,13 +139,11 @@ class ReversePreindexFinalizeTest { assertEquals(1, docsHeader.numEntries()); assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0)); - assertEquals(51, docsArray.get(docsHeader.dataOffsetLongs() + 1)); docsHeader = new BTreeHeader(docsArray, offset2); System.out.println(docsHeader); assertEquals(1, docsHeader.numEntries()); assertEquals(101, docsArray.get(docsHeader.dataOffsetLongs() + 0)); - assertEquals(52, docsArray.get(docsHeader.dataOffsetLongs() + 1)); } } \ No newline at end of file diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/full/TestJournalFactory.java b/code/index/index-reverse/test/nu/marginalia/index/construction/full/TestJournalFactory.java new file mode 100644 index 00000000..1be94b55 --- /dev/null +++ b/code/index/index-reverse/test/nu/marginalia/index/construction/full/TestJournalFactory.java @@ -0,0 +1,130 @@ +package nu.marginalia.index.construction.full; + +import nu.marginalia.index.journal.IndexJournalPage; +import nu.marginalia.index.journal.IndexJournalSlopWriter; +import nu.marginalia.model.processed.SlopDocumentRecord; +import nu.marginalia.sequence.VarintCodedSequence; +import nu.marginalia.test.TestUtil; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.List; +import java.util.Objects; + +public class TestJournalFactory { + Path tempDir = Files.createTempDirectory("journal"); + + public TestJournalFactory() throws IOException {} + + public void clear() throws IOException { + TestUtil.clearTempDir(tempDir); + } + + public record EntryData(long docId, long docMeta, String... wordIds) { + public EntryData(long docId, long docMeta, long... wordIds) { + this(docId, docMeta, Arrays.stream(wordIds).mapToObj(String::valueOf).toArray(String[]::new)); + } + @Override + public String toString() { + return "EntryData{" + + "docId=" + docId + + ", docMeta=" + docMeta + + ", wordIds=" + Arrays.toString(wordIds) + + '}'; + } + } + public record EntryDataWithWordMeta(long docId, long docMeta, WordWithMeta... wordIds) { + @Override + public String toString() { + return "EntryDataWithWordMeta{" + + "docId=" + docId + + ", docMeta=" + docMeta + + ", wordIds=" + Arrays.toString(wordIds) + + '}'; + } + } + public record WordWithMeta(String wordId, byte meta, VarintCodedSequence gcs) { + public WordWithMeta(long wordId, byte meta, VarintCodedSequence gcs) { + this(String.valueOf(wordId), meta, gcs); + } + } + + public static WordWithMeta wm(long wordId, int meta, int... positions) { + return new WordWithMeta(wordId, (byte) meta, VarintCodedSequence.generate(positions)); + } + + public IndexJournalPage createReader(EntryData... 
entries) throws IOException { + Path ji = Files.createTempDirectory(tempDir, "journal"); + + var writer = new IndexJournalSlopWriter(ji, 0); + for (var entry : entries) { + String[] termIds = new String[entry.wordIds.length]; + byte[] meta = new byte[entry.wordIds.length]; + + VarintCodedSequence[] positions = new VarintCodedSequence[entry.wordIds.length]; + for (int i = 0; i < entry.wordIds.length; i++) { + termIds[i] = entry.wordIds[i]; + meta[i] = 0; + positions[i] = VarintCodedSequence.generate(); + } + + writer.put( + entry.docId, + new SlopDocumentRecord.KeywordsProjection( + "test", + -1, + 0, + entry.docMeta, + 15, + Arrays.asList(termIds), + meta, + Arrays.asList(positions), + new byte[0], + List.of() + ) + ); + } + writer.close(); + + return new IndexJournalPage(ji, 0); + } + + public IndexJournalPage createReader(EntryDataWithWordMeta... entries) throws IOException { + Path ji = Files.createTempDirectory(tempDir, "journal"); + + var writer = new IndexJournalSlopWriter(ji, 0); + for (var entry : entries) { + + String[] termIds = new String[entry.wordIds.length]; + byte[] meta = new byte[entry.wordIds.length]; + VarintCodedSequence[] positions = new VarintCodedSequence[entry.wordIds.length]; + for (int i = 0; i < entry.wordIds.length; i++) { + termIds[i] = entry.wordIds[i].wordId; + meta[i] = entry.wordIds[i].meta; + positions[i] = Objects.requireNonNullElseGet(entry.wordIds[i].gcs, VarintCodedSequence::generate); + } + + writer.put( + entry.docId, + new SlopDocumentRecord.KeywordsProjection( + "test", + -1, + 0, + entry.docMeta, + 15, + Arrays.asList(termIds), + meta, + Arrays.asList(positions), + new byte[0], + List.of() + ) + ); + + } + writer.close(); + + return new IndexJournalPage(ji, 0); + } +} diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/TestSegmentData.java b/code/index/index-reverse/test/nu/marginalia/index/construction/full/TestSegmentData.java similarity index 80% rename from code/index/index-reverse/test/nu/marginalia/index/construction/TestSegmentData.java rename to code/index/index-reverse/test/nu/marginalia/index/construction/full/TestSegmentData.java index 574bb61a..d325e029 100644 --- a/code/index/index-reverse/test/nu/marginalia/index/construction/TestSegmentData.java +++ b/code/index/index-reverse/test/nu/marginalia/index/construction/full/TestSegmentData.java @@ -1,9 +1,9 @@ -package nu.marginalia.index.construction; +package nu.marginalia.index.construction.full; import java.util.Arrays; -record TestSegmentData(long wordId, long start, long end, long[] data) { - public TestSegmentData(long wordId, long start, long end) { +record TestSegmentData(String wordId, long start, long end, long[] data) { + public TestSegmentData(String wordId, long start, long end) { this(wordId, start, end, null); } @@ -22,7 +22,7 @@ record TestSegmentData(long wordId, long start, long end, long[] data) { @Override public int hashCode() { - int result = (int) (wordId ^ (wordId >>> 32)); + int result = wordId.hashCode(); result = 31 * result + (int) (start ^ (start >>> 32)); result = 31 * result + (int) (end ^ (end >>> 32)); result = 31 * result + Arrays.hashCode(data); diff --git a/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioDocIdsTransformerTest.java b/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioDocIdsTransformerTest.java new file mode 100644 index 00000000..e4ced16d --- /dev/null +++ b/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioDocIdsTransformerTest.java @@ -0,0 
+1,128 @@
+package nu.marginalia.index.construction.prio;
+
+import nu.marginalia.model.id.UrlIdCodec;
+import nu.marginalia.sequence.io.BitReader;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.channels.FileChannel;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class PrioDocIdsTransformerTest {
+
+    Path inputFile = null;
+    Path outputFile = null;
+
+    @BeforeEach
+    public void setUp() throws IOException {
+        inputFile = Files.createTempFile("input", ".dat");
+        outputFile = Files.createTempFile("output", ".dat");
+    }
+
+    @AfterEach
+    public void tearDown() throws IOException {
+        if (inputFile != null) {
+            Files.deleteIfExists(inputFile);
+        }
+        if (outputFile != null) {
+            Files.deleteIfExists(outputFile);
+        }
+    }
+
+    @Test
+    public void testDomainIdDocOrd() throws IOException {
+
+        try (var writeChannel = (FileChannel) Files.newByteChannel(inputFile, StandardOpenOption.WRITE)) {
+            var buffer = ByteBuffer.allocate(128).order(ByteOrder.LITTLE_ENDIAN);
+
+            buffer.putLong(UrlIdCodec.encodeId(0, 0));
+            buffer.putLong(UrlIdCodec.encodeId(0, 1));
+            buffer.putLong(UrlIdCodec.encodeId(1, 0));
+            buffer.putLong(UrlIdCodec.encodeId(4, 51) | 0x7000_0000_0000_0000L);
+
+            writeChannel.write(buffer.flip());
+        }
+
+        try (var writeChannel = (FileChannel) Files.newByteChannel(outputFile, StandardOpenOption.WRITE);
+             var readChannel = (FileChannel) Files.newByteChannel(inputFile);
+             var transformer = new PrioDocIdsTransformer(writeChannel, readChannel))
+        {
+            // Transform one segment of the input file and write it to the output file with a prefixed size
+
+            transformer.transform(0, 4);
+        }
+
+        byte[] bytes = Files.readAllBytes(outputFile);
+        var buffer = ByteBuffer.wrap(bytes);
+
+        BitReader reader = new BitReader(buffer);
+
+        // read the header
+        {
+            int code = reader.get(2);
+            int size = reader.get(30);
+            assertEquals(3, code);
+            assertEquals(4, size);
+        }
+
+        // read first doc id in parts
+        int rank = reader.get(7);
+        int domainId = reader.get(31);
+        int ordinal = reader.get(26);
+
+        assertEquals(0, rank);
+        assertEquals(0, domainId);
+        assertEquals(0, ordinal);
+
+        {
+            int code = reader.get(2);
+            assertEquals(0, code); // increment doc ordinal
+
+            int dord = reader.getGamma();
+            ordinal += dord;
+
+            assertEquals(1, ordinal);
+        }
+
+        {
+            int code = reader.get(2);
+            assertEquals(1, code); // new domain id within the same rank
+
+            int diffDomainId = reader.getDelta();
+            domainId += diffDomainId;
+            assertEquals(1, domainId);
+
+            int abs_ord = reader.getDelta();
+            ordinal = abs_ord - 1;
+            assertEquals(0, ordinal);
+        }
+
+        {
+            int code = reader.get(2);
+            assertEquals(2, code); // new rank
+
+            int diffRank = reader.getGamma();
+            rank += diffRank;
+            assertEquals(56, rank);
+
+            domainId = reader.get(31);
+            ordinal = reader.get(26);
+
+            assertEquals(4, domainId);
+            assertEquals(51, ordinal);
+        }
+    }
+
+}
\ No newline at end of file
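The test above unpacks the prio index's compressed doc id stream by hand: after a header, each entry leads with a 2-bit code describing how the next doc id relates to the previous one (0: same domain, gamma-coded ordinal delta; 1: new domain under the same rank, delta-coded; 2: new rank, followed by fresh domain and ordinal fields), with the variable-length integers read via BitReader.getGamma()/getDelta(). For reference, here is a minimal, self-contained sketch of Elias gamma coding, the family of universal codes those calls decode; it is an illustration under that assumption, not the repo's actual BitReader/BitWriter implementation.

```java
import java.util.BitSet;

class EliasGammaSketch {
    // Write value (must be >= 1) at bit position pos; returns the next free position.
    // Layout: floor(log2(value)) zero bits, then the value's binary digits, MSB first.
    static int writeGamma(BitSet bits, int pos, int value) {
        int n = 31 - Integer.numberOfLeadingZeros(value); // floor(log2(value))
        pos += n;                                         // n leading zeros (BitSet defaults to 0)
        for (int i = n; i >= 0; i--) {                    // n+1 binary digits of the value
            if ((value & (1 << i)) != 0) bits.set(pos);
            pos++;
        }
        return pos;
    }

    // Read a gamma-coded value at cursor[0], advancing the cursor past it.
    static int readGamma(BitSet bits, int[] cursor) {
        int n = 0;
        while (!bits.get(cursor[0])) { n++; cursor[0]++; } // count the zero prefix
        int value = 0;
        for (int i = 0; i <= n; i++) {                     // read n+1 payload bits
            value = (value << 1) | (bits.get(cursor[0]++) ? 1 : 0);
        }
        return value;
    }

    public static void main(String[] args) {
        BitSet bits = new BitSet();
        int end = writeGamma(bits, 0, 5);            // 5 encodes as 00 101
        int[] cursor = {0};
        System.out.println(readGamma(bits, cursor)); // prints 5
        System.out.println(cursor[0] == end);        // prints true
    }
}
```

Small values are cheap under this scheme (1 costs a single bit), which suits the mostly small deltas between consecutive doc ids in a sorted posting list; the delta code used for the larger domain id jumps applies the same idea recursively by gamma-coding the length field.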
b/code/index/index-reverse/test/nu/marginalia/index/construction/prio/PrioPreindexTest.java @@ -0,0 +1,185 @@ +package nu.marginalia.index.construction.prio; + +import nu.marginalia.array.page.LongQueryBuffer; +import nu.marginalia.hash.MurmurHash3_128; +import nu.marginalia.index.PrioReverseIndexReader; +import nu.marginalia.index.construction.DocIdRewriter; +import nu.marginalia.index.construction.full.TestJournalFactory; +import nu.marginalia.model.id.UrlIdCodec; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.Random; + +import static nu.marginalia.index.construction.full.TestJournalFactory.EntryDataWithWordMeta; +import static nu.marginalia.index.construction.full.TestJournalFactory.wm; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class PrioPreindexTest { + Path countsFile; + Path wordsIdFile; + Path docsFile; + Path tempDir; + Path positionsFile; + + TestJournalFactory journalFactory; + + @BeforeEach + public void setUp() throws IOException { + journalFactory = new TestJournalFactory(); + + countsFile = Files.createTempFile("counts", ".dat"); + wordsIdFile = Files.createTempFile("words", ".dat"); + docsFile = Files.createTempFile("docs", ".dat"); + tempDir = Files.createTempDirectory("sort"); + positionsFile = tempDir.resolve("positions.dat"); + } + + @AfterEach + public void tearDown() throws IOException { + journalFactory.clear(); + + Files.deleteIfExists(countsFile); + Files.deleteIfExists(wordsIdFile); + Files.deleteIfExists(positionsFile); + Files.deleteIfExists(docsFile); + + List contents = new ArrayList<>(); + Files.list(tempDir).forEach(contents::add); + for (var tempFile : contents) { + Files.delete(tempFile); + } + Files.delete(tempDir); + } + + MurmurHash3_128 hash = new MurmurHash3_128(); + long termId(String keyword) { + return hash.hashKeyword(keyword); + } + + @Test + public void testFinalizeSimple() throws IOException { + var journalReader = journalFactory.createReader( + new EntryDataWithWordMeta(100, 101, wm(50, 51)), + new EntryDataWithWordMeta(104, 101, wm(50, 52)), + new EntryDataWithWordMeta(106, 101, wm(50, 52)) + ); + + var preindex = PrioPreindex.constructPreindex(journalReader, DocIdRewriter.identity(), tempDir); + preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat")); + preindex.delete(); + + Path wordsFile = tempDir.resolve("words.dat"); + Path docsFile = tempDir.resolve("docs.dat"); + + assertTrue(Files.exists(wordsFile)); + assertTrue(Files.exists(docsFile)); + + var indexReader = new PrioReverseIndexReader("test", wordsFile, docsFile); + + var entrySource = indexReader.documents(termId("50")); + var lqb = new LongQueryBuffer(32); + entrySource.read(lqb); + + assertEquals(3, lqb.size()); + assertEquals(100, lqb.copyData()[0]); + assertEquals(104, lqb.copyData()[1]); + assertEquals(106, lqb.copyData()[2]); + } + + @Test + public void testFinalizeLargeData() throws IOException { + int rankComponent = 0; + int domainComponent = 0; + int docOrdinal = 0; + var random = new Random(); + long[] documentIds = new long[10000]; + + for (int i = 0; i < documentIds.length; i++) { + int scenario = random.nextInt(0, 3); + + // Avoid going into scenario 3 when we've already reached max rank + // 
instead fall back into scenario 0 as this should be the more common + // of the two + if (rankComponent == 63 && scenario == 2) { + scenario = 0; + } + + if (scenario == 0) { + docOrdinal += random.nextInt(1, 100); + } else if (scenario == 1) { + domainComponent+=random.nextInt(1, 1000); + docOrdinal=random.nextInt(0, 10000); + } else { + rankComponent = Math.min(63, rankComponent + random.nextInt(1, 2)); + domainComponent=random.nextInt(0, 10000); + docOrdinal=random.nextInt(0, 10000); + } + + documentIds[i] = UrlIdCodec.encodeId(rankComponent, domainComponent, docOrdinal); + } + + EntryDataWithWordMeta[] entries = new EntryDataWithWordMeta[documentIds.length]; + for (int i = 0; i < documentIds.length; i++) { + entries[i] = new EntryDataWithWordMeta(documentIds[i], 101, wm(50, 51)); + } + var journalReader = journalFactory.createReader(entries); + + var preindex = PrioPreindex.constructPreindex(journalReader, DocIdRewriter.identity(), tempDir); + preindex.finalizeIndex(tempDir.resolve( "docs.dat"), tempDir.resolve("words.dat")); + preindex.delete(); + + Path wordsFile = tempDir.resolve("words.dat"); + Path docsFile = tempDir.resolve("docs.dat"); + + assertTrue(Files.exists(wordsFile)); + assertTrue(Files.exists(docsFile)); + + var indexReader = new PrioReverseIndexReader("test", wordsFile, docsFile); + + int items = indexReader.numDocuments(termId("50")); + assertEquals(documentIds.length, items); + + var entrySource = indexReader.documents(termId("50")); + var lqb = new LongQueryBuffer(32); + + for (int pos = 0; pos < documentIds.length;) { + if (!entrySource.hasMore()) { + Assertions.fail("Out of data @ " + pos); + } + + entrySource.read(lqb); + + var dataArray = lqb.copyData(); + for (int i = 0; i < lqb.size(); i++) { + + long currValue = dataArray[i]; + + if (documentIds[i + pos] != currValue) { + System.out.println("Mismatch at position " + (i + pos)); + + long prevValue = documentIds[i + pos - 1]; + long expectedValue = documentIds[i + pos]; + + System.out.println("Prev: " + prevValue + " -> " + UrlIdCodec.getRank(prevValue) + " " + UrlIdCodec.getDomainId(prevValue) + " " + UrlIdCodec.getDocumentOrdinal(prevValue)); + System.out.println("Curr: " + currValue + " -> " + UrlIdCodec.getRank(currValue) + " " + UrlIdCodec.getDomainId(currValue) + " " + UrlIdCodec.getDocumentOrdinal(currValue)); + System.out.println("Exp: " + expectedValue + " -> " + UrlIdCodec.getRank(expectedValue) + " " + UrlIdCodec.getDomainId(expectedValue) + " " + UrlIdCodec.getDocumentOrdinal(expectedValue)); + + assertTrue(currValue > prevValue, "Current value is not greater than previous value"); + + Assertions.fail(); + } + } + pos += lqb.size(); + } + + } +} \ No newline at end of file diff --git a/code/index/index-reverse/test/nu/marginalia/test/TestUtil.java b/code/index/index-reverse/test/nu/marginalia/test/TestUtil.java deleted file mode 100644 index 8fbf6b54..00000000 --- a/code/index/index-reverse/test/nu/marginalia/test/TestUtil.java +++ /dev/null @@ -1,43 +0,0 @@ -package nu.marginalia.test; - - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Arrays; - -public class TestUtil { - public static void clearTempDir(Path dir) { - if (Files.isDirectory(dir)) { - for (File f : dir.toFile().listFiles()) { - File[] files = f.listFiles(); - if (files != null) { - Arrays.stream(files).map(File::toPath).forEach(TestUtil::clearTempDir); - } - System.out.println("Deleting " + f + " (" + fileSize(f.toPath()) + ")"); - f.delete(); - } - } - 
System.out.println("Deleting " + dir); - dir.toFile().delete(); - } - - private static String fileSize(Path path) { - try { - long sizeBytes = Files.size(path); - - if (sizeBytes > 1024 * 1024 * 1024) return round(sizeBytes / 1073741824.) + "Gb"; - if (sizeBytes > 1024 * 1024) return round(sizeBytes / 1048576.) + "Mb"; - if (sizeBytes > 1024) return round(sizeBytes / 1024.) + "Kb"; - return sizeBytes + "b"; - } - catch (IOException ex) { - throw new RuntimeException(ex); - } - } - - private static String round(double d) { - return String.format("%.2f", d); - } -} diff --git a/code/index/java/nu/marginalia/index/IndexFactory.java b/code/index/java/nu/marginalia/index/IndexFactory.java index a1d2f5a5..e388793f 100644 --- a/code/index/java/nu/marginalia/index/IndexFactory.java +++ b/code/index/java/nu/marginalia/index/IndexFactory.java @@ -3,12 +3,11 @@ package nu.marginalia.index; import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.IndexLocations; -import nu.marginalia.index.index.CombinedIndexReader; -import nu.marginalia.storage.FileStorageService; import nu.marginalia.index.forward.ForwardIndexFileNames; import nu.marginalia.index.forward.ForwardIndexReader; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import nu.marginalia.index.index.CombinedIndexReader; +import nu.marginalia.index.positions.PositionsFileReader; +import nu.marginalia.storage.FileStorageService; import java.io.IOException; import java.nio.file.Files; @@ -39,16 +38,16 @@ public class IndexFactory { return IndexLocations.getSearchSetsPath(fileStorageService); } - public ReverseIndexReader getReverseIndexReader() throws IOException { - - return new ReverseIndexReader("full", + public FullReverseIndexReader getReverseIndexReader() throws IOException { + return new FullReverseIndexReader("full", ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.CURRENT), - ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.CURRENT) + ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.CURRENT), + new PositionsFileReader(ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.CURRENT)) ); } - public ReverseIndexReader getReverseIndexPrioReader() throws IOException { - return new ReverseIndexReader("prio", + public PrioReverseIndexReader getReverseIndexPrioReader() throws IOException { + return new PrioReverseIndexReader("prio", ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.CURRENT), ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.CURRENT) ); @@ -57,7 +56,8 @@ public class IndexFactory { public ForwardIndexReader getForwardIndexReader() throws IOException { return new ForwardIndexReader( ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.CURRENT), - ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.CURRENT) + ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.CURRENT), + 
ForwardIndexFileNames.resolve(liveStorage, ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.CURRENT) ); } diff --git a/code/index/java/nu/marginalia/index/IndexGrpcService.java b/code/index/java/nu/marginalia/index/IndexGrpcService.java index 1c430014..81172a5b 100644 --- a/code/index/java/nu/marginalia/index/IndexGrpcService.java +++ b/code/index/java/nu/marginalia/index/IndexGrpcService.java @@ -8,23 +8,26 @@ import io.prometheus.client.Gauge; import io.prometheus.client.Histogram; import it.unimi.dsi.fastutil.longs.LongArrayList; import lombok.SneakyThrows; -import nu.marginalia.api.searchquery.*; +import nu.marginalia.api.searchquery.IndexApiGrpc; +import nu.marginalia.api.searchquery.RpcDecoratedResultItem; +import nu.marginalia.api.searchquery.RpcIndexQuery; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CqDataInt; import nu.marginalia.api.searchquery.model.query.SearchSpecification; -import nu.marginalia.api.searchquery.model.results.*; +import nu.marginalia.api.searchquery.model.results.ResultRankingContext; +import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.array.page.LongQueryBuffer; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.model.SearchParameters; import nu.marginalia.index.model.SearchTerms; import nu.marginalia.index.query.IndexQuery; import nu.marginalia.index.query.IndexSearchBudget; -import nu.marginalia.index.results.IndexResultValuatorService; +import nu.marginalia.index.results.IndexResultRankingService; import nu.marginalia.index.results.model.ids.CombinedDocIdList; +import nu.marginalia.index.searchset.SearchSet; import nu.marginalia.index.searchset.SearchSetsService; import nu.marginalia.index.searchset.SmallSearchSet; -import nu.marginalia.index.searchset.SearchSet; import nu.marginalia.service.module.ServiceConfiguration; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -32,7 +35,8 @@ import org.slf4j.Marker; import org.slf4j.MarkerFactory; import java.sql.SQLException; -import java.util.*; +import java.util.BitSet; +import java.util.List; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.Executor; import java.util.concurrent.Executors; @@ -81,7 +85,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { private final StatefulIndex statefulIndex; private final SearchSetsService searchSetsService; - private final IndexResultValuatorService resultValuator; + private final IndexResultRankingService resultValuator; private final String nodeName; @@ -91,7 +95,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { public IndexGrpcService(ServiceConfiguration serviceConfiguration, StatefulIndex statefulIndex, SearchSetsService searchSetsService, - IndexResultValuatorService resultValuator) + IndexResultRankingService resultValuator) { var nodeId = serviceConfiguration.node(); this.nodeName = Integer.toString(nodeId); @@ -110,11 +114,17 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { long endTime = System.currentTimeMillis() + request.getQueryLimits().getTimeoutMs(); - SearchResultSet results = wmsa_query_time + List results = wmsa_query_time .labels(nodeName, "GRPC") .time(() -> { // Perform the search - return executeSearch(params); + try { + return executeSearch(params); + } + catch (Exception ex) { + 
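
An aside on the IndexGrpcService hunks that follow: the aggregate SearchResultSet wrapper is gone, and the service now streams each RpcDecoratedResultItem to the client as it is produced. The underlying grpc-java server-streaming shape, reduced to a sketch with a generic payload:

    import io.grpc.stub.StreamObserver;
    import java.util.List;

    class StreamingSketch {
        /** Emits each result as its own message, then closes the stream. */
        static <T> void streamAll(List<T> results, StreamObserver<T> responseObserver) {
            for (T result : results) {
                responseObserver.onNext(result); // one message per result
            }
            responseObserver.onCompleted();      // no trailing aggregate object needed
        }
    }
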
logger.error("Error in handling request", ex); + return List.of(); + } }); // Prometheus bookkeeping @@ -129,47 +139,8 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { } // Send the results back to the client - for (var result : results.results) { - - var rawResult = result.rawIndexResult; - - var rawItem = RpcRawResultItem.newBuilder(); - rawItem.setCombinedId(rawResult.combinedId); - rawItem.setResultsFromDomain(rawResult.resultsFromDomain); - rawItem.setHtmlFeatures(rawResult.htmlFeatures); - rawItem.setEncodedDocMetadata(rawResult.encodedDocMetadata); - rawItem.setHasPriorityTerms(rawResult.hasPrioTerm); - - for (var score : rawResult.keywordScores) { - rawItem.addKeywordScores( - RpcResultKeywordScore.newBuilder() - .setEncodedWordMetadata(score.encodedWordMetadata()) - .setKeyword(score.keyword) - ); - } - - var decoratedBuilder = RpcDecoratedResultItem.newBuilder() - .setDataHash(result.dataHash) - .setDescription(result.description) - .setFeatures(result.features) - .setFormat(result.format) - .setRankingScore(result.rankingScore) - .setTitle(result.title) - .setUrl(result.url.toString()) - .setUrlQuality(result.urlQuality) - .setWordsTotal(result.wordsTotal) - .setBestPositions(result.bestPositions) - .setRawItem(rawItem); - - var rankingDetails = IndexProtobufCodec.convertRankingDetails(result.rankingDetails); - if (rankingDetails != null) { - decoratedBuilder.setRankingDetails(rankingDetails); - } - - if (result.pubYear != null) { - decoratedBuilder.setPubYear(result.pubYear); - } - responseObserver.onNext(decoratedBuilder.build()); + for (var result : results) { + responseObserver.onNext(result); } responseObserver.onCompleted(); @@ -183,7 +154,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { // exists for test access @SneakyThrows - SearchResultSet justQuery(SearchSpecification specsSet) { + List justQuery(SearchSpecification specsSet) { return executeSearch(new SearchParameters(specsSet, getSearchSet(specsSet))); } @@ -205,11 +176,12 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { return searchSetsService.getSearchSetByName(request.getSearchSetIdentifier()); } - private SearchResultSet executeSearch(SearchParameters params) throws SQLException, InterruptedException { + // accessible for tests + public List executeSearch(SearchParameters params) throws SQLException, InterruptedException { if (!statefulIndex.isLoaded()) { // Short-circuit if the index is not loaded, as we trivially know that there can be no results - return new SearchResultSet(List.of()); + return List.of(); } ResultRankingContext rankingContext = createRankingContext(params.rankingParams, @@ -218,7 +190,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { var queryExecution = new QueryExecution(rankingContext, params.fetchSize); - var ret = queryExecution.run(params); + List ret = queryExecution.run(params); wmsa_index_query_exec_block_time .labels(nodeName) @@ -230,30 +202,69 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { return ret; } + /** This class is responsible for ranking the results and adding the best results to the + * resultHeap, which depending on the state of the indexLookup threads may or may not block + */ + private ResultRankingContext createRankingContext(ResultRankingParameters rankingParams, + CompiledQuery compiledQuery, + CompiledQueryLong compiledQueryIds) + { + + int[] full = new int[compiledQueryIds.size()]; + int[] prio = new int[compiledQueryIds.size()]; + + 
BitSet ngramsMask = new BitSet(compiledQuery.size()); + BitSet regularMask = new BitSet(compiledQuery.size()); + + var currentIndex = statefulIndex.get(); + + for (int idx = 0; idx < compiledQueryIds.size(); idx++) { + long id = compiledQueryIds.at(idx); + full[idx] = currentIndex.numHits(id); + prio[idx] = currentIndex.numHitsPrio(id); + + if (compiledQuery.at(idx).contains("_")) { + ngramsMask.set(idx); + } + else { + regularMask.set(idx); + } + } + + return new ResultRankingContext(currentIndex.totalDocCount(), + rankingParams, + ngramsMask, + regularMask, + new CqDataInt(full), + new CqDataInt(prio)); + } + /** This class is responsible for executing a search query. It uses a thread pool to * execute the subqueries and their valuation in parallel. The results are then combined * into a bounded priority queue, and finally the best results are returned. */ private class QueryExecution { + private static final Executor workerPool = Executors.newWorkStealingPool(indexValuationThreads*4); /** The queue where the results from the index lookup threads are placed, * pending ranking by the result ranker threads */ private final ArrayBlockingQueue resultCandidateQueue = new ArrayBlockingQueue<>(8); - private final ResultPriorityQueue resultHeap; + private final ResultRankingContext resultRankingContext; - private final AtomicInteger remainingIndexTasks = new AtomicInteger(0); - private final AtomicInteger remainingValuationTasks = new AtomicInteger(0); + private final AtomicInteger remainingValuationTasks = new AtomicInteger(0); private final AtomicLong blockTime = new AtomicLong(0); + private final AtomicLong stallTime = new AtomicLong(0); public long getStallTime() { return stallTime.get(); } + public long getBlockTime() { return blockTime.get(); } @@ -264,7 +275,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { } /** Execute a search query */ - public SearchResultSet run(SearchParameters parameters) throws SQLException, InterruptedException { + public List run(SearchParameters parameters) throws SQLException, InterruptedException { var terms = new SearchTerms(parameters.query, parameters.compiledQueryIds); @@ -281,10 +292,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { awaitCompletion(); // Return the best results - return new SearchResultSet( - resultValuator.selectBestResults(parameters, - resultRankingContext, - resultHeap)); + return resultValuator.selectBestResults(parameters, resultRankingContext, resultHeap); } /** Wait for all tasks to complete */ @@ -295,12 +303,12 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { } } } - /** This class is responsible for executing a subquery and adding the results to the * resultCandidateQueue, which depending on the state of the valuator threads may * or may not block */ class IndexLookup implements Runnable { private final IndexQuery query; + private final IndexSearchBudget budget; IndexLookup(IndexQuery query, @@ -315,6 +323,9 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { try { executeSearch(); } + catch (Exception ex) { + logger.error("Error in index lookup", ex); + } finally { synchronized (remainingIndexTasks) { if (remainingIndexTasks.decrementAndGet() == 0) { @@ -325,10 +336,10 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { } private void executeSearch() { - final LongArrayList results = new LongArrayList(512); + final LongArrayList results = new LongArrayList(64); // These queries are different indices for one 
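
The QueryExecution plumbing above couples IndexLookup producers to ResultRanker consumers through the deliberately small ArrayBlockingQueue(8): when ranking falls behind, the queue fills and the lookups block, which is exactly the back-pressure the class comments describe. The pattern stripped to its essentials (names here are illustrative):

    import java.util.concurrent.ArrayBlockingQueue;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;

    class PipelineSketch {
        public static void main(String[] args) throws InterruptedException {
            var queue = new ArrayBlockingQueue<long[]>(8); // small on purpose: bounds memory, adds back-pressure
            ExecutorService pool = Executors.newWorkStealingPool();

            pool.execute(() -> {                  // producer, in the role of IndexLookup
                try {
                    for (int i = 0; i < 100; i++)
                        queue.put(new long[]{i}); // blocks while the queue is full
                } catch (InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            });

            int consumed = 0;
            while (consumed < 100) {              // consumer, in the role of ResultRanker
                long[] batch = queue.take();      // blocks while the queue is empty
                consumed += batch.length;
            }
            pool.shutdown();
        }
    }
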
subquery - final LongQueryBuffer buffer = new LongQueryBuffer(512); + final LongQueryBuffer buffer = new LongQueryBuffer(64); while (query.hasMore() && budget.hasTimeLeft()) { @@ -339,7 +350,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { results.add(buffer.data.get(i)); } - if (results.size() < 512) { + if (results.size() >= 64) { enqueueResults(new CombinedDocIdList(results)); results.clear(); } @@ -366,13 +377,11 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { logger.warn("Interrupted while waiting to offer resultIds to queue", e); } } - } - /** This class is responsible for ranking the results and adding the best results to the - * resultHeap, which depending on the state of the indexLookup threads may or may not block - */ + } class ResultRanker implements Runnable { private final SearchParameters parameters; + private final ResultRankingContext rankingContext; ResultRanker(SearchParameters parameters, ResultRankingContext rankingContext) { @@ -415,49 +424,16 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { stallTime.addAndGet(System.currentTimeMillis() - start); resultHeap.addAll( - resultValuator.rankResults(parameters, rankingContext, resultIds) + resultValuator.rankResults(parameters, false, rankingContext, resultIds) ); } return true; // keep going } + } } - private ResultRankingContext createRankingContext(ResultRankingParameters rankingParams, - CompiledQuery compiledQuery, - CompiledQueryLong compiledQueryIds) - { - - int[] full = new int[compiledQueryIds.size()]; - int[] prio = new int[compiledQueryIds.size()]; - - BitSet ngramsMask = new BitSet(compiledQuery.size()); - BitSet regularMask = new BitSet(compiledQuery.size()); - - var currentIndex = statefulIndex.get(); - - for (int idx = 0; idx < compiledQueryIds.size(); idx++) { - long id = compiledQueryIds.at(idx); - full[idx] = currentIndex.numHits(id); - prio[idx] = currentIndex.numHitsPrio(id); - - if (compiledQuery.at(idx).contains("_")) { - ngramsMask.set(idx); - } - else { - regularMask.set(idx); - } - } - - return new ResultRankingContext(currentIndex.totalDocCount(), - rankingParams, - ngramsMask, - regularMask, - new CqDataInt(full), - new CqDataInt(prio)); - } - } diff --git a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java index afc52094..216192cf 100644 --- a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java +++ b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java @@ -5,8 +5,10 @@ import it.unimi.dsi.fastutil.longs.LongList; import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import it.unimi.dsi.fastutil.longs.LongSet; import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; -import nu.marginalia.index.ReverseIndexReader; +import nu.marginalia.index.FullReverseIndexReader; +import nu.marginalia.index.PrioReverseIndexReader; import nu.marginalia.index.forward.ForwardIndexReader; +import nu.marginalia.index.forward.spans.DocumentSpans; import nu.marginalia.index.model.QueryParams; import nu.marginalia.index.model.SearchTerms; import nu.marginalia.index.query.IndexQuery; @@ -14,12 +16,13 @@ import nu.marginalia.index.query.IndexQueryBuilder; import nu.marginalia.index.query.filter.QueryFilterStepIf; import nu.marginalia.index.query.limit.SpecificationLimitType; import nu.marginalia.index.results.model.ids.CombinedDocIdList; -import nu.marginalia.index.results.model.ids.DocMetadataList; +import 
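
Worth pausing on the flipped comparison in the IndexLookup hunk above: the old guard results.size() < 512 enqueued a batch on nearly every pass, tiny batches included, while the new results.size() >= 64 only flushes once a full batch has accumulated (the remainder is presumably drained after the loop, outside this excerpt). The batching pattern in isolation:

    import java.util.ArrayList;
    import java.util.List;

    class BatchingSketch {
        static final int BATCH_SIZE = 64;

        static void drain(Iterable<Long> source, List<List<Long>> sink) {
            List<Long> batch = new ArrayList<>(BATCH_SIZE);
            for (long value : source) {
                batch.add(value);
                if (batch.size() >= BATCH_SIZE) { // flush full batches as they fill...
                    sink.add(List.copyOf(batch));
                    batch.clear();
                }
            }
            if (!batch.isEmpty())                 // ...and drain whatever is left at the end
                sink.add(List.copyOf(batch));
        }
    }
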
nu.marginalia.index.results.model.ids.TermMetadataList; import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.idx.DocumentMetadata; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.lang.foreign.Arena; import java.time.Duration; import java.util.ArrayList; import java.util.Collections; @@ -37,30 +40,25 @@ public class CombinedIndexReader { private final Logger logger = LoggerFactory.getLogger(getClass()); private final ForwardIndexReader forwardIndexReader; - private final ReverseIndexReader reverseIndexFullReader; - private final ReverseIndexReader reverseIndexPriorityReader; + private final FullReverseIndexReader reverseIndexFullReader; + private final PrioReverseIndexReader reverseIndexPriorityReader; public CombinedIndexReader(ForwardIndexReader forwardIndexReader, - ReverseIndexReader reverseIndexFullReader, - ReverseIndexReader reverseIndexPriorityReader) { + FullReverseIndexReader reverseIndexFullReader, + PrioReverseIndexReader reverseIndexPriorityReader) { this.forwardIndexReader = forwardIndexReader; this.reverseIndexFullReader = reverseIndexFullReader; this.reverseIndexPriorityReader = reverseIndexPriorityReader; } public IndexQueryBuilderImpl newQueryBuilder(IndexQuery query) { - return new IndexQueryBuilderImpl(reverseIndexFullReader, reverseIndexPriorityReader, query); + return new IndexQueryBuilderImpl(reverseIndexFullReader, query); } public QueryFilterStepIf hasWordFull(long termId) { return reverseIndexFullReader.also(termId); } - public QueryFilterStepIf hasWordPrio(long termId) { - return reverseIndexPriorityReader.also(termId); - } - - /** Creates a query builder for terms in the priority index */ public IndexQueryBuilder findPriorityWord(long wordId) { return newQueryBuilder(new IndexQuery(reverseIndexPriorityReader.documents(wordId))) @@ -113,17 +111,28 @@ public class CombinedIndexReader { return 0; }); - var head = findFullWord(elements.getLong(0)); - for (int i = 1; i < elements.size(); i++) { - head.addInclusionFilter(hasWordFull(elements.getLong(i))); + if (!SearchTerms.stopWords.contains(elements.getLong(0))) { + var head = findFullWord(elements.getLong(0)); + + for (int i = 1; i < elements.size(); i++) { + long termId = elements.getLong(i); + + // if a stop word is present in the query, skip the step of requiring it to be in the document, + // we'll assume it's there and save IO + if (SearchTerms.stopWords.contains(termId)) { + continue; + } + + head.addInclusionFilter(hasWordFull(termId)); + } + queryHeads.add(head); } - queryHeads.add(head); // If there are few paths, we can afford to check the priority index as well if (paths.size() < 4) { var prioHead = findPriorityWord(elements.getLong(0)); for (int i = 1; i < elements.size(); i++) { - prioHead.addInclusionFilter(hasWordPrio(elements.getLong(i))); + prioHead.addInclusionFilter(hasWordFull(elements.getLong(i))); } queryHeads.add(prioHead); } @@ -169,8 +178,11 @@ public class CombinedIndexReader { } /** Retrieves the term metadata for the specified word for the provided documents */ - public DocMetadataList getMetadata(long wordId, CombinedDocIdList docIds) { - return new DocMetadataList(reverseIndexFullReader.getTermMeta(wordId, docIds.array())); + public TermMetadataList getTermMetadata(Arena arena, + long wordId, + CombinedDocIdList docIds) + { + return new TermMetadataList(reverseIndexFullReader.getTermData(arena, wordId, docIds.array())); } /** Retrieves the document metadata for the specified document */ @@ -188,6 +200,16 @@ public class CombinedIndexReader { return 
forwardIndexReader.getHtmlFeatures(docId);
    }

+    /** Retrieves the size of the specified document */
+    public int getDocumentSize(long docId) {
+        return forwardIndexReader.getDocumentSize(docId);
+    }
+
+    /** Retrieves the document spans for the specified document */
+    public DocumentSpans getDocumentSpans(Arena arena, long docId) {
+        return forwardIndexReader.getDocumentSpans(arena, docId);
+    }
+
    /** Close the indexes (this is not done immediately)
     * */
    public void close() throws InterruptedException {
diff --git a/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java b/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java
index 0f63fdbc..abdbc836 100644
--- a/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java
+++ b/code/index/java/nu/marginalia/index/index/IndexQueryBuilderImpl.java
@@ -2,7 +2,7 @@ package nu.marginalia.index.index;

 import java.util.List;
 import gnu.trove.set.hash.TLongHashSet;
-import nu.marginalia.index.ReverseIndexReader;
+import nu.marginalia.index.FullReverseIndexReader;
 import nu.marginalia.index.query.IndexQuery;
 import nu.marginalia.index.query.IndexQueryBuilder;
 import nu.marginalia.index.query.filter.QueryFilterAnyOf;
@@ -10,8 +10,7 @@ import nu.marginalia.index.query.filter.QueryFilterStepIf;

 public class IndexQueryBuilderImpl implements IndexQueryBuilder {
     private final IndexQuery query;
-    private final ReverseIndexReader reverseIndexFullReader;
-    private final ReverseIndexReader reverseIndexPrioReader;
+    private final FullReverseIndexReader reverseIndexFullReader;

     /* Keep track of already added include terms to avoid redundant checks.
      *
@@ -21,13 +20,10 @@ public class IndexQueryBuilderImpl implements IndexQueryBuilder {
      * */
     private final TLongHashSet alreadyConsideredTerms = new TLongHashSet();

-    IndexQueryBuilderImpl(ReverseIndexReader reverseIndexFullReader,
-                          ReverseIndexReader reverseIndexPrioReader,
-                          IndexQuery query)
+    IndexQueryBuilderImpl(FullReverseIndexReader reverseIndexFullReader, IndexQuery query)
    {
        this.query = query;
        this.reverseIndexFullReader = reverseIndexFullReader;
-        this.reverseIndexPrioReader = reverseIndexPrioReader;
    }

    public IndexQueryBuilder withSourceTerms(long... sourceTerms) {
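
Related to the CombinedIndexReader hunk earlier: for terms found in SearchTerms.stopWords (defined in the SearchTerms.java hunk below), both the query head and the inclusion filter are skipped, since a filter on "a" or "the" would pass for nearly every document anyway and the lookup IO can simply be saved. The gist, as a sketch with hypothetical helper names:

    import it.unimi.dsi.fastutil.longs.LongArraySet;
    import it.unimi.dsi.fastutil.longs.LongList;
    import java.util.ArrayList;
    import java.util.List;

    class StopWordSkipSketch {
        /** Returns the term ids that still warrant an index filter. */
        static List<Long> inclusionFilters(LongList termIds, LongArraySet stopWords) {
            List<Long> filters = new ArrayList<>();
            for (long termId : termIds) {
                if (stopWords.contains(termId))
                    continue;        // assume the word is present in the document; save the IO
                filters.add(termId); // in the real code: head.addInclusionFilter(hasWordFull(termId))
            }
            return filters;
        }
    }
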
diff --git a/code/index/java/nu/marginalia/index/index/StatefulIndex.java b/code/index/java/nu/marginalia/index/index/StatefulIndex.java
index 7da5f74b..41c398bf 100644
--- a/code/index/java/nu/marginalia/index/index/StatefulIndex.java
+++ b/code/index/java/nu/marginalia/index/index/StatefulIndex.java
@@ -90,7 +90,7 @@ public class StatefulIndex {
         return combinedIndexReader != null;
     }

-    /** Stronger version of isAvailable() that also checks that the index is loaded */
+    /** Stronger check than isAvailable() that also verifies the index is loaded */
     public boolean isLoaded() {
         return combinedIndexReader != null && combinedIndexReader.isLoaded();
     }
diff --git a/code/index/java/nu/marginalia/index/model/SearchTerms.java b/code/index/java/nu/marginalia/index/model/SearchTerms.java
index 8115c109..2a475754 100644
--- a/code/index/java/nu/marginalia/index/model/SearchTerms.java
+++ b/code/index/java/nu/marginalia/index/model/SearchTerms.java
@@ -1,21 +1,26 @@
 package nu.marginalia.index.model;

 import it.unimi.dsi.fastutil.longs.LongArrayList;
+import it.unimi.dsi.fastutil.longs.LongArraySet;
 import it.unimi.dsi.fastutil.longs.LongComparator;
 import it.unimi.dsi.fastutil.longs.LongList;
 import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
 import nu.marginalia.api.searchquery.model.query.SearchQuery;

-import java.util.ArrayList;
-import java.util.List;
-
 import static nu.marginalia.index.model.SearchTermsUtil.getWordId;

 public final class SearchTerms {
     private final LongList advice;
     private final LongList excludes;
     private final LongList priority;
-    private final List<LongList> coherences;
+
+    public static final LongArraySet stopWords = new LongArraySet(
+            new long[] {
+                    getWordId("a"),
+                    getWordId("an"),
+                    getWordId("the"),
+            }
+    );

     private final CompiledQueryLong compiledQueryIds;

@@ -24,7 +29,7 @@ public final class SearchTerms
     {
         this.excludes = new LongArrayList();
         this.priority = new LongArrayList();
-        this.coherences = new ArrayList<>();
+
         this.advice = new LongArrayList();

         this.compiledQueryIds = compiledQueryIds;

@@ -32,16 +37,6 @@ public final class SearchTerms {
             advice.add(getWordId(word));
         }

-        for (var coherence : query.searchTermCoherences) {
-            LongList parts = new LongArrayList(coherence.size());
-
-            for (var word : coherence) {
-                parts.add(getWordId(word));
-            }
-
-            coherences.add(parts);
-        }
-
         for (var word : query.searchTermsExclude) {
             excludes.add(getWordId(word));
         }
@@ -72,10 +67,6 @@ public final class SearchTerms {
         return priority;
     }

-    public List<LongList> coherences() {
-        return coherences;
-    }
-
     public CompiledQueryLong compiledQuery() {
         return compiledQueryIds;
     }
 }
diff --git a/code/index/java/nu/marginalia/index/results/Bm25GraphVisitor.java b/code/index/java/nu/marginalia/index/results/Bm25GraphVisitor.java
new file mode 100644
index 00000000..95b665ff
--- /dev/null
+++ b/code/index/java/nu/marginalia/index/results/Bm25GraphVisitor.java
@@ -0,0 +1,93 @@
+package nu.marginalia.index.results;
+
+import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
+import nu.marginalia.api.searchquery.model.compiled.CqExpression;
+import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
+import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
+
+import java.util.BitSet;
+import java.util.List;
+
+/** Visitor for calculating the best BM25 score for a graph representing a search query
+ */
+public class Bm25GraphVisitor implements CqExpression.DoubleVisitor {
+    private static final long AVG_LENGTH = 5000;
+
+    private final float[] counts;
+
private final CqDataInt frequencies; + + private final double k1; + private final double b; + + private final int docCount; + private final int length; + + private final BitSet mask; + + public Bm25GraphVisitor(Bm25Parameters bm25Parameters, + float[] counts, + int length, + ResultRankingContext ctx) { + this.length = length; + + this.k1 = bm25Parameters.k(); + this.b = bm25Parameters.b(); + + this.docCount = ctx.termFreqDocCount(); + this.counts = counts; + this.frequencies = ctx.fullCounts; + this.mask = ctx.regularMask; + } + + @Override + public double onAnd(List parts) { + double value = 0; + + for (var part : parts) { + value += part.visit(this); + } + + return value; + } + + @Override + public double onOr(List parts) { + double value = 0; + for (var part : parts) { + value = Math.max(value, part.visit(this)); + } + return value; + } + + @Override + public double onLeaf(int idx) { + if (!mask.get(idx)) { + return 0; + } + + double count = counts[idx]; + int freq = frequencies.get(idx); + + return invFreq(docCount, freq) * f(count, length); + } + + /** + * + * @param docCount Number of documents + * @param freq Number of matching documents + */ + private double invFreq(int docCount, int freq) { + return Math.log(1.0 + (docCount - freq + 0.5) / (freq + 0.5)); + } + + /** + * + * @param count number of occurrences in the document + * @param length document length + */ + private double f(double count, int length) { + final double lengthRatio = (double) length / AVG_LENGTH; + + return (count * (k1 + 1)) / (count + k1 * (1 - b + b * lengthRatio)); + } +} diff --git a/code/index/java/nu/marginalia/index/results/IndexMetadataService.java b/code/index/java/nu/marginalia/index/results/IndexMetadataService.java deleted file mode 100644 index d068c0f4..00000000 --- a/code/index/java/nu/marginalia/index/results/IndexMetadataService.java +++ /dev/null @@ -1,91 +0,0 @@ -package nu.marginalia.index.results; - -import com.google.inject.Inject; -import gnu.trove.map.hash.TObjectLongHashMap; -import it.unimi.dsi.fastutil.longs.Long2ObjectArrayMap; -import it.unimi.dsi.fastutil.longs.LongArrayList; -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; -import nu.marginalia.api.searchquery.model.query.SearchQuery; -import nu.marginalia.index.index.StatefulIndex; -import nu.marginalia.index.model.SearchTermsUtil; -import nu.marginalia.index.results.model.QuerySearchTerms; -import nu.marginalia.index.results.model.TermCoherenceGroupList; -import nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds; -import nu.marginalia.index.results.model.ids.CombinedDocIdList; -import nu.marginalia.index.results.model.ids.TermIdList; - -import static nu.marginalia.index.results.model.TermCoherenceGroupList.TermCoherenceGroup; -import static nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds.DocumentsWithMetadata; - -public class IndexMetadataService { - private final StatefulIndex statefulIndex; - - @Inject - public IndexMetadataService(StatefulIndex index) { - this.statefulIndex = index; - } - - public TermMetadataForCombinedDocumentIds getTermMetadataForDocuments(CombinedDocIdList combinedIdsAll, - TermIdList termIdsList) - { - var currentIndex = statefulIndex.get(); - - Long2ObjectArrayMap termdocToMeta = - new Long2ObjectArrayMap<>(termIdsList.size()); - - for (long termId : termIdsList.array()) { - var metadata = currentIndex.getMetadata(termId, combinedIdsAll); - - termdocToMeta.put(termId, - new DocumentsWithMetadata(combinedIdsAll, metadata)); - } - - return new 
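
To make the arithmetic in Bm25GraphVisitor concrete: with the conventional k1 = 1.2 and b = 0.75 (assumed here; the real values come from Bm25Parameters), a corpus of 1,000,000 documents, a term matching 10,000 of them, and 3 occurrences in a document of exactly AVG_LENGTH words, the leaf score works out to about 7.2:

    class Bm25Demo {
        static final double AVG_LENGTH = 5000;

        public static void main(String[] args) {
            double k1 = 1.2, b = 0.75; // assumed; supplied by Bm25Parameters in the real code
            int docCount = 1_000_000;  // documents in the corpus
            int freq = 10_000;         // documents containing the term
            double count = 3;          // occurrences within this document
            int length = 5_000;        // this document's length

            double invFreq = Math.log(1.0 + (docCount - freq + 0.5) / (freq + 0.5));  // ~4.61
            double lengthRatio = (double) length / AVG_LENGTH;                        // 1.0 here
            double f = (count * (k1 + 1)) / (count + k1 * (1 - b + b * lengthRatio)); // ~1.57
            System.out.println(invFreq * f); // ~7.24 for this term
        }
    }
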
TermMetadataForCombinedDocumentIds(termdocToMeta); - } - - public QuerySearchTerms getSearchTerms(CompiledQuery compiledQuery, SearchQuery searchQuery) { - - LongArrayList termIdsList = new LongArrayList(); - LongArrayList termIdsPrio = new LongArrayList(); - - TObjectLongHashMap termToId = new TObjectLongHashMap<>(10, 0.75f, -1); - - for (String word : compiledQuery) { - long id = SearchTermsUtil.getWordId(word); - termIdsList.add(id); - termToId.put(word, id); - } - - for (var term : searchQuery.searchTermsAdvice) { - if (termToId.containsKey(term)) { - continue; - } - - long id = SearchTermsUtil.getWordId(term); - termIdsList.add(id); - termToId.put(term, id); - } - - for (var term : searchQuery.searchTermsPriority) { - if (termToId.containsKey(term)) { - long id = SearchTermsUtil.getWordId(term); - termIdsPrio.add(id); - } - else { - long id = SearchTermsUtil.getWordId(term); - termIdsList.add(id); - termIdsPrio.add(id); - termToId.put(term, id); - } - } - - return new QuerySearchTerms(termToId, - new TermIdList(termIdsList), - new TermIdList(termIdsPrio), - new TermCoherenceGroupList( - searchQuery.searchTermCoherences.stream().map(TermCoherenceGroup::new).toList() - ) - ); - } - -} diff --git a/code/index/java/nu/marginalia/index/results/IndexResultRankingService.java b/code/index/java/nu/marginalia/index/results/IndexResultRankingService.java new file mode 100644 index 00000000..f477b437 --- /dev/null +++ b/code/index/java/nu/marginalia/index/results/IndexResultRankingService.java @@ -0,0 +1,325 @@ +package nu.marginalia.index.results; + +import com.google.inject.Inject; +import com.google.inject.Singleton; +import gnu.trove.list.TLongList; +import gnu.trove.list.array.TLongArrayList; +import gnu.trove.map.hash.TObjectLongHashMap; +import it.unimi.dsi.fastutil.longs.LongArrayList; +import it.unimi.dsi.fastutil.longs.LongOpenHashSet; +import nu.marginalia.api.searchquery.*; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CqDataLong; +import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint; +import nu.marginalia.api.searchquery.model.query.SearchQuery; +import nu.marginalia.api.searchquery.model.results.ResultRankingContext; +import nu.marginalia.api.searchquery.model.results.SearchResultItem; +import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors; +import nu.marginalia.index.index.CombinedIndexReader; +import nu.marginalia.index.index.StatefulIndex; +import nu.marginalia.index.model.SearchParameters; +import nu.marginalia.index.model.SearchTermsUtil; +import nu.marginalia.index.results.model.PhraseConstraintGroupList; +import nu.marginalia.index.results.model.QuerySearchTerms; +import nu.marginalia.index.results.model.ids.CombinedDocIdList; +import nu.marginalia.index.results.model.ids.TermIdList; +import nu.marginalia.index.results.model.ids.TermMetadataList; +import nu.marginalia.linkdb.docs.DocumentDbReader; +import nu.marginalia.linkdb.model.DocdbUrlDetail; +import nu.marginalia.sequence.CodedSequence; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.lang.foreign.Arena; +import java.sql.SQLException; +import java.util.*; + +@Singleton +public class IndexResultRankingService { + private static final Logger logger = LoggerFactory.getLogger(IndexResultRankingService.class); + + private final DocumentDbReader documentDbReader; + private final StatefulIndex statefulIndex; + + @Inject + public IndexResultRankingService(DocumentDbReader 
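
A note on the Arena used by rankResults below: a confined arena (java.lang.foreign, a final API as of JDK 22) ties every allocation to one thread and one lexical scope, so the decoded position data all disappears in a single deterministic step when the try-with-resources closes, rather than aging through the garbage collector. The pattern in isolation:

    import java.lang.foreign.Arena;
    import java.lang.foreign.MemorySegment;
    import java.lang.foreign.ValueLayout;

    class ArenaSketch {
        public static void main(String[] args) {
            try (Arena arena = Arena.ofConfined()) { // single-owner, deterministic lifetime
                MemorySegment positions = arena.allocate(ValueLayout.JAVA_LONG, 1024);
                positions.setAtIndex(ValueLayout.JAVA_LONG, 0, 42L);
                // ... use the segment for the duration of the calculation ...
            } // everything allocated from the arena is freed here, in one step
        }
    }
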
documentDbReader, + StatefulIndex statefulIndex) + { + this.documentDbReader = documentDbReader; + this.statefulIndex = statefulIndex; + } + + public List rankResults(SearchParameters params, + boolean exportDebugData, + ResultRankingContext rankingContext, + CombinedDocIdList resultIds) + { + IndexResultScoreCalculator resultRanker = new IndexResultScoreCalculator(statefulIndex, rankingContext, params); + + List results = new ArrayList<>(resultIds.size()); + + // Get the current index reader, which is the one we'll use for this calculation, + // this may change during the calculation, but we don't want to switch over mid-calculation + final CombinedIndexReader currentIndex = statefulIndex.get(); + + final QuerySearchTerms searchTerms = getSearchTerms(params.compiledQuery, params.query); + final int termCount = searchTerms.termIdsAll.size(); + + // We use an arena for the position data to avoid gc pressure + // from the gamma coded sequences, which can be large and have a lifetime + // that matches the try block here + try (var arena = Arena.ofConfined()) { + + TermMetadataList[] termsForDocs = new TermMetadataList[termCount]; + for (int ti = 0; ti < termCount; ti++) { + termsForDocs[ti] = currentIndex.getTermMetadata(arena, searchTerms.termIdsAll.at(ti), resultIds); + } + + // Data for the document. We arrange this in arrays outside the calculation function to avoid + // hash lookups in the inner loop, as it's hot code, and we don't want unnecessary cpu cache + // thrashing in there; out here we can rely on implicit array ordering to match up the data. + + long[] flags = new long[termCount]; + CodedSequence[] positions = new CodedSequence[termCount]; + + // Iterate over documents by their index in the combinedDocIds, as we need the index for the + // term data arrays as well + + for (int i = 0; i < resultIds.size(); i++) { + + // Prepare term-level data for the document + for (int ti = 0; ti < flags.length; ti++) { + var tfd = termsForDocs[ti]; + + assert tfd != null : "No term data for term " + ti; + + flags[ti] = tfd.flag(i); + positions[ti] = tfd.position(i); + } + + // Ignore documents that don't match the mandatory constraints + if (!searchTerms.phraseConstraints.testMandatory(positions)) { + continue; + } + + if (!exportDebugData) { + var score = resultRanker.calculateScore(arena, null, resultIds.at(i), searchTerms, flags, positions); + if (score != null) { + results.add(score); + } + } + else { + var rankingFactors = new DebugRankingFactors(); + var score = resultRanker.calculateScore(arena, rankingFactors, resultIds.at(i), searchTerms, flags, positions); + if (score != null) { + score.debugRankingFactors = rankingFactors; + results.add(score); + } + } + } + + return results; + } + } + + + public List selectBestResults(SearchParameters params, + ResultRankingContext resultRankingContext, + Collection results) throws SQLException { + + var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain); + + List resultsList = new ArrayList<>(results.size()); + TLongList idsList = new TLongArrayList(params.limitTotal); + + for (var item : results) { + if (domainCountFilter.test(item)) { + + if (resultsList.size() < params.limitTotal) { + resultsList.add(item); + idsList.add(item.getDocumentId()); + } + // + // else { break; } <-- don't add this even though it looks like it should be present! + // + // It's important that this filter runs across all results, not just the top N, + // so we shouldn't break the loop in a putative else-case here! 
+            //
+
+            }
+        }
+
+        // If we're exporting debug data from the ranking, we need to re-run the ranking calculation
+        // for the selected results, as this would be comically expensive to do for all the results we
+        // discard along the way
+
+        if (params.rankingParams.exportDebugData) {
+            var combinedIdsList = new LongArrayList(resultsList.size());
+            for (var item : resultsList) {
+                combinedIdsList.add(item.combinedId);
+            }
+
+            resultsList.clear();
+            resultsList.addAll(this.rankResults(
+                    params,
+                    true,
+                    resultRankingContext,
+                    new CombinedDocIdList(combinedIdsList))
+            );
+        }
+
+        // Fetch the document details for the selected results in one go, from the local document database
+        // for this index partition
+        Map<Long, DocdbUrlDetail> detailsById = new HashMap<>(idsList.size());
+        for (var item : documentDbReader.getUrlDetails(idsList)) {
+            detailsById.put(item.urlId(), item);
+        }
+
+        List<RpcDecoratedResultItem> resultItems = new ArrayList<>(resultsList.size());
+        LongOpenHashSet seenDocumentHashes = new LongOpenHashSet(resultsList.size());
+
+        // Decorate the results with the document details
+        for (SearchResultItem result : resultsList) {
+            final long id = result.getDocumentId();
+            final DocdbUrlDetail docData = detailsById.get(id);
+
+            if (docData == null) {
+                logger.warn("No document data for id {}", id);
+                continue;
+            }
+
+            // Filter out duplicates by content
+            if (!seenDocumentHashes.add(docData.dataHash())) {
+                continue;
+            }
+
+            var rawItem = RpcRawResultItem.newBuilder();
+
+            rawItem.setCombinedId(result.combinedId);
+            rawItem.setHtmlFeatures(result.htmlFeatures);
+            rawItem.setEncodedDocMetadata(result.encodedDocMetadata);
+            rawItem.setHasPriorityTerms(result.hasPrioTerm);
+
+            for (var score : result.keywordScores) {
+                rawItem.addKeywordScores(
+                        RpcResultKeywordScore.newBuilder()
+                                .setFlags(score.flags)
+                                .setPositions(score.positionCount)
+                                .setKeyword(score.keyword)
+                );
+            }
+
+            var decoratedBuilder = RpcDecoratedResultItem.newBuilder()
+                    .setDataHash(docData.dataHash())
+                    .setDescription(docData.description())
+                    .setFeatures(docData.features())
+                    .setFormat(docData.format())
+                    .setRankingScore(result.getScore())
+                    .setTitle(docData.title())
+                    .setUrl(docData.url().toString())
+                    .setUrlQuality(docData.urlQuality())
+                    .setWordsTotal(docData.wordsTotal())
+                    .setBestPositions(result.getBestPositions())
+                    .setResultsFromDomain(domainCountFilter.getCount(result))
+                    .setRawItem(rawItem);
+
+            if (docData.pubYear() != null) {
+                decoratedBuilder.setPubYear(docData.pubYear());
+            }
+
+            if (result.debugRankingFactors != null) {
+                var debugFactors = result.debugRankingFactors;
+                var detailsBuilder = RpcResultRankingDetails.newBuilder();
+                var documentOutputs = RpcResultDocumentRankingOutputs.newBuilder();
+
+                for (var factor : debugFactors.getDocumentFactors()) {
+                    documentOutputs.addFactor(factor.factor());
+                    documentOutputs.addValue(factor.value());
+                }
+
+                detailsBuilder.setDocumentOutputs(documentOutputs);
+
+                var termOutputs = RpcResultTermRankingOutputs.newBuilder();
+
+                CqDataLong termIds = params.compiledQueryIds.data;
+
+                for (var entry : debugFactors.getTermFactors()) {
+                    String term = "[ERROR IN LOOKUP]";
+
+                    // CURSED: This is a linear search, but the number of terms is small, and it's in a debug path
+                    for (int i = 0; i < termIds.size(); i++) {
+                        if (termIds.get(i) == entry.termId()) {
+                            term = params.compiledQuery.at(i);
+                            break;
+                        }
+                    }
+
+                    termOutputs
+                            .addTermId(entry.termId())
+                            .addTerm(term)
+                            .addFactor(entry.factor())
+                            .addValue(entry.value());
+                }
+
+                detailsBuilder.setTermOutputs(termOutputs);
+
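
The "don't add this" comment above deserves a gloss: setResultsFromDomain(domainCountFilter.getCount(result)) reports how many results each domain contributed in total, so the deduplicator has to observe every candidate, including the ones past limitTotal; breaking out early would silently under-count domains whose extra hits sort late. A toy version of such a counting filter (hypothetical names, keyed on a bare domain id):

    import java.util.HashMap;
    import java.util.Map;

    class DomainDedupSketch {
        private final Map<Integer, Integer> countByDomain = new HashMap<>();
        private final int limitByDomain;

        DomainDedupSketch(int limitByDomain) {
            this.limitByDomain = limitByDomain;
        }

        /** Always counts the hit; returns true only while the domain is under its cap. */
        boolean test(int domainId) {
            int seen = countByDomain.merge(domainId, 1, Integer::sum);
            return seen <= limitByDomain;
        }

        int getCount(int domainId) {
            return countByDomain.getOrDefault(domainId, 0);
        }
    }
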
decoratedBuilder.setRankingDetails(detailsBuilder); + } + + resultItems.add(decoratedBuilder.build()); + } + + return resultItems; + } + + + public QuerySearchTerms getSearchTerms(CompiledQuery compiledQuery, SearchQuery searchQuery) { + + LongArrayList termIdsList = new LongArrayList(); + + TObjectLongHashMap termToId = new TObjectLongHashMap<>(10, 0.75f, -1); + + for (String word : compiledQuery) { + long id = SearchTermsUtil.getWordId(word); + termIdsList.add(id); + termToId.put(word, id); + } + + for (var term : searchQuery.searchTermsPriority) { + if (termToId.containsKey(term)) { + continue; + } + + long id = SearchTermsUtil.getWordId(term); + termIdsList.add(id); + termToId.put(term, id); + } + + var idsAll = new TermIdList(termIdsList); + + var constraintsMandatory = new ArrayList(); + var constraintsFull = new ArrayList(); + var constraintsOptional = new ArrayList(); + + for (var constraint : searchQuery.phraseConstraints) { + switch (constraint) { + case SearchPhraseConstraint.Mandatory(List terms) -> + constraintsMandatory.add(new PhraseConstraintGroupList.PhraseConstraintGroup(terms, idsAll)); + case SearchPhraseConstraint.Optional(List terms) -> + constraintsOptional.add(new PhraseConstraintGroupList.PhraseConstraintGroup(terms, idsAll)); + case SearchPhraseConstraint.Full(List terms) -> + constraintsFull.add(new PhraseConstraintGroupList.PhraseConstraintGroup(terms, idsAll)); + } + } + + if (constraintsFull.isEmpty()) { + logger.warn("No full constraints in query, adding empty group"); + constraintsFull.add(new PhraseConstraintGroupList.PhraseConstraintGroup(List.of(), idsAll)); + } + + + return new QuerySearchTerms(termToId, + idsAll, + new PhraseConstraintGroupList(constraintsFull.getFirst(), constraintsMandatory, constraintsOptional) + ); + } +} diff --git a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java new file mode 100644 index 00000000..105eb3e0 --- /dev/null +++ b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java @@ -0,0 +1,548 @@ +package nu.marginalia.index.results; + +import it.unimi.dsi.fastutil.ints.IntIterator; +import it.unimi.dsi.fastutil.ints.IntList; +import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; +import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.results.ResultRankingContext; +import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; +import nu.marginalia.api.searchquery.model.results.SearchResultItem; +import nu.marginalia.api.searchquery.model.results.debug.DebugRankingFactors; +import nu.marginalia.index.forward.spans.DocumentSpans; +import nu.marginalia.index.index.CombinedIndexReader; +import nu.marginalia.index.index.StatefulIndex; +import nu.marginalia.index.model.QueryParams; +import nu.marginalia.index.model.SearchParameters; +import nu.marginalia.index.query.limit.QueryStrategy; +import nu.marginalia.index.results.model.PhraseConstraintGroupList; +import nu.marginalia.index.results.model.QuerySearchTerms; +import nu.marginalia.language.sentence.tag.HtmlTag; +import nu.marginalia.model.crawl.HtmlFeature; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.model.id.UrlIdCodec; +import nu.marginalia.model.idx.DocumentFlags; +import nu.marginalia.model.idx.DocumentMetadata; +import nu.marginalia.model.idx.WordFlags; +import nu.marginalia.sequence.CodedSequence; +import 
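
The getSearchTerms method above sorts constraints into mandatory, optional and full buckets with a pattern-matching switch, which suggests SearchPhraseConstraint is a sealed interface with record implementations. The idiom itself, as a self-contained sketch (these types are stand-ins, not the real API):

    import java.util.List;

    class ConstraintSwitchSketch {
        sealed interface Constraint permits Mandatory, Optional, Full {}
        record Mandatory(List<String> terms) implements Constraint {}
        record Optional(List<String> terms) implements Constraint {}
        record Full(List<String> terms) implements Constraint {}

        static String bucket(Constraint constraint) {
            // exhaustive without a default: the compiler knows all permitted subtypes
            return switch (constraint) {
                case Mandatory(List<String> terms) -> "mandatory: " + terms;
                case Optional(List<String> terms)  -> "optional: " + terms;
                case Full(List<String> terms)      -> "full: " + terms;
            };
        }
    }
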
nu.marginalia.sequence.SequenceOperations; + +import javax.annotation.Nullable; +import java.lang.foreign.Arena; +import java.util.BitSet; + +import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.booleanAggregate; +import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.intMaxMinAggregate; + +/** This class is responsible for calculating the score of a search result. + * It holds the data required to perform the scoring, as there is strong + * reasons to cache this data, and performs the calculations */ +public class IndexResultScoreCalculator { + private final CombinedIndexReader index; + private final QueryParams queryParams; + + private final ResultRankingContext rankingContext; + private final CompiledQuery compiledQuery; + + public IndexResultScoreCalculator(StatefulIndex statefulIndex, + ResultRankingContext rankingContext, + SearchParameters params) + { + this.index = statefulIndex.get(); + this.rankingContext = rankingContext; + + this.queryParams = params.queryParams; + this.compiledQuery = params.compiledQuery; + } + + @Nullable + public SearchResultItem calculateScore(Arena arena, + @Nullable DebugRankingFactors debugRankingFactors, + long combinedId, + QuerySearchTerms searchTerms, + long[] wordFlags, + CodedSequence[] positions) + { + + CompiledQuery positionsQuery = compiledQuery.root.newQuery(positions); + + // If the document is not relevant to the query, abort early to reduce allocations and + // avoid unnecessary calculations + + CompiledQueryLong wordFlagsQuery = compiledQuery.root.newQuery(wordFlags); + if (!meetsQueryStrategyRequirements(wordFlagsQuery, queryParams.queryStrategy())) { + return null; + } + + boolean allSynthetic = booleanAggregate(wordFlagsQuery, flags -> WordFlags.Synthetic.isPresent((byte) flags)); + int minFlagsCount = intMaxMinAggregate(wordFlagsQuery, flags -> Long.bitCount(flags & 0xff)); + int minPositionsCount = intMaxMinAggregate(positionsQuery, pos -> pos == null ? 
0 : pos.valueCount()); + + if (minFlagsCount == 0 && !allSynthetic && minPositionsCount == 0) { + return null; + } + + long docId = UrlIdCodec.removeRank(combinedId); + long docMetadata = index.getDocumentMetadata(docId); + int htmlFeatures = index.getHtmlFeatures(docId); + + int docSize = index.getDocumentSize(docId); + if (docSize <= 0) docSize = 5000; + + DocumentSpans spans = index.getDocumentSpans(arena, docId); + + if (debugRankingFactors != null) { + debugRankingFactors.addDocumentFactor("doc.docId", Long.toString(combinedId)); + debugRankingFactors.addDocumentFactor("doc.combinedId", Long.toString(docId)); + } + + // Decode the coded positions lists into plain IntLists as at this point we will be + // going over them multiple times + IntList[] decodedPositions = new IntList[positions.length]; + for (int i = 0; i < positions.length; i++) { + if (positions[i] != null) { + decodedPositions[i] = positions[i].values(); + } + else { + decodedPositions[i] = IntList.of(); + } + } + + var params = rankingContext.params; + + double documentBonus = calculateDocumentBonus(docMetadata, htmlFeatures, docSize, params, debugRankingFactors); + + VerbatimMatches verbatimMatches = new VerbatimMatches(decodedPositions, searchTerms.phraseConstraints, spans); + UnorderedMatches unorderedMatches = new UnorderedMatches(decodedPositions, compiledQuery, rankingContext.regularMask, spans); + + float proximitiyFac = getProximitiyFac(decodedPositions, searchTerms.phraseConstraints, verbatimMatches, unorderedMatches, spans); + + double score_firstPosition = params.tcfFirstPosition * (1.0 / Math.sqrt(unorderedMatches.firstPosition)); + double score_verbatim = params.tcfVerbatim * verbatimMatches.getScore(); + double score_proximity = params.tcfProximity * proximitiyFac; + double score_bM25 = params.bm25Weight + * wordFlagsQuery.root.visit(new Bm25GraphVisitor(params.bm25Params, unorderedMatches.getWeightedCounts(), docSize, rankingContext)) + / (Math.sqrt(unorderedMatches.searchableKeywordCount + 1)); + double score_bFlags = params.bm25Weight + * wordFlagsQuery.root.visit(new TermFlagsGraphVisitor(params.bm25Params, wordFlagsQuery.data, unorderedMatches.getWeightedCounts(), rankingContext)) + / (Math.sqrt(unorderedMatches.searchableKeywordCount + 1)); + + double score = normalize( + score_firstPosition + score_proximity + score_verbatim + + score_bM25 + + score_bFlags + + Math.max(0, documentBonus), + -Math.min(0, documentBonus)); + + if (Double.isNaN(score)) { // This should never happen but if it does, we want to know about it + if (getClass().desiredAssertionStatus()) { + throw new IllegalStateException("NaN in result value calculation"); + } + score = Double.MAX_VALUE; + } + + // Capture ranking factors for debugging + if (debugRankingFactors != null) { + debugRankingFactors.addDocumentFactor("score.bm25-main", Double.toString(score_bM25)); + debugRankingFactors.addDocumentFactor("score.bm25-flags", Double.toString(score_bFlags)); + debugRankingFactors.addDocumentFactor("score.verbatim", Double.toString(score_verbatim)); + debugRankingFactors.addDocumentFactor("score.proximity", Double.toString(score_proximity)); + debugRankingFactors.addDocumentFactor("score.firstPosition", Double.toString(score_firstPosition)); + + for (int i = 0; i < searchTerms.termIdsAll.size(); i++) { + long termId = searchTerms.termIdsAll.at(i); + + var flags = wordFlagsQuery.at(i); + + debugRankingFactors.addTermFactor(termId, "flags.rawEncoded", Long.toString(flags)); + + for (var flag : WordFlags.values()) { + if 
(flag.isPresent((byte) flags)) { + debugRankingFactors.addTermFactor(termId, "flags." + flag.name(), "true"); + } + } + + for (HtmlTag tag : HtmlTag.includedTags) { + if (verbatimMatches.get(tag)) { + debugRankingFactors.addTermFactor(termId, "verbatim." + tag.name().toLowerCase(), "true"); + } + } + + if (positions[i] != null) { + debugRankingFactors.addTermFactor(termId, "positions.all", positions[i].iterator()); + debugRankingFactors.addTermFactor(termId, "positions.title", SequenceOperations.findIntersections(spans.title.positionValues(), decodedPositions[i]).iterator()); + debugRankingFactors.addTermFactor(termId, "positions.heading", SequenceOperations.findIntersections(spans.heading.positionValues(), decodedPositions[i]).iterator()); + debugRankingFactors.addTermFactor(termId, "positions.anchor", SequenceOperations.findIntersections(spans.anchor.positionValues(), decodedPositions[i]).iterator()); + debugRankingFactors.addTermFactor(termId, "positions.code", SequenceOperations.findIntersections(spans.code.positionValues(), decodedPositions[i]).iterator()); + debugRankingFactors.addTermFactor(termId, "positions.nav", SequenceOperations.findIntersections(spans.nav.positionValues(), decodedPositions[i]).iterator()); + debugRankingFactors.addTermFactor(termId, "positions.body", SequenceOperations.findIntersections(spans.body.positionValues(), decodedPositions[i]).iterator()); + debugRankingFactors.addTermFactor(termId, "positions.externalLinkText", SequenceOperations.findIntersections(spans.externalLinkText.positionValues(), decodedPositions[i]).iterator()); + } + } + } + + return new SearchResultItem(combinedId, + docMetadata, + htmlFeatures, + score, + calculatePositionsMask(decodedPositions, searchTerms.phraseConstraints) + ); + } + + private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores, + QueryStrategy queryStrategy) + { + if (queryStrategy == QueryStrategy.AUTO || + queryStrategy == QueryStrategy.SENTENCE || + queryStrategy == QueryStrategy.TOPIC) { + return true; + } + + return booleanAggregate(queryGraphScores, + flags -> meetsQueryStrategyRequirements((byte) flags, queryParams.queryStrategy())); + } + + private boolean meetsQueryStrategyRequirements(byte flags, QueryStrategy queryStrategy) { + if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SITE) { + return WordFlags.Site.isPresent(flags); + } + else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SUBJECT) { + return WordFlags.Subjects.isPresent(flags); + } + else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) { + return WordFlags.Title.isPresent(flags); + } + else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_URL) { + return WordFlags.UrlPath.isPresent(flags); + } + else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_DOMAIN) { + return WordFlags.UrlDomain.isPresent(flags); + } + else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_LINK) { + return WordFlags.ExternalLink.isPresent(flags); + } + return true; + } + + /** Calculate a bitmask illustrating the intersected positions of the search terms in the document. + * This is used in the GUI. 
+ * */ + private long calculatePositionsMask(IntList[] positions, PhraseConstraintGroupList phraseConstraints) { + + long result = 0; + int bit = 0; + + IntIterator intersection = phraseConstraints.getFullGroup().findIntersections(positions).intIterator(); + + while (intersection.hasNext() && bit < 64) { + bit = (int) (Math.sqrt(intersection.nextInt())); + result |= 1L << bit; + } + + return result; + } + + + private double calculateDocumentBonus(long documentMetadata, + int features, + int length, + ResultRankingParameters rankingParams, + @Nullable DebugRankingFactors debugRankingFactors) { + + int rank = DocumentMetadata.decodeRank(documentMetadata); + int asl = DocumentMetadata.decodeAvgSentenceLength(documentMetadata); + int quality = DocumentMetadata.decodeQuality(documentMetadata); + int size = DocumentMetadata.decodeSize(documentMetadata); + int flagsPenalty = flagsPenalty(features, documentMetadata & 0xFF, size); + int topology = DocumentMetadata.decodeTopology(documentMetadata); + int year = DocumentMetadata.decodeYear(documentMetadata); + + double averageSentenceLengthPenalty = (asl >= rankingParams.shortSentenceThreshold ? 0 : -rankingParams.shortSentencePenalty); + + final double qualityPenalty = calculateQualityPenalty(size, quality, rankingParams); + final double rankingBonus = (255. - rank) * rankingParams.domainRankBonus; + final double topologyBonus = Math.log(1 + topology); + final double documentLengthPenalty = length > rankingParams.shortDocumentThreshold ? 0 : -rankingParams.shortDocumentPenalty; + final double temporalBias; + + if (rankingParams.temporalBias == ResultRankingParameters.TemporalBias.RECENT) { + temporalBias = - Math.abs(year - PubDate.MAX_YEAR) * rankingParams.temporalBiasWeight; + } else if (rankingParams.temporalBias == ResultRankingParameters.TemporalBias.OLD) { + temporalBias = - Math.abs(year - PubDate.MIN_YEAR) * rankingParams.temporalBiasWeight; + } else { + temporalBias = 0; + } + + if (debugRankingFactors != null) { + debugRankingFactors.addDocumentFactor("documentBonus.averageSentenceLengthPenalty", Double.toString(averageSentenceLengthPenalty)); + debugRankingFactors.addDocumentFactor("documentBonus.documentLengthPenalty", Double.toString(documentLengthPenalty)); + debugRankingFactors.addDocumentFactor("documentBonus.qualityPenalty", Double.toString(qualityPenalty)); + debugRankingFactors.addDocumentFactor("documentBonus.rankingBonus", Double.toString(rankingBonus)); + debugRankingFactors.addDocumentFactor("documentBonus.topologyBonus", Double.toString(topologyBonus)); + debugRankingFactors.addDocumentFactor("documentBonus.temporalBias", Double.toString(temporalBias)); + debugRankingFactors.addDocumentFactor("documentBonus.flagsPenalty", Double.toString(flagsPenalty)); + } + + return averageSentenceLengthPenalty + + documentLengthPenalty + + qualityPenalty + + rankingBonus + + topologyBonus + + temporalBias + + flagsPenalty; + } + + /** Calculate the proximity factor for the document. + *

+ * The proximity factor is a bonus based on how close the search terms are to each other in the document + * that turns into a penalty if the distance is too large. + * */ + private static float getProximitiyFac(IntList[] positions, + PhraseConstraintGroupList constraintGroups, + VerbatimMatches verbatimMatches, + UnorderedMatches unorderedMatches, + DocumentSpans spans + ) { + float proximitiyFac = 0; + + if (positions.length > 2) { + int minDist = constraintGroups.getFullGroup().minDistance(positions); + if (minDist > 0 && minDist < Integer.MAX_VALUE) { + if (minDist < 32) { + // If min-dist is sufficiently small, we give a tapering reward to the document + proximitiyFac = 2.0f / (0.1f + (float) Math.sqrt(minDist)); + } else { + // if it is too large, we add a mounting penalty + proximitiyFac = -1.0f * (float) Math.sqrt(minDist); + } + } + } + + + // Give bonus proximity score if all keywords are in the title + if (!verbatimMatches.get(HtmlTag.TITLE) && unorderedMatches.searchableKeywordCount > 2 && unorderedMatches.getObservationCount(HtmlTag.TITLE) == unorderedMatches.searchableKeywordCount) { + proximitiyFac += unorderedMatches.getObservationCount(HtmlTag.TITLE) * (2.5f + 2.f / Math.max(1, spans.title.length())); + } + // Give bonus proximity score if all keywords are in a heading + if (spans.heading.size() < 64 && + ! verbatimMatches.get(HtmlTag.HEADING) + && unorderedMatches.getObservationCount(HtmlTag.HEADING) == unorderedMatches.searchableKeywordCount) + { + proximitiyFac += 1.0f * unorderedMatches.getObservationCount(HtmlTag.HEADING); + } + + return proximitiyFac; + } + + /** A helper class for capturing the verbatim phrase matches in the document */ + private static class VerbatimMatches { + private final BitSet matches; + private float score = 0.f; + + private static final float[] weights_full; + private static final float[] weights_partial; + + static { + weights_full = new float[HtmlTag.includedTags.length]; + weights_partial = new float[HtmlTag.includedTags.length]; + + for (int i = 0; i < weights_full.length; i++) { + weights_full[i] = switch(HtmlTag.includedTags[i]) { + case TITLE -> 4.0f; + case HEADING -> 1.5f; + case ANCHOR -> 0.2f; + case NAV -> 0.1f; + case CODE -> 0.25f; + case EXTERNAL_LINKTEXT -> 1.0f; + case BODY -> 1.0f; + default -> 0.0f; + }; + } + + for (int i = 0; i < weights_partial.length; i++) { + weights_partial[i] = switch(HtmlTag.includedTags[i]) { + case TITLE -> 1.5f; + case HEADING -> 1.f; + case ANCHOR -> 0.2f; + case NAV -> 0.1f; + case CODE -> 0.25f; + case EXTERNAL_LINKTEXT -> 1.0f; + case BODY -> 0.25f; + default -> 0.0f; + }; + } + } + + public VerbatimMatches(IntList[] positions, PhraseConstraintGroupList constraints, DocumentSpans spans) { + matches = new BitSet(HtmlTag.includedTags.length); + + int largestOptional = constraints.getFullGroup().size; + if (largestOptional < 2) { + return; + } + + // Capture full query matches + var fullGroup = constraints.getFullGroup(); + IntList fullGroupIntersections = fullGroup.findIntersections(positions); + for (var tag : HtmlTag.includedTags) { + if (spans.getSpan(tag).containsRange(fullGroupIntersections, fullGroup.size)) { + matches.set(tag.ordinal()); + score += weights_full[tag.ordinal()] * fullGroup.size; + } + } + + // For optional groups, we scale the score by the size of the group relative to the full group + for (var optionalGroup : constraints.getOptionalGroups()) { + int groupSize = optionalGroup.size; + float sizeScalingFactor = groupSize / (float) largestOptional; + + IntList intersections 
= optionalGroup.findIntersections(positions); + for (var tag : HtmlTag.includedTags) { + if (spans.getSpan(tag).containsRange(intersections, groupSize)) { + score += weights_partial[tag.ordinal()] * optionalGroup.size * sizeScalingFactor; + } + } + } + } + + public boolean get(HtmlTag tag) { + assert !tag.exclude; + return matches.get(tag.ordinal()); + } + + public float getScore() { + return score; + } + } + + /** A helper class for capturing the counts of unordered matches in the document */ + private static class UnorderedMatches { + private final int[] observationsByTag; + private final float[] valuesByWordIdx; + private static final float[] weights; + + private int firstPosition = 1; + private int searchableKeywordCount = 0; + static { + weights = new float[HtmlTag.includedTags.length]; + + for (int i = 0; i < weights.length; i++) { + weights[i] = switch(HtmlTag.includedTags[i]) { + case TITLE -> 2.5f; + case HEADING -> 2.5f; + case ANCHOR -> 0.2f; + case NAV -> 0.1f; + case CODE -> 0.25f; + case BODY -> 1.0f; + default -> 0.0f; + }; + } + } + + public UnorderedMatches(IntList[] positions, CompiledQuery compiledQuery, + BitSet regularMask, + DocumentSpans spans) { + observationsByTag = new int[HtmlTag.includedTags.length]; + valuesByWordIdx = new float[compiledQuery.size()]; + + for (int i = 0; i < compiledQuery.size(); i++) { + + if (positions[i] == null || !regularMask.get(i)) + continue; + + if (positions[i].isEmpty()) continue; + + firstPosition = Math.max(firstPosition, positions[i].getInt(0)); + searchableKeywordCount ++; + + int[] posArray = positions[i].toIntArray(); + for (var tag : HtmlTag.includedTags) { + int cnt = spans.getSpan(tag).countIntersections(posArray); + observationsByTag[tag.ordinal()] += cnt; + valuesByWordIdx[i] += cnt * weights[tag.ordinal()]; + } + } + } + + public int getObservationCount(HtmlTag tag) { + return observationsByTag[tag.ordinal()]; + } + + public float[] getWeightedCounts() { + return valuesByWordIdx; + } + + public int size() { + return valuesByWordIdx.length; + } + } + + + private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) { + if (size < 400) { + if (quality < 5) + return 0; + return -quality * rankingParams.qualityPenalty; + } + else { + return -quality * rankingParams.qualityPenalty * 20; + } + } + + private int flagsPenalty(int featureFlags, long docFlags, int size) { + + // Short-circuit for index-service, which does not have the feature flags + if (featureFlags == 0) + return 0; + + double penalty = 0; + + boolean isForum = DocumentFlags.GeneratorForum.isPresent(docFlags); + boolean isWiki = DocumentFlags.GeneratorWiki.isPresent(docFlags); + boolean isDocs = DocumentFlags.GeneratorDocs.isPresent(docFlags); + + // Penalize large sites harder for any bullshit as it's a strong signal of a low quality site + double largeSiteFactor = 1.; + + if (!isForum && !isWiki && !isDocs && size > 400) { + // Long urls-that-look-like-this tend to be poor search results + if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.KEBAB_CASE_URL.getFeatureBit())) + penalty += 30.0; + else if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.LONG_URL.getFeatureBit())) + penalty += 30.; + else penalty += 5.; + + largeSiteFactor = 2; + } + + if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING_ADTECH.getFeatureBit())) + penalty += 7.5 * largeSiteFactor; + + if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.AFFILIATE_LINK.getFeatureBit())) + penalty += 5.0 * largeSiteFactor; + + if 
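
Numbers make the taper in getProximitiyFac further above concrete: it only applies when more than two terms carry positions, a minimum distance of 1 earns about 2.0 / (0.1 + 1) = 1.82, a distance of 25 only about 0.39, and from 32 upward the bonus flips into a penalty of -sqrt(minDist), e.g. -10 at distance 100. As a sketch:

    class ProximityDemo {
        static float proximityScore(int minDist) {
            if (minDist <= 0 || minDist == Integer.MAX_VALUE)
                return 0; // no usable distance between the terms
            if (minDist < 32)
                return 2.0f / (0.1f + (float) Math.sqrt(minDist)); // tapering reward
            return -1.0f * (float) Math.sqrt(minDist);             // mounting penalty
        }

        public static void main(String[] args) {
            System.out.println(proximityScore(1));   // ~1.82
            System.out.println(proximityScore(25));  // ~0.39
            System.out.println(proximityScore(100)); // -10.0
        }
    }
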
(DocumentMetadata.hasFlags(featureFlags, HtmlFeature.COOKIES.getFeatureBit()))
+            penalty += 2.5 * largeSiteFactor;
+
+        if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING.getFeatureBit()))
+            penalty += 2.5 * largeSiteFactor;
+
+        if (isForum || isWiki) {
+            penalty = Math.min(0, penalty - 2);
+        }
+
+        return (int) -penalty;
+    }
+
+    /** Normalize a value to a score where 0 is the best possible score and larger
+     * values are worse. The nominal 0...15 range is a historical artifact of the
+     * original ranking function; the upper bound is not strict (e.g. value=0,
+     * penalty=0 gives sqrt(501) ≈ 22.4).
+     *
+     * @param value The value to normalize, must be positive or zero
+     * @param penalty Any negative component of the value, passed as a positive magnitude
+     * */
+    public static double normalize(double value, double penalty) {
+        if (value < 0)
+            value = 0;
+
+        return Math.sqrt((1.0 + 500. + 10 * penalty) / (1.0 + value));
+    }
+
+}
diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java b/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java
deleted file mode 100644
index 0fc4bdc1..00000000
--- a/code/index/java/nu/marginalia/index/results/IndexResultValuationContext.java
+++ /dev/null
@@ -1,171 +0,0 @@
-package nu.marginalia.index.results;
-
-import nu.marginalia.api.searchquery.model.compiled.*;
-import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
-import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
-import nu.marginalia.api.searchquery.model.results.SearchResultItem;
-import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
-import nu.marginalia.index.index.CombinedIndexReader;
-import nu.marginalia.index.index.StatefulIndex;
-import nu.marginalia.index.model.SearchParameters;
-import nu.marginalia.index.results.model.ids.CombinedDocIdList;
-import nu.marginalia.index.model.QueryParams;
-import nu.marginalia.index.results.model.QuerySearchTerms;
-import nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds;
-import nu.marginalia.model.id.UrlIdCodec;
-import nu.marginalia.model.idx.WordFlags;
-import nu.marginalia.model.idx.WordMetadata;
-import nu.marginalia.index.query.limit.QueryStrategy;
-import nu.marginalia.ranking.results.ResultValuator;
-
-import javax.annotation.Nullable;
-import java.util.List;
-
-/** This class is responsible for calculating the score of a search result.
- * It holds the data required to perform the scoring, as there is strong - * reasons to cache this data, and performs the calculations */ -public class IndexResultValuationContext { - private final CombinedIndexReader index; - private final QueryParams queryParams; - - private final TermMetadataForCombinedDocumentIds termMetadataForCombinedDocumentIds; - private final QuerySearchTerms searchTerms; - - private final ResultRankingContext rankingContext; - private final ResultValuator searchResultValuator; - private final CompiledQuery compiledQuery; - private final CompiledQueryLong compiledQueryIds; - - public IndexResultValuationContext(IndexMetadataService metadataService, - ResultValuator searchResultValuator, - CombinedDocIdList ids, - StatefulIndex statefulIndex, - ResultRankingContext rankingContext, - SearchParameters params - ) { - this.index = statefulIndex.get(); - this.rankingContext = rankingContext; - this.searchResultValuator = searchResultValuator; - - this.queryParams = params.queryParams; - this.compiledQuery = params.compiledQuery; - this.compiledQueryIds = params.compiledQueryIds; - - this.searchTerms = metadataService.getSearchTerms(params.compiledQuery, params.query); - - this.termMetadataForCombinedDocumentIds = metadataService.getTermMetadataForDocuments(ids, - searchTerms.termIdsAll); - } - - private final long flagsFilterMask = - WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit(); - - @Nullable - public SearchResultItem calculatePreliminaryScore(long combinedId) { - - long docId = UrlIdCodec.removeRank(combinedId); - - if (!searchTerms.coherences.test(termMetadataForCombinedDocumentIds, combinedId)) - return null; - - long docMetadata = index.getDocumentMetadata(docId); - int htmlFeatures = index.getHtmlFeatures(docId); - - SearchResultItem searchResult = new SearchResultItem(docId, - docMetadata, - htmlFeatures, - hasPrioTerm(combinedId)); - - long[] wordMetas = new long[compiledQuery.size()]; - SearchResultKeywordScore[] scores = new SearchResultKeywordScore[compiledQuery.size()]; - - for (int i = 0; i < wordMetas.length; i++) { - final long termId = compiledQueryIds.at(i); - final String term = compiledQuery.at(i); - - wordMetas[i] = termMetadataForCombinedDocumentIds.getTermMetadata(termId, combinedId); - scores[i] = new SearchResultKeywordScore(term, termId, wordMetas[i]); - } - - - // DANGER: IndexResultValuatorService assumes that searchResult.keywordScores has this specific order, as it needs - // to be able to re-construct its own CompiledQuery for re-ranking the results. This is - // a very flimsy assumption. 
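The caveat above documents the fragile contract between these two deleted classes: the scores list and the compiled query communicate purely through iteration order. A minimal sketch of that invariant, using hypothetical stand-in types rather than the real SearchResultItem and CompiledQuery API:

```java
import java.util.List;

// Hypothetical stand-ins for CompiledQuery and SearchResultKeywordScore,
// only to illustrate the parallel-ordering contract described above.
class OrderingInvariantSketch {
    record KeywordScore(String term, long encodedMeta) {}

    public static void main(String[] args) {
        List<String> queryTerms = List.of("hello", "world");  // order of the compiled query
        List<KeywordScore> scores = List.of(                  // must mirror that order exactly
                new KeywordScore("hello", 0x1L),
                new KeywordScore("world", 0x2L));

        long[] wordMetas = new long[queryTerms.size()];
        for (int i = 0; i < queryTerms.size(); i++) {
            // If scores were ever permuted, wordMetas[i] would silently describe
            // the wrong term; nothing in the types enforces the correspondence.
            wordMetas[i] = scores.get(i).encodedMeta();
        }
    }
}
```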
- searchResult.keywordScores.addAll(List.of(scores)); - - CompiledQueryLong wordMetasQuery = new CompiledQueryLong(compiledQuery.root, new CqDataLong(wordMetas)); - - - boolean allSynthetic = CompiledQueryAggregates.booleanAggregate(wordMetasQuery, WordFlags.Synthetic::isPresent); - int flagsCount = CompiledQueryAggregates.intMaxMinAggregate(wordMetasQuery, wordMeta -> Long.bitCount(wordMeta & flagsFilterMask)); - int positionsCount = CompiledQueryAggregates.intMaxMinAggregate(wordMetasQuery, wordMeta -> Long.bitCount(WordMetadata.decodePositions(wordMeta))); - - if (!meetsQueryStrategyRequirements(wordMetasQuery, queryParams.queryStrategy())) { - return null; - } - - if (flagsCount == 0 && !allSynthetic && positionsCount == 0) - return null; - - double score = searchResultValuator.calculateSearchResultValue( - wordMetasQuery, - docMetadata, - htmlFeatures, - 5000, // use a dummy value here as it's not present in the index - rankingContext, - null); - - if (searchResult.hasPrioTerm) { - score = 0.75 * score; - } - - searchResult.setScore(score); - - return searchResult; - } - - private boolean hasPrioTerm(long combinedId) { - for (var term : searchTerms.termIdsPrio.array()) { - if (termMetadataForCombinedDocumentIds.hasTermMeta(term, combinedId)) { - return true; - } - } - return false; - } - - private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores, - QueryStrategy queryStrategy) - { - if (queryStrategy == QueryStrategy.AUTO || - queryStrategy == QueryStrategy.SENTENCE || - queryStrategy == QueryStrategy.TOPIC) { - return true; - } - - return CompiledQueryAggregates.booleanAggregate(queryGraphScores, - docs -> meetsQueryStrategyRequirements(docs, queryParams.queryStrategy())); - } - - private boolean meetsQueryStrategyRequirements(long wordMeta, QueryStrategy queryStrategy) { - if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SITE) { - return WordFlags.Site.isPresent(wordMeta); - } - else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_SUBJECT) { - return WordFlags.Subjects.isPresent(wordMeta); - } - else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_TITLE) { - return WordFlags.Title.isPresent(wordMeta); - } - else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_URL) { - return WordFlags.UrlPath.isPresent(wordMeta); - } - else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_DOMAIN) { - return WordFlags.UrlDomain.isPresent(wordMeta); - } - else if (queryStrategy == QueryStrategy.REQUIRE_FIELD_LINK) { - return WordFlags.ExternalLink.isPresent(wordMeta); - } - return true; - } - -} diff --git a/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java b/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java deleted file mode 100644 index baecb564..00000000 --- a/code/index/java/nu/marginalia/index/results/IndexResultValuatorService.java +++ /dev/null @@ -1,218 +0,0 @@ -package nu.marginalia.index.results; - -import com.google.inject.Inject; -import com.google.inject.Singleton; -import gnu.trove.list.TLongList; -import gnu.trove.list.array.TLongArrayList; -import it.unimi.dsi.fastutil.longs.LongSet; -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; -import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; -import nu.marginalia.api.searchquery.model.compiled.CqDataInt; -import nu.marginalia.api.searchquery.model.compiled.CqDataLong; -import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; -import 
nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; -import nu.marginalia.api.searchquery.model.results.ResultRankingContext; -import nu.marginalia.api.searchquery.model.results.SearchResultItem; -import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails; -import nu.marginalia.index.index.StatefulIndex; -import nu.marginalia.index.model.SearchParameters; -import nu.marginalia.index.results.model.ids.CombinedDocIdList; -import nu.marginalia.linkdb.docs.DocumentDbReader; -import nu.marginalia.linkdb.model.DocdbUrlDetail; -import nu.marginalia.model.idx.WordMetadata; -import nu.marginalia.ranking.results.ResultValuator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.sql.SQLException; -import java.util.*; -import java.util.function.Consumer; - -@Singleton -public class IndexResultValuatorService { - private static final Logger logger = LoggerFactory.getLogger(IndexResultValuatorService.class); - - private final IndexMetadataService metadataService; - private final DocumentDbReader documentDbReader; - private final ResultValuator resultValuator; - private final StatefulIndex statefulIndex; - - @Inject - public IndexResultValuatorService(IndexMetadataService metadataService, - DocumentDbReader documentDbReader, - ResultValuator resultValuator, - StatefulIndex statefulIndex) - { - this.metadataService = metadataService; - this.documentDbReader = documentDbReader; - this.resultValuator = resultValuator; - this.statefulIndex = statefulIndex; - } - - public List rankResults(SearchParameters params, - ResultRankingContext rankingContext, - CombinedDocIdList resultIds) - { - final var evaluator = createValuationContext(params, rankingContext, resultIds); - - List results = new ArrayList<>(resultIds.size()); - - for (long id : resultIds.array()) { - var score = evaluator.calculatePreliminaryScore(id); - if (score != null) { - results.add(score); - } - } - - return results; - } - - private IndexResultValuationContext createValuationContext(SearchParameters params, - ResultRankingContext rankingContext, - CombinedDocIdList resultIds) - { - return new IndexResultValuationContext(metadataService, - resultValuator, - resultIds, - statefulIndex, - rankingContext, - params); - } - - - public List selectBestResults(SearchParameters params, - ResultRankingContext rankingContext, - Collection results) throws SQLException { - - var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain); - - List resultsList = new ArrayList<>(results.size()); - - for (var item : results) { - if (domainCountFilter.test(item)) { - // It's important that this filter runs across all results, not just the top N - if (resultsList.size() < params.limitTotal) { - resultsList.add(item); - } - } - } - - for (var item : resultsList) { - item.resultsFromDomain = domainCountFilter.getCount(item); - } - - return decorateAndRerank(resultsList, params.compiledQuery, rankingContext); - } - - /** Decorate the result items with additional information from the link database - * and calculate an updated ranking with the additional information */ - public List decorateAndRerank(List rawResults, - CompiledQuery compiledQuery, - ResultRankingContext rankingContext) - throws SQLException - { - TLongList idsList = new TLongArrayList(rawResults.size()); - - for (var result : rawResults) - idsList.add(result.getDocumentId()); - - Map urlDetailsById = new HashMap<>(rawResults.size()); - - for (var item : documentDbReader.getUrlDetails(idsList)) - 
urlDetailsById.put(item.urlId(), item); - - List resultItems = new ArrayList<>(rawResults.size()); - for (var result : rawResults) { - var id = result.getDocumentId(); - var docData = urlDetailsById.get(id); - - if (docData == null) { - logger.warn("No document data for id {}", id); - continue; - } - - // Reconstruct the compiledquery for re-valuation - // - // CAVEAT: This hinges on a very fragile that IndexResultValuationContext puts them in the same - // order as the data for the CompiledQuery. - long[] wordMetas = new long[compiledQuery.size()]; - - for (int i = 0; i < compiledQuery.size(); i++) { - var score = result.keywordScores.get(i); - wordMetas[i] = score.encodedWordMetadata(); - } - - CompiledQueryLong metaQuery = new CompiledQueryLong(compiledQuery.root, new CqDataLong(wordMetas)); - - resultItems.add(createCombinedItem( - result, - docData, - metaQuery, - rankingContext)); - } - return resultItems; - } - - private DecoratedSearchResultItem createCombinedItem(SearchResultItem result, - DocdbUrlDetail docData, - CompiledQueryLong wordMetas, - ResultRankingContext rankingContext) { - - ResultRankingDetailsExtractor detailsExtractor = new ResultRankingDetailsExtractor(); - Consumer detailConsumer = rankingContext.params.exportDebugData ? detailsExtractor::set : null; - - double score = resultValuator.calculateSearchResultValue(wordMetas, - result.encodedDocMetadata, - result.htmlFeatures, - docData.wordsTotal(), - rankingContext, - detailConsumer); - - return new DecoratedSearchResultItem( - result, - docData.url(), - docData.title(), - docData.description(), - docData.urlQuality(), - docData.format(), - docData.features(), - docData.pubYear(), - docData.dataHash(), - docData.wordsTotal(), - bestPositions(wordMetas), - score, - detailsExtractor.get() - ); - } - - private static class ResultRankingDetailsExtractor { - private ResultRankingDetails value = null; - - public ResultRankingDetails get() { - return value; - } - public void set(ResultRankingDetails value) { - this.value = value; - } - } - - private long bestPositions(CompiledQueryLong wordMetas) { - LongSet positionsSet = CompiledQueryAggregates.positionsAggregate(wordMetas, WordMetadata::decodePositions); - - int bestPc = 0; - long bestPositions = 0; - - var li = positionsSet.longIterator(); - - while (li.hasNext()) { - long pos = li.nextLong(); - int pc = Long.bitCount(pos); - if (pc > bestPc) { - bestPc = pc; - bestPositions = pos; - } - } - - return bestPositions; - } -} diff --git a/code/index/java/nu/marginalia/ranking/results/factors/Bm25PrioGraphVisitor.java b/code/index/java/nu/marginalia/index/results/TermFlagsGraphVisitor.java similarity index 84% rename from code/index/java/nu/marginalia/ranking/results/factors/Bm25PrioGraphVisitor.java rename to code/index/java/nu/marginalia/index/results/TermFlagsGraphVisitor.java index 1fb26f6b..e4255a5e 100644 --- a/code/index/java/nu/marginalia/ranking/results/factors/Bm25PrioGraphVisitor.java +++ b/code/index/java/nu/marginalia/index/results/TermFlagsGraphVisitor.java @@ -1,4 +1,4 @@ -package nu.marginalia.ranking.results.factors; +package nu.marginalia.index.results; import nu.marginalia.api.searchquery.model.compiled.CqDataInt; import nu.marginalia.api.searchquery.model.compiled.CqDataLong; @@ -6,23 +6,25 @@ import nu.marginalia.api.searchquery.model.compiled.CqExpression; import nu.marginalia.api.searchquery.model.results.Bm25Parameters; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.model.idx.WordFlags; -import 
nu.marginalia.model.idx.WordMetadata; import java.util.List; -public class Bm25PrioGraphVisitor implements CqExpression.DoubleVisitor { +public class TermFlagsGraphVisitor implements CqExpression.DoubleVisitor { private static final long AVG_LENGTH = 5000; private final CqDataLong wordMetaData; private final CqDataInt frequencies; + private final float[] counts; private final Bm25Parameters bm25Parameters; private final int docCount; - public Bm25PrioGraphVisitor(Bm25Parameters bm25Parameters, - CqDataLong wordMetaData, - ResultRankingContext ctx) { + public TermFlagsGraphVisitor(Bm25Parameters bm25Parameters, + CqDataLong wordMetaData, + float[] counts, + ResultRankingContext ctx) { this.bm25Parameters = bm25Parameters; + this.counts = counts; this.docCount = ctx.termFreqDocCount(); this.wordMetaData = wordMetaData; this.frequencies = ctx.fullCounts; @@ -48,7 +50,7 @@ public class Bm25PrioGraphVisitor implements CqExpression.DoubleVisitor { @Override public double onLeaf(int idx) { - double count = evaluatePriorityScore(wordMetaData.get(idx)); + double count = evaluatePriorityScore(idx); int freq = frequencies.get(idx); @@ -56,8 +58,9 @@ public class Bm25PrioGraphVisitor implements CqExpression.DoubleVisitor { return invFreq(docCount, freq) * f(bm25Parameters.k(), 0, count, 0); } - private static double evaluatePriorityScore(long wordMeta) { - int pcount = Long.bitCount(WordMetadata.decodePositions(wordMeta)); + private double evaluatePriorityScore(int idx) { + byte wordMeta = (byte) wordMetaData.get(idx); + float pcount = counts[idx]; double qcount = 0.; @@ -95,8 +98,6 @@ public class Bm25PrioGraphVisitor implements CqExpression.DoubleVisitor { qcount += 1.25; if ((wordMeta & WordFlags.NamesWords.asBit()) != 0) qcount += 0.25; - if ((wordMeta & WordFlags.TfIdfHigh.asBit()) != 0) - qcount += 0.5; } return qcount; @@ -124,4 +125,4 @@ public class Bm25PrioGraphVisitor implements CqExpression.DoubleVisitor { return (count * (k + 1)) / (count + k * (1 - b + b * lengthRatio)); } -} +} \ No newline at end of file diff --git a/code/index/java/nu/marginalia/index/results/model/PhraseConstraintGroupList.java b/code/index/java/nu/marginalia/index/results/model/PhraseConstraintGroupList.java new file mode 100644 index 00000000..11cce9a4 --- /dev/null +++ b/code/index/java/nu/marginalia/index/results/model/PhraseConstraintGroupList.java @@ -0,0 +1,183 @@ +package nu.marginalia.index.results.model; + +import it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.ints.IntIterator; +import it.unimi.dsi.fastutil.ints.IntList; +import nu.marginalia.index.model.SearchTermsUtil; +import nu.marginalia.index.results.model.ids.TermIdList; +import nu.marginalia.sequence.CodedSequence; +import nu.marginalia.sequence.SequenceOperations; + +import java.util.ArrayList; +import java.util.BitSet; +import java.util.Collections; +import java.util.List; + +/** + * wordIds that we require to be in the same sentence + */ +public class PhraseConstraintGroupList { + /** A list of groups representing parts of the query that must be present in the specified order */ + private final List mandatoryGroups = new ArrayList<>(); + + /** A list of groups representing segments of the query */ + private final List optionalGroups = new ArrayList<>(); + + /** A group representing all terms in the query, segmentation be damned */ + private final PhraseConstraintGroup fullGroup; + + public PhraseConstraintGroupList( + PhraseConstraintGroup fullGroup, + List mandatoryGroups, + List optionalGroups) { + 
this.mandatoryGroups.addAll(mandatoryGroups); + this.optionalGroups.addAll(optionalGroups); + this.fullGroup = fullGroup; + } + + public List getOptionalGroups() { + return Collections.unmodifiableList(optionalGroups); + } + + public PhraseConstraintGroup getFullGroup() { + return fullGroup; + } + + public boolean testMandatory(CodedSequence[] positions) { + + for (var constraint : mandatoryGroups) { + if (!constraint.test(positions)) { + return false; + } + } + + return true; + } + + public static final class PhraseConstraintGroup { + private final int[] offsets; + private final BitSet present; + private final BitSet termIdsMask; + + public final int size; + public PhraseConstraintGroup(List terms, TermIdList termIdsAll) { + offsets = new int[terms.size()]; + present = new BitSet(terms.size()); + size = terms.size(); + + termIdsMask = new BitSet(termIdsAll.size()); + + int i = 0; + for (String term : terms) { + if (term.isEmpty()) { + continue; + } + + present.set(i); + long termId = SearchTermsUtil.getWordId(term); + + int idx = termIdsAll.indexOf(termId); + if (idx < 0) { + offsets[i++] = -1; + } + else { + offsets[i++] = idx; + termIdsMask.set(idx); + } + } + } + + /** Returns true if the term with index termIdx in the query is in the group */ + public boolean containsTerm(int termIdx) { + return termIdsMask.get(termIdx); + } + + public boolean test(CodedSequence[] positions) { + IntIterator[] sequences = new IntIterator[present.cardinality()]; + + for (int oi = 0, si = 0; oi < offsets.length; oi++) { + if (!present.get(oi)) { + continue; + } + int offset = offsets[oi]; + if (offset < 0) + return false; + + // Create iterators that are offset by their relative position in the + // sequence. This is done by subtracting the index from the offset, + // so that when we intersect them, an overlap means that the terms are + // in the correct order. Note the offset is negative! + + var posForTerm = positions[offset]; + if (posForTerm == null) { + return false; + } + sequences[si++] = posForTerm.offsetIterator(-oi); + } + + return SequenceOperations.intersectSequences(sequences); + } + + + public IntList findIntersections(IntList[] positions) { + IntList[] sequences = new IntList[present.cardinality()]; + int[] iterOffsets = new int[sequences.length]; + + for (int oi = 0, si = 0; oi < offsets.length; oi++) { + if (!present.get(oi)) { + continue; + } + int offset = offsets[oi]; + if (offset < 0) + return IntList.of(); + + // Create iterators that are offset by their relative position in the + // sequence. This is done by subtracting the index from the offset, + // so that when we intersect them, an overlap means that the terms are + // in the correct order. Note the offset is negative! + + var posForTerm = positions[offset]; + if (posForTerm == null) { + return IntList.of(); + } + sequences[si++] = posForTerm; + iterOffsets[si - 1] = -oi; + } + + return SequenceOperations.findIntersections(sequences, iterOffsets); + } + + public int minDistance(IntList[] positions) { + List sequences = new ArrayList<>(present.cardinality()); + IntList iterOffsets = new IntArrayList(present.cardinality()); + + for (int oi = 0; oi < offsets.length; oi++) { + if (!present.get(oi)) { + continue; + } + int offset = offsets[oi]; + if (offset < 0) + return Integer.MAX_VALUE; + + // Create iterators that are offset by their relative position in the + // sequence. This is done by subtracting the index from the offset, + // so that when we intersect them, an overlap means that the terms are + // in the correct order. 
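// (Worked example with invented positions: for the pair ["free", "software"],
// suppose "free" occurs at {10, 25} and "software" at {11, 40}. Shifting
// "software" by its index, -1, gives {10, 39}; intersecting with {10, 25}
// leaves {10}, i.e. the terms appear adjacent and in order at position 10.)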
Note the offset is negative! + + var posForTerm = positions[offset]; + if (posForTerm == null) { + return Integer.MAX_VALUE; + } + + if (posForTerm.size() > 16) { // heuristic to avoid large sequences, which is expensive and not very useful + continue; + } + + sequences.add(posForTerm); + iterOffsets.add(-oi); + } + + return SequenceOperations.minDistance(sequences.toArray(IntList[]::new), iterOffsets.toIntArray()); + } + } +} diff --git a/code/index/java/nu/marginalia/index/results/model/QuerySearchTerms.java b/code/index/java/nu/marginalia/index/results/model/QuerySearchTerms.java index bbb7cf30..d41ea5e3 100644 --- a/code/index/java/nu/marginalia/index/results/model/QuerySearchTerms.java +++ b/code/index/java/nu/marginalia/index/results/model/QuerySearchTerms.java @@ -6,18 +6,15 @@ import nu.marginalia.index.results.model.ids.TermIdList; public class QuerySearchTerms { private final TObjectLongHashMap termToId; public final TermIdList termIdsAll; - public final TermIdList termIdsPrio; - public final TermCoherenceGroupList coherences; + public final PhraseConstraintGroupList phraseConstraints; public QuerySearchTerms(TObjectLongHashMap termToId, TermIdList termIdsAll, - TermIdList termIdsPrio, - TermCoherenceGroupList coherences) { + PhraseConstraintGroupList phraseConstraints) { this.termToId = termToId; this.termIdsAll = termIdsAll; - this.termIdsPrio = termIdsPrio; - this.coherences = coherences; + this.phraseConstraints = phraseConstraints; } public long getIdForTerm(String searchTerm) { diff --git a/code/index/java/nu/marginalia/index/results/model/TermCoherenceGroupList.java b/code/index/java/nu/marginalia/index/results/model/TermCoherenceGroupList.java deleted file mode 100644 index 67b5fd60..00000000 --- a/code/index/java/nu/marginalia/index/results/model/TermCoherenceGroupList.java +++ /dev/null @@ -1,54 +0,0 @@ -package nu.marginalia.index.results.model; - -import nu.marginalia.index.model.SearchTermsUtil; -import nu.marginalia.model.idx.WordMetadata; - -import java.util.Collections; -import java.util.List; - -/** - * wordIds that we require to be in the same sentence - */ -public record TermCoherenceGroupList(List words) { - - public TermCoherenceGroupList(List words) { - this.words = Collections.unmodifiableList(words); - } - - public boolean test(TermMetadataForCombinedDocumentIds documents, long combinedId) { - for (var coherenceSet : words()) { - if (!coherenceSet.test(documents, combinedId)) { - return false; - } - } - - return true; - } - - public static final class TermCoherenceGroup { - private final long[] words; - - public TermCoherenceGroup(long[] words) { - this.words = words; - } - - public TermCoherenceGroup(List coh) { - this(coh.stream().mapToLong(SearchTermsUtil::getWordId).toArray()); - } - - public boolean test(TermMetadataForCombinedDocumentIds documents, long combinedId) { - long overlap = 0xFF_FFFF_FFFF_FFFFL; - - for (var word : words) { - long meta = documents.getTermMetadata(word, combinedId); - - // if the word is not present in the document, we omit it from the coherence check - if (meta != 0L) { - overlap &= meta; - } - } - - return WordMetadata.decodePositions(overlap) != 0L; - } - } -} diff --git a/code/index/java/nu/marginalia/index/results/model/TermMetadataForCombinedDocumentIds.java b/code/index/java/nu/marginalia/index/results/model/TermMetadataForCombinedDocumentIds.java index 3ef2f7ab..bf111386 100644 --- a/code/index/java/nu/marginalia/index/results/model/TermMetadataForCombinedDocumentIds.java +++ 
b/code/index/java/nu/marginalia/index/results/model/TermMetadataForCombinedDocumentIds.java @@ -1,26 +1,38 @@ package nu.marginalia.index.results.model; -import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap; import it.unimi.dsi.fastutil.longs.Long2ObjectArrayMap; +import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap; +import nu.marginalia.index.positions.TermData; import nu.marginalia.index.results.model.ids.CombinedDocIdList; -import nu.marginalia.index.results.model.ids.DocMetadataList; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import nu.marginalia.index.results.model.ids.TermMetadataList; +import nu.marginalia.sequence.CodedSequence; + +import javax.annotation.Nullable; public class TermMetadataForCombinedDocumentIds { - private static final Logger logger = LoggerFactory.getLogger(TermMetadataForCombinedDocumentIds.class); private final Long2ObjectArrayMap termdocToMeta; public TermMetadataForCombinedDocumentIds(Long2ObjectArrayMap termdocToMeta) { this.termdocToMeta = termdocToMeta; } - public long getTermMetadata(long termId, long combinedId) { + public byte getTermMetadata(long termId, long combinedId) { var metaByCombinedId = termdocToMeta.get(termId); if (metaByCombinedId == null) { return 0; } - return metaByCombinedId.get(combinedId); + return metaByCombinedId.get(combinedId).flags(); + } + + @Nullable + public CodedSequence getPositions(long termId, long combinedId) { + var metaByCombinedId = termdocToMeta.get(termId); + + if (metaByCombinedId == null) { + return null; + } + + return metaByCombinedId.get(combinedId).positions(); } public boolean hasTermMeta(long termId, long combinedId) { @@ -30,16 +42,25 @@ public class TermMetadataForCombinedDocumentIds { return false; } - return metaByCombinedId.get(combinedId) != 0; + return metaByCombinedId.data().containsKey(combinedId); } - public record DocumentsWithMetadata(Long2LongOpenHashMap data) { - public DocumentsWithMetadata(CombinedDocIdList combinedDocIdsAll, DocMetadataList metadata) { - this(new Long2LongOpenHashMap(combinedDocIdsAll.array(), metadata.array())); + public record DocumentsWithMetadata(Long2ObjectOpenHashMap data) { + public DocumentsWithMetadata(CombinedDocIdList combinedDocIdsAll, TermMetadataList metadata) { + this(new Long2ObjectOpenHashMap<>(combinedDocIdsAll.size())); + + long[] ids = combinedDocIdsAll.array(); + TermData[] data = metadata.array(); + + for (int i = 0; i < combinedDocIdsAll.size(); i++) { + if (data[i] != null) { + this.data.put(ids[i], data[i]); + } + } } - public long get(long combinedId) { - return data.getOrDefault(combinedId, 0); + public TermData get(long combinedId) { + return data.get(combinedId); } } } diff --git a/code/index/java/nu/marginalia/index/results/model/ids/CombinedDocIdList.java b/code/index/java/nu/marginalia/index/results/model/ids/CombinedDocIdList.java index 17bd17a1..43f5c575 100644 --- a/code/index/java/nu/marginalia/index/results/model/ids/CombinedDocIdList.java +++ b/code/index/java/nu/marginalia/index/results/model/ids/CombinedDocIdList.java @@ -15,6 +15,10 @@ import java.util.stream.LongStream; public final class CombinedDocIdList { private final long[] data; + public CombinedDocIdList(long... 
data) { + this.data = Arrays.copyOf(data, data.length); + } + public CombinedDocIdList(LongArrayList data) { this.data = data.toLongArray(); } @@ -28,6 +32,7 @@ public final class CombinedDocIdList { public int size() { return data.length; } + public long at(int i) { return data[i]; } public LongStream stream() { return Arrays.stream(data); diff --git a/code/index/java/nu/marginalia/index/results/model/ids/DocMetadataList.java b/code/index/java/nu/marginalia/index/results/model/ids/DocMetadataList.java deleted file mode 100644 index 0104f89c..00000000 --- a/code/index/java/nu/marginalia/index/results/model/ids/DocMetadataList.java +++ /dev/null @@ -1,45 +0,0 @@ -package nu.marginalia.index.results.model.ids; - -import it.unimi.dsi.fastutil.longs.LongArrayList; - -import java.util.Arrays; -import java.util.Objects; -import java.util.stream.LongStream; - -public final class DocMetadataList { - private final long[] array; - - public DocMetadataList(long[] array) { - this.array = array; - } - - public DocMetadataList(LongArrayList list) { - this(list.toLongArray()); - } - - public int size() { - return array.length; - } - - public LongStream stream() { - return LongStream.of(array); - } - - public long[] array() { - return array; - } - - @Override - public boolean equals(Object obj) { - if (obj == this) return true; - if (obj == null || obj.getClass() != this.getClass()) return false; - var that = (DocMetadataList) obj; - return Arrays.equals(this.array, that.array); - } - - @Override - public int hashCode() { - return Arrays.hashCode(array); - } - -} diff --git a/code/index/java/nu/marginalia/index/results/model/ids/TermIdList.java b/code/index/java/nu/marginalia/index/results/model/ids/TermIdList.java index 1428ec38..9737761c 100644 --- a/code/index/java/nu/marginalia/index/results/model/ids/TermIdList.java +++ b/code/index/java/nu/marginalia/index/results/model/ids/TermIdList.java @@ -43,7 +43,12 @@ public final class TermIdList { } public int indexOf(long id) { - return Arrays.binarySearch(array, id); + for (int i = 0; i < array.length; i++) { + if (array[i] == id) { + return i; + } + } + return -1; } @Override diff --git a/code/index/java/nu/marginalia/index/results/model/ids/TermMetadataList.java b/code/index/java/nu/marginalia/index/results/model/ids/TermMetadataList.java new file mode 100644 index 00000000..7342aaa6 --- /dev/null +++ b/code/index/java/nu/marginalia/index/results/model/ids/TermMetadataList.java @@ -0,0 +1,55 @@ +package nu.marginalia.index.results.model.ids; + +import nu.marginalia.index.positions.TermData; +import nu.marginalia.sequence.CodedSequence; + +import javax.annotation.Nullable; +import java.util.Arrays; + +public final class TermMetadataList { + private final TermData[] array; + + public TermMetadataList(TermData[] array) { + this.array = array; + } + + public int size() { + return array.length; + } + + public long flag(int i) { + if (array[i] == null) + return 0; + + return array[i].flags(); + } + + /** Returns the position data for the given document index, + * may be null if the term is not in the document + */ + @Nullable + public CodedSequence position(int i) { + if (array[i] == null) + return null; + + return array[i].positions(); + } + + public TermData[] array() { + return array; + } + + @Override + public boolean equals(Object obj) { + if (obj == this) return true; + if (obj == null || obj.getClass() != this.getClass()) return false; + var that = (TermMetadataList) obj; + return Arrays.equals(this.array, that.array); + } + + @Override + public int 
hashCode() { + return Arrays.hashCode(array); + } + +} diff --git a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java b/code/index/java/nu/marginalia/ranking/results/ResultValuator.java deleted file mode 100644 index 1e026b40..00000000 --- a/code/index/java/nu/marginalia/ranking/results/ResultValuator.java +++ /dev/null @@ -1,207 +0,0 @@ -package nu.marginalia.ranking.results; - -import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; -import nu.marginalia.api.searchquery.model.results.ResultRankingContext; -import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; -import nu.marginalia.api.searchquery.model.results.debug.ResultRankingDetails; -import nu.marginalia.api.searchquery.model.results.debug.ResultRankingInputs; -import nu.marginalia.api.searchquery.model.results.debug.ResultRankingOutputs; -import nu.marginalia.model.crawl.HtmlFeature; -import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.model.idx.DocumentFlags; -import nu.marginalia.model.idx.DocumentMetadata; -import nu.marginalia.ranking.results.factors.*; - -import com.google.inject.Inject; -import com.google.inject.Singleton; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import javax.annotation.Nullable; -import java.util.function.Consumer; - -@Singleton -public class ResultValuator { - final static double scalingFactor = 500.; - - private final TermCoherenceFactor termCoherenceFactor; - - private static final Logger logger = LoggerFactory.getLogger(ResultValuator.class); - - @Inject - public ResultValuator(TermCoherenceFactor termCoherenceFactor) { - this.termCoherenceFactor = termCoherenceFactor; - } - - public double calculateSearchResultValue(CompiledQueryLong wordMeta, - long documentMetadata, - int features, - int length, - ResultRankingContext ctx, - @Nullable Consumer detailsConsumer - ) - { - if (wordMeta.isEmpty()) - return Double.MAX_VALUE; - - if (length < 0) { - length = 5000; - } - - var rankingParams = ctx.params; - - int rank = DocumentMetadata.decodeRank(documentMetadata); - int asl = DocumentMetadata.decodeAvgSentenceLength(documentMetadata); - int quality = DocumentMetadata.decodeQuality(documentMetadata); - int size = DocumentMetadata.decodeSize(documentMetadata); - int flagsPenalty = flagsPenalty(features, documentMetadata & 0xFF, size); - int topology = DocumentMetadata.decodeTopology(documentMetadata); - int year = DocumentMetadata.decodeYear(documentMetadata); - - double averageSentenceLengthPenalty = (asl >= rankingParams.shortSentenceThreshold ? 0 : -rankingParams.shortSentencePenalty); - - final double qualityPenalty = calculateQualityPenalty(size, quality, rankingParams); - final double rankingBonus = (255. - rank) * rankingParams.domainRankBonus; - final double topologyBonus = Math.log(1 + topology); - final double documentLengthPenalty = length > rankingParams.shortDocumentThreshold ? 
0 : -rankingParams.shortDocumentPenalty; - final double temporalBias; - - if (rankingParams.temporalBias == ResultRankingParameters.TemporalBias.RECENT) { - temporalBias = - Math.abs(year - PubDate.MAX_YEAR) * rankingParams.temporalBiasWeight; - } else if (rankingParams.temporalBias == ResultRankingParameters.TemporalBias.OLD) { - temporalBias = - Math.abs(year - PubDate.MIN_YEAR) * rankingParams.temporalBiasWeight; - } else { - temporalBias = 0; - } - - double overallPart = averageSentenceLengthPenalty - + documentLengthPenalty - + qualityPenalty - + rankingBonus - + topologyBonus - + temporalBias - + flagsPenalty; - - double tcfOverlap = rankingParams.tcfOverlapWeight * termCoherenceFactor.calculateOverlap(wordMeta); - double tcfJaccard = rankingParams.tcfJaccardWeight * termCoherenceFactor.calculateAvgMutualJaccard(wordMeta, ctx); - - double bM25F = rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forRegular(rankingParams.fullParams, wordMeta.data, length, ctx)); - double bM25N = rankingParams.bm25NgramWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forNgrams(rankingParams.fullParams, wordMeta.data, length, ctx)); - double bM25P = rankingParams.bm25PrioWeight * wordMeta.root.visit(new Bm25PrioGraphVisitor(rankingParams.prioParams, wordMeta.data, ctx)); - - double overallPartPositive = Math.max(0, overallPart); - double overallPartNegative = -Math.min(0, overallPart); - - if (null != detailsConsumer) { - var details = new ResultRankingDetails( - new ResultRankingInputs( - rank, - asl, - quality, - size, - topology, - year, - DocumentFlags.decode(documentMetadata).stream().map(Enum::name).toList() - ), - new ResultRankingOutputs( - averageSentenceLengthPenalty, - qualityPenalty, - rankingBonus, - topologyBonus, - documentLengthPenalty, - temporalBias, - flagsPenalty, - overallPart, - tcfOverlap, - tcfJaccard, - bM25F, - bM25N, - bM25P) - ); - - detailsConsumer.accept(details); - } - - // Renormalize to 0...15, where 0 is the best possible score; - // this is a historical artifact of the original ranking function - double ret = normalize( - tcfOverlap + tcfJaccard - + bM25F + bM25P + bM25N - + overallPartPositive, - overallPartNegative); - - if (Double.isNaN(ret)) { - if (getClass().desiredAssertionStatus()) { - throw new IllegalStateException("NaN in result value calculation"); - } - - return Double.MAX_VALUE; - } - else { - return ret; - } - } - - private double calculateQualityPenalty(int size, int quality, ResultRankingParameters rankingParams) { - if (size < 400) { - if (quality < 5) - return 0; - return -quality * rankingParams.qualityPenalty; - } - else { - return -quality * rankingParams.qualityPenalty * 20; - } - } - - private int flagsPenalty(int featureFlags, long docFlags, int size) { - - // Short-circuit for index-service, which does not have the feature flags - if (featureFlags == 0) - return 0; - - double penalty = 0; - - boolean isForum = DocumentFlags.GeneratorForum.isPresent(docFlags); - boolean isWiki = DocumentFlags.GeneratorWiki.isPresent(docFlags); - boolean isDocs = DocumentFlags.GeneratorDocs.isPresent(docFlags); - - // Penalize large sites harder for any bullshit as it's a strong signal of a low quality site - double largeSiteFactor = 1.; - - if (!isForum && !isWiki && !isDocs && size > 400) { - // Long urls-that-look-like-this tend to be poor search results - if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.KEBAB_CASE_URL.getFeatureBit())) - penalty += 30.0; - else if (DocumentMetadata.hasFlags(featureFlags, 
HtmlFeature.LONG_URL.getFeatureBit())) - penalty += 30.; - else penalty += 5.; - - largeSiteFactor = 2; - } - - if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING_ADTECH.getFeatureBit())) - penalty += 7.5 * largeSiteFactor; - - if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.AFFILIATE_LINK.getFeatureBit())) - penalty += 5.0 * largeSiteFactor; - - if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.COOKIES.getFeatureBit())) - penalty += 2.5 * largeSiteFactor; - - if (DocumentMetadata.hasFlags(featureFlags, HtmlFeature.TRACKING.getFeatureBit())) - penalty += 2.5 * largeSiteFactor; - - if (isForum || isWiki) { - penalty = Math.min(0, penalty - 2); - } - - return (int) -penalty; - } - - public static double normalize(double value, double penalty) { - if (value < 0) - value = 0; - - return Math.sqrt((1.0 + scalingFactor + 10 * penalty) / (1.0 + value)); - } -} diff --git a/code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java b/code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java deleted file mode 100644 index 4105ed6b..00000000 --- a/code/index/java/nu/marginalia/ranking/results/factors/Bm25FullGraphVisitor.java +++ /dev/null @@ -1,104 +0,0 @@ -package nu.marginalia.ranking.results.factors; - -import nu.marginalia.api.searchquery.model.compiled.CqDataInt; -import nu.marginalia.api.searchquery.model.compiled.CqDataLong; -import nu.marginalia.api.searchquery.model.compiled.CqExpression; -import nu.marginalia.api.searchquery.model.results.Bm25Parameters; -import nu.marginalia.api.searchquery.model.results.ResultRankingContext; -import nu.marginalia.model.idx.WordMetadata; - -import java.util.BitSet; -import java.util.List; - -public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor { - private static final long AVG_LENGTH = 5000; - - private final CqDataLong wordMetaData; - private final CqDataInt frequencies; - private final Bm25Parameters bm25Parameters; - - private final int docCount; - private final int length; - - private final BitSet mask; - - private Bm25FullGraphVisitor(Bm25Parameters bm25Parameters, - CqDataLong wordMetaData, - int length, - BitSet mask, - ResultRankingContext ctx) { - this.length = length; - this.bm25Parameters = bm25Parameters; - this.docCount = ctx.termFreqDocCount(); - this.wordMetaData = wordMetaData; - this.frequencies = ctx.fullCounts; - this.mask = mask; - } - - public static Bm25FullGraphVisitor forRegular(Bm25Parameters bm25Parameters, - CqDataLong wordMetaData, - int length, - ResultRankingContext ctx) { - return new Bm25FullGraphVisitor(bm25Parameters, wordMetaData, length, ctx.regularMask, ctx); - } - - public static Bm25FullGraphVisitor forNgrams(Bm25Parameters bm25Parameters, - CqDataLong wordMetaData, - int length, - ResultRankingContext ctx) { - return new Bm25FullGraphVisitor(bm25Parameters, wordMetaData, length, ctx.ngramsMask, ctx); - } - - @Override - public double onAnd(List parts) { - double value = 0; - for (var part : parts) { - value += part.visit(this); - } - return value; - } - - @Override - public double onOr(List parts) { - double value = 0; - for (var part : parts) { - value = Math.max(value, part.visit(this)); - } - return value; - } - - @Override - public double onLeaf(int idx) { - if (!mask.get(idx)) { - return 0; - } - - double count = Long.bitCount(WordMetadata.decodePositions(wordMetaData.get(idx))); - - int freq = frequencies.get(idx); - - return invFreq(docCount, freq) * f(bm25Parameters.k(), bm25Parameters.b(), count, length); - } - - /** 
- * - * @param docCount Number of documents - * @param freq Number of matching documents - */ - private double invFreq(int docCount, int freq) { - return Math.log(1.0 + (docCount - freq + 0.5) / (freq + 0.5)); - } - - /** - * - * @param k determines the size of the impact of a single term - * @param b determines the magnitude of the length normalization - * @param count number of occurrences in the document - * @param length document length - */ - private double f(double k, double b, double count, int length) { - final double lengthRatio = (double) length / AVG_LENGTH; - - return (count * (k + 1)) / (count + k * (1 - b + b * lengthRatio)); - } -} diff --git a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java b/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java deleted file mode 100644 index 3bda0580..00000000 --- a/code/index/java/nu/marginalia/ranking/results/factors/TermCoherenceFactor.java +++ /dev/null @@ -1,82 +0,0 @@ -package nu.marginalia.ranking.results.factors; - -import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; -import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; -import nu.marginalia.api.searchquery.model.results.ResultRankingContext; -import nu.marginalia.model.idx.WordMetadata; - -/** Rewards documents where terms appear frequently within the same sentences - */ -public class TermCoherenceFactor { - - /** Calculate a factor that rewards the best total position overlap - * between the terms in the query. This is high when all the terms - * found in the same sentences. - */ - public double calculateOverlap(CompiledQueryLong wordMetadataQuery) { - if (wordMetadataQuery.size() < 2) - return 0; - - long mask = CompiledQueryAggregates.longBitmaskAggregate(wordMetadataQuery, - score -> score >>> WordMetadata.POSITIONS_SHIFT); - - return bitsSetFactor(mask); - } - - /** Calculate a factor that rewards the best average mutual Jaccard index - * between the terms in the query. This is high when the several terms are frequently - * found in the same sentences. 
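 * For example (illustrative masks, not taken from the patch): with sentence
 * position masks A = 0b0110_1100 and B = 0b0100_1010, the intersection has
 * two bits set and the union five, giving a Jaccard index of 2/5 = 0.4 for
 * that pair; the factor averages this over all pairs of regular terms that
 * are present in the document.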
- */ - public double calculateAvgMutualJaccard(CompiledQueryLong wordMetadataQuery, ResultRankingContext ctx) { - double sum = 0; - int cnt = 0; - - for (int i = 0; i < wordMetadataQuery.size(); i++) { - - // Skip terms that are not in the regular mask - if (!ctx.regularMask.get(i)) - continue; - - long imask = WordMetadata.decodePositions(wordMetadataQuery.at(i)); - - // Skip terms that are not in the document - if (imask == 0L) - continue; - - for (int j = i + 1; j < wordMetadataQuery.size(); j++) { - - // Skip terms that are not in the regular mask - if (!ctx.regularMask.get(j)) - continue; - - long jmask = WordMetadata.decodePositions(wordMetadataQuery.at(j)); - - // Skip terms that are not in the document - if (jmask == 0L) - continue; - - long quot = Long.bitCount(imask & jmask); - long rem = Long.bitCount(imask | jmask); - - // rem is always > 0 because imask and jmask are not both 0 - - sum += quot/(double) rem; - cnt++; - } - } - - if (cnt > 0) { - return sum / cnt; - } else { - return 0; - } - } - - double bitsSetFactor(long mask) { - final int bitsSetInMask = Long.bitCount(mask); - - return Math.pow(bitsSetInMask/(double) WordMetadata.POSITIONS_COUNT, 0.25); - } - - -} \ No newline at end of file diff --git a/code/index/query/java/nu/marginalia/index/query/EntrySource.java b/code/index/query/java/nu/marginalia/index/query/EntrySource.java index 4b0f6405..166440f0 100644 --- a/code/index/query/java/nu/marginalia/index/query/EntrySource.java +++ b/code/index/query/java/nu/marginalia/index/query/EntrySource.java @@ -6,6 +6,7 @@ import nu.marginalia.array.page.LongQueryBuffer; */ public interface EntrySource { /** Skip n entries. */ + @Deprecated void skip(int n); /** Fill the buffer with entries, updating its data and length appropriately. */ diff --git a/code/index/query/java/nu/marginalia/index/query/IndexQuery.java b/code/index/query/java/nu/marginalia/index/query/IndexQuery.java index 81136e91..52caed8e 100644 --- a/code/index/query/java/nu/marginalia/index/query/IndexQuery.java +++ b/code/index/query/java/nu/marginalia/index/query/IndexQuery.java @@ -73,8 +73,6 @@ public class IndexQuery { private boolean fillBuffer(LongQueryBuffer dest) { for (;;) { - dest.reset(); - EntrySource source = sources.get(si); source.read(dest); diff --git a/code/index/query/readme.md b/code/index/query/readme.md index 7386339c..b733b376 100644 --- a/code/index/query/readme.md +++ b/code/index/query/readme.md @@ -19,4 +19,4 @@ interfaces are implemented within the index-service module. * [index/index-reverse](../index-reverse) implements many of these interfaces. * [libraries/array](../../libraries/array) -* [libraries/array/.../LongQueryBuffer](../../libraries/array/java/nu/marginalia/array/buffer/LongQueryBuffer.java) \ No newline at end of file +* [libraries/array/.../LongQueryBuffer](../../libraries/array/java/nu/marginalia/array/page/LongQueryBuffer.java) \ No newline at end of file diff --git a/code/index/readme.md b/code/index/readme.md index 6a819e0f..b730ae75 100644 --- a/code/index/readme.md +++ b/code/index/readme.md @@ -32,7 +32,8 @@ results higher. 
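For reference, the BM25 building blocks used by the ranking code in this patch (the `invFreq` and `f` methods of the graph visitors above) can be sketched standalone as follows. The constant mirrors the visitors; the example inputs are invented:

```java
public class Bm25Sketch {
    static final long AVG_LENGTH = 5000; // same constant as the graph visitors

    // Inverse document frequency: docCount documents total, freq of them contain the term
    static double invFreq(int docCount, int freq) {
        return Math.log(1.0 + (docCount - freq + 0.5) / (freq + 0.5));
    }

    // Term-frequency saturation with document-length normalization
    static double f(double k, double b, double count, int length) {
        double lengthRatio = (double) length / AVG_LENGTH;
        return (count * (k + 1)) / (count + k * (1 - b + b * lengthRatio));
    }

    public static void main(String[] args) {
        // A term occurring 3 times in a 5000-word document, found in 100 of
        // 10_000 documents, with conventional parameters k = 1.2, b = 0.75:
        System.out.println(invFreq(10_000, 100) * f(1.2, 0.75, 3, 5000)); // ≈ 7.2
    }
}
```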
## Central Classes -* [ResultValuator](java/nu/marginalia/ranking/results/ResultValuator.java) +* [IndexResultRankingService](java/nu/marginalia/index/results/IndexResultRankingService.java) +* [IndexResultScoreCalculator](java/nu/marginalia/index/results/IndexResultScoreCalculator.java) --- diff --git a/code/index/test/nu/marginalia/index/CombinedIndexReaderTest.java b/code/index/test/nu/marginalia/index/CombinedIndexReaderTest.java new file mode 100644 index 00000000..c0f4bd8b --- /dev/null +++ b/code/index/test/nu/marginalia/index/CombinedIndexReaderTest.java @@ -0,0 +1,383 @@ +package nu.marginalia.index; + +import com.google.inject.Guice; +import com.google.inject.Inject; +import it.unimi.dsi.fastutil.ints.IntList; +import nu.marginalia.IndexLocations; +import nu.marginalia.array.page.LongQueryBuffer; +import nu.marginalia.hash.MurmurHash3_128; +import nu.marginalia.index.construction.DocIdRewriter; +import nu.marginalia.index.construction.full.FullIndexConstructor; +import nu.marginalia.index.domainrankings.DomainRankings; +import nu.marginalia.index.forward.ForwardIndexFileNames; +import nu.marginalia.index.forward.construction.ForwardIndexConverter; +import nu.marginalia.index.index.CombinedIndexReader; +import nu.marginalia.index.index.StatefulIndex; +import nu.marginalia.index.journal.IndexJournal; +import nu.marginalia.index.journal.IndexJournalSlopWriter; +import nu.marginalia.index.positions.TermData; +import nu.marginalia.index.results.model.ids.CombinedDocIdList; +import nu.marginalia.linkdb.docs.DocumentDbReader; +import nu.marginalia.linkdb.docs.DocumentDbWriter; +import nu.marginalia.linkdb.model.DocdbUrlDetail; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.id.UrlIdCodec; +import nu.marginalia.model.idx.DocumentFlags; +import nu.marginalia.model.idx.DocumentMetadata; +import nu.marginalia.model.idx.WordFlags; +import nu.marginalia.model.processed.SlopDocumentRecord; +import nu.marginalia.process.control.FakeProcessHeartbeat; +import nu.marginalia.process.control.ProcessHeartbeat; +import nu.marginalia.sequence.VarintCodedSequence; +import nu.marginalia.service.server.Initialization; +import nu.marginalia.storage.FileStorageService; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.parallel.Execution; + +import java.io.IOException; +import java.lang.foreign.Arena; +import java.net.URISyntaxException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.sql.SQLException; +import java.util.*; + +import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD; + +@Execution(SAME_THREAD) +public class CombinedIndexReaderTest { + + @Inject + Initialization initialization; + + IndexQueryServiceIntegrationTestModule testModule; + + @Inject + StatefulIndex statefulIndex; + + @Inject + IndexJournalSlopWriter indexJournalWriter; + + @Inject + FileStorageService fileStorageService; + + @Inject + DomainRankings domainRankings; + + @Inject + ProcessHeartbeat processHeartbeat; + @Inject + DocumentDbReader documentDbReader; + + @Inject + IndexFactory indexFactory; + + @BeforeEach + public void setUp() throws IOException { + + testModule = new IndexQueryServiceIntegrationTestModule(); + Guice.createInjector(testModule).injectMembers(this); + + initialization.setReady(); + } + + @AfterEach + public void tearDown() throws 
IOException { + testModule.cleanUp(); + } + + private final MockDocumentMeta anyMetadata = new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))); + + @Test + public void testSimpleRetrieval() throws Exception { + new MockData().add( + d(1, 1), + anyMetadata, + w("hello", WordFlags.Title, 33, 55), + w("world", WordFlags.Subjects, 34) + ).load(); + + var reader = indexFactory.getCombinedIndexReader(); + var query = reader.findFullWord(kw("hello")).build(); + + var buffer = new LongQueryBuffer(32); + query.getMoreResults(buffer); + + assertEquals( + List.of(d(1, 1)), + decode(buffer) + ); + + var helloMeta = td(reader, kw("hello"), d(1, 1)); + assertEquals(helloMeta.flags(), WordFlags.Title.asBit()); + assertEquals(IntList.of(33, 55), helloMeta.positions().values()); + + var worldMeta = td(reader, kw("world"), d(1, 1)); + assertEquals(worldMeta.flags(), WordFlags.Subjects.asBit()); + assertEquals(IntList.of(34), worldMeta.positions().values()); + } + + TermData td(CombinedIndexReader reader, long wordId, MockDataDocument docId) { + return (reader.getTermMetadata(Arena.global(), wordId, new CombinedDocIdList(docId.docId())).array())[0]; + } + + + @Test + public void testUnionRetrieval() throws Exception { + new MockData() + .add( + d(1, 1), + anyMetadata, + w("hello", WordFlags.Title), + w("world", WordFlags.Title) + ) + .add( + d(1, 2), + anyMetadata, + w("world", WordFlags.Title) + ) + .add( + d(1, 3), + anyMetadata, + w("world", WordFlags.Title) + ) + .add( + d(2, 4), + anyMetadata, + w("hello", WordFlags.Title), + w("world", WordFlags.Title) + ) + .load(); + + var reader = indexFactory.getCombinedIndexReader(); + var query = reader + .findFullWord(kw("hello")) + .also(kw("world")) + .build(); + + var buffer = new LongQueryBuffer(32); + query.getMoreResults(buffer); + + assertEquals( + List.of(d(1, 1), d(2, 4)), + decode(buffer) + ); + } + + @Test + public void testNotFilterRetrieval() throws Exception { + new MockData() + .add( + d(1, 1), + anyMetadata, + w("hello", WordFlags.Title), + w("world", WordFlags.Title), + w("goodbye", WordFlags.Title) + ) + .add( + d(1, 2), + anyMetadata, + w("world", WordFlags.Title) + ) + .add( + d(1, 3), + anyMetadata, + w("world", WordFlags.Title) + ) + .add( + d(2, 4), + anyMetadata, + w("hello", WordFlags.Title), + w("world", WordFlags.Title) + ) + .load(); + + var reader = indexFactory.getCombinedIndexReader(); + var query = reader.findFullWord(kw("hello")) + .also(kw("world")) + .not(kw("goodbye")) + .build(); + + var buffer = new LongQueryBuffer(32); + query.getMoreResults(buffer); + + assertEquals( + List.of(d(2, 4)), + decode(buffer) + ); + } + + List decode(LongQueryBuffer buffer) { + List result = new ArrayList<>(); + for (int i = 0; i < buffer.size(); i++) { + result.add(new MockDataDocument(buffer.data.get(i))); + } + return result; + } + + private MockDataDocument d(int domainId, int ordinal) { + return new MockDataDocument(domainId, ordinal); + } + + private void constructIndex() throws IOException { + createForwardIndex(); + createFullReverseIndex(); + createPrioReverseIndex(); + } + + private void createFullReverseIndex() throws IOException { + + Path outputFileDocs = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.NEXT); + Path outputFileWords = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.WORDS, 
ReverseIndexFullFileNames.FileVersion.NEXT);
+        Path outputFilePositions = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.NEXT);
+        Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
+        Path tmpDir = workDir.resolve("tmp");
+
+        if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
+
+        var constructor = new FullIndexConstructor(
+                outputFileDocs,
+                outputFileWords,
+                outputFilePositions,
+                DocIdRewriter.identity(),
+                tmpDir);
+
+        constructor.createReverseIndex(new FakeProcessHeartbeat(), "name", workDir);
+    }
+
+    private void createForwardIndex() throws IOException {
+
+        Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
+        Path outputFileDocsId = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT);
+        Path outputFileSpansData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.NEXT);
+        Path outputFileDocsData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT);
+
+        ForwardIndexConverter converter = new ForwardIndexConverter(processHeartbeat,
+                outputFileDocsId,
+                outputFileDocsData,
+                outputFileSpansData,
+                IndexJournal.findJournal(workDir).orElseThrow(),
+                domainRankings
+        );
+
+        converter.convert();
+    }
+
+    MurmurHash3_128 hasher = new MurmurHash3_128();
+
+    long kw(String s) {
+        return hasher.hashKeyword(s);
+    }
+
+    class MockData {
+        private final Map<Long, List<MockDataKeyword>> allData = new HashMap<>();
+        private final Map<Long, MockDocumentMeta> metaByDoc = new HashMap<>();
+
+        public MockData add(MockDataDocument document,
+                            MockDocumentMeta meta,
+                            MockDataKeyword...
words) + { + long id = UrlIdCodec.encodeId(document.domainId, document.ordinal); + + allData.computeIfAbsent(id, l -> new ArrayList<>()).addAll(List.of(words)); + metaByDoc.put(id, meta); + + return this; + } + + void load() throws IOException, SQLException, URISyntaxException { + allData.forEach((doc, words) -> { + + var meta = metaByDoc.get(doc); + + List keywords = words.stream().map(w -> w.keyword).toList(); + byte[] metadata = new byte[words.size()]; + for (int i = 0; i < words.size(); i++) { + metadata[i] = words.get(i).termMetadata; + } + var positions = words.stream().map(w -> w.positions).map(pos -> VarintCodedSequence.generate(pos.toIntArray())).toList(); + + indexJournalWriter.put(doc, + new SlopDocumentRecord.KeywordsProjection( + "", + -1, + meta.features, + meta.documentMetadata.encode(), + 100, + keywords, + metadata, + positions, + new byte[0], + List.of() + )); + }); + + var linkdbWriter = new DocumentDbWriter( + IndexLocations.getLinkdbLivePath(fileStorageService).resolve(DOCDB_FILE_NAME) + ); + for (Long key : allData.keySet()) { + linkdbWriter.add(new DocdbUrlDetail( + key, + new EdgeUrl("https://www.example.com"), + "test", + "test", + 0., + "HTML5", + 0, + null, + 0, + 5 + )); + } + linkdbWriter.close(); + + indexJournalWriter.close(); + constructIndex(); + documentDbReader.reconnect(); + statefulIndex.switchIndex(); + } + } + + record MockDataDocument(int domainId, int ordinal) { + public MockDataDocument(long encodedId) { + this(UrlIdCodec.getDomainId(encodedId), UrlIdCodec.getDocumentOrdinal(encodedId)); + } + + public long docId() { + return UrlIdCodec.encodeId(domainId, ordinal); + } + + } + record MockDocumentMeta(int features, DocumentMetadata documentMetadata) {} + record MockDataKeyword(String keyword, byte termMetadata, IntList positions) {} + + MockDataKeyword w(String keyword, WordFlags flags, int... 
positions) { + return new MockDataKeyword(keyword, flags.asBit(), IntList.of(positions)); + + } +} diff --git a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java index 7b0a6a24..a2b09d12 100644 --- a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java +++ b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java @@ -4,22 +4,20 @@ import com.google.inject.Guice; import com.google.inject.Inject; import lombok.SneakyThrows; import nu.marginalia.IndexLocations; -import nu.marginalia.api.searchquery.model.query.SearchSpecification; +import nu.marginalia.api.searchquery.RpcDecoratedResultItem; +import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint; import nu.marginalia.api.searchquery.model.query.SearchQuery; +import nu.marginalia.api.searchquery.model.query.SearchSpecification; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; -import nu.marginalia.index.index.StatefulIndex; -import nu.marginalia.process.control.FakeProcessHeartbeat; -import nu.marginalia.process.control.ProcessHeartbeat; -import nu.marginalia.storage.FileStorageService; -import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.index.construction.DocIdRewriter; -import nu.marginalia.index.construction.ReverseIndexConstructor; -import nu.marginalia.index.forward.ForwardIndexConverter; +import nu.marginalia.index.construction.full.FullIndexConstructor; +import nu.marginalia.index.construction.prio.PrioIndexConstructor; +import nu.marginalia.index.domainrankings.DomainRankings; import nu.marginalia.index.forward.ForwardIndexFileNames; -import nu.marginalia.index.journal.model.IndexJournalEntryData; -import nu.marginalia.index.journal.model.IndexJournalEntryHeader; -import nu.marginalia.index.journal.reader.IndexJournalReader; -import nu.marginalia.index.journal.writer.IndexJournalWriter; +import nu.marginalia.index.forward.construction.ForwardIndexConverter; +import nu.marginalia.index.index.StatefulIndex; +import nu.marginalia.index.journal.IndexJournal; +import nu.marginalia.index.journal.IndexJournalSlopWriter; import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.SpecificationLimit; @@ -28,12 +26,15 @@ import nu.marginalia.linkdb.docs.DocumentDbWriter; import nu.marginalia.linkdb.model.DocdbUrlDetail; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.id.UrlIdCodec; -import nu.marginalia.model.idx.WordFlags; import nu.marginalia.model.idx.DocumentMetadata; -import nu.marginalia.model.idx.WordMetadata; -import nu.marginalia.index.domainrankings.DomainRankings; +import nu.marginalia.model.idx.WordFlags; +import nu.marginalia.model.processed.SlopDocumentRecord; +import nu.marginalia.process.control.FakeProcessHeartbeat; +import nu.marginalia.process.control.ProcessHeartbeat; +import nu.marginalia.sequence.VarintCodedSequence; import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.service.server.Initialization; +import nu.marginalia.storage.FileStorageService; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeEach; @@ -41,6 +42,7 @@ import org.junit.jupiter.api.Test; import org.junit.jupiter.api.parallel.Execution; import java.io.IOException; +import java.nio.ByteBuffer; import java.nio.file.Files; import java.nio.file.Path; import 
java.sql.SQLException;
@@ -68,7 +70,7 @@ public class IndexQueryServiceIntegrationSmokeTest {
     ServiceHeartbeat heartbeat;
 
     @Inject
-    IndexJournalWriter indexJournalWriter;
+    IndexJournalSlopWriter indexJournalWriter;
 
     @Inject
     FileStorageService fileStorageService;
@@ -116,28 +118,93 @@ public class IndexQueryServiceIntegrationSmokeTest {
                 SearchSpecification.builder()
                         .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
                         .queryStrategy(QueryStrategy.SENTENCE)
-                        .year(SpecificationLimit.none())
-                        .quality(SpecificationLimit.none())
-                        .size(SpecificationLimit.none())
-                        .rank(SpecificationLimit.none())
                         .rankingParams(ResultRankingParameters.sensibleDefaults())
                         .domains(new ArrayList<>())
                         .searchSetIdentifier("NONE")
-                        .query(new SearchQuery(
-                                "2 3 5",
-                                List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
-                                Collections.emptyList())).build());
+                        .query(
+                                SearchQuery.builder()
+                                        .compiledQuery("2 3 5")
+                                        .include("3", "5", "2")
+                                        .exclude("4")
+                                        .build()
+                        ).build());
 
-        int[] idxes = new int[] { 30, 510, 90, 150, 210, 270, 330, 390, 450 };
-        long[] ids = IntStream.of(idxes).mapToLong(this::fullId).toArray();
-        long[] actual = rsp.results
+        long[] actual = rsp
                 .stream()
-                .mapToLong(i -> i.rawIndexResult.getDocumentId())
+                .sorted(Comparator.comparing(RpcDecoratedResultItem::getRankingScore))
+                .mapToLong(i -> i.getRawItem().getCombinedId())
+                .map(UrlIdCodec::getDocumentOrdinal)
                 .toArray();
 
         System.out.println(Arrays.toString(actual));
-        System.out.println(Arrays.toString(ids));
-        Assertions.assertArrayEquals(ids, actual);
+
+        for (long id : actual) {
+            Assertions.assertTrue((id % 2) == 0,
+                    "Expected all results to contain the factor 2");
+            Assertions.assertTrue((id % 3) == 0,
+                    "Expected all results to contain the factor 3");
+            Assertions.assertTrue((id % 5) == 0,
+                    "Expected all results to contain the factor 5");
+        }
+
+        Assertions.assertEquals(9, actual.length,
+                "Expected 9 results");
+        Assertions.assertEquals(9,
+                Arrays.stream(actual).boxed().distinct().count(),
+                "Results not unique");
+    }
+
+    @Test
+    public void testSimple() throws Exception {
+        var linkdbWriter = new DocumentDbWriter(
+                IndexLocations.getLinkdbLivePath(fileStorageService)
+                        .resolve(DOCDB_FILE_NAME)
+        );
+        for (int i = 1; i < 512; i++) {
+            loadData(linkdbWriter, i);
+        }
+        linkdbWriter.close();
+        documentDbReader.reconnect();
+
+        indexJournalWriter.close();
+        constructIndex();
+        statefulIndex.switchIndex();
+
+        var rsp = queryService.justQuery(
+                SearchSpecification.builder()
+                        .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
+                        .queryStrategy(QueryStrategy.SENTENCE)
+                        .rankingParams(ResultRankingParameters.sensibleDefaults())
+                        .domains(new ArrayList<>())
+                        .searchSetIdentifier("NONE")
+                        .query(
+                                SearchQuery.builder()
+                                        .compiledQuery("2")
+                                        .include("2")
+                                        .phraseConstraint(new SearchPhraseConstraint.Full("2"))
+                                        .build()
+                        ).build()
+        );
+
+        long[] actual = rsp
+                .stream()
+                .sorted(Comparator.comparing(RpcDecoratedResultItem::getRankingScore))
+                .mapToLong(i -> i.getRawItem().getCombinedId())
+                .map(UrlIdCodec::getDocumentOrdinal)
+                .toArray();
+
+        System.out.println(Arrays.toString(actual));
+
+        for (long id : actual) {
+            Assertions.assertTrue((id % 2) == 0,
+                    "Expected all results to contain the factor 2");
+        }
+
+        Assertions.assertEquals(10, actual.length,
+                "Expected 10 results");
+        Assertions.assertEquals(10,
+                Arrays.stream(actual).boxed().distinct().count(),
+                "Results not unique");
     }
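A note for readers tracing the assertions above: the index keys documents by a single combined id. A minimal round-trip sketch, using only the `UrlIdCodec` methods these tests already call (the concrete numbers are made up):

```java
import nu.marginalia.model.id.UrlIdCodec;

// Pack a domain id and a document ordinal into one combined id, then
// recover both halves, mirroring the .map(UrlIdCodec::getDocumentOrdinal)
// step used when decoding results above.
long docId = UrlIdCodec.encodeId(32, 510);
assert UrlIdCodec.getDomainId(docId) == 32;
assert UrlIdCodec.getDocumentOrdinal(docId) == 510;
```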
 
     @Test
@@ -160,23 +227,40 @@ public class IndexQueryServiceIntegrationSmokeTest {
         var rsp = queryService.justQuery(
                 SearchSpecification.builder()
                         .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
-                        .year(SpecificationLimit.none())
-                        .quality(SpecificationLimit.none())
-                        .size(SpecificationLimit.none())
-                        .rank(SpecificationLimit.none())
                         .rankingParams(ResultRankingParameters.sensibleDefaults())
                         .queryStrategy(QueryStrategy.SENTENCE)
                         .domains(List.of(2))
-                        .query(new SearchQuery(
-                                "2 3 5",
-                                List.of("3", "5", "2"),
-                                List.of("4"),
-                                Collections.emptyList(),
-                                Collections.emptyList(),
-                                Collections.emptyList())).build());
-        int[] idxes = new int[] { 210, 270 };
-        long[] ids = IntStream.of(idxes).mapToLong(id -> UrlIdCodec.encodeId(id/100, id)).toArray();
-        long[] actual = rsp.results.stream().mapToLong(i -> i.rawIndexResult.getDocumentId()).toArray();
+                        .query(
+                                SearchQuery.builder()
+                                        .compiledQuery("2 3 5")
+                                        .include("3", "5", "2")
+                                        .exclude("4")
+                                        .phraseConstraint(new SearchPhraseConstraint.Full("2", "3", "5"))
+                                        .build()
+                        ).build());
+        long[] ids = new long[] { 210, 270 };
+        long[] actual = rsp.stream()
+                .sorted(Comparator.comparing(RpcDecoratedResultItem::getRankingScore))
+                .mapToLong(i -> i.getRawItem().getCombinedId())
+                .map(UrlIdCodec::getDocumentOrdinal)
+                .toArray();
+
+        for (long id : actual) {
+            System.out.println("Considering " + id);
+            Assertions.assertTrue((id % 2) == 0,
+                    "Expected all results to contain the factor 2");
+            Assertions.assertTrue((id % 3) == 0,
+                    "Expected all results to contain the factor 3");
+            Assertions.assertTrue((id % 5) == 0,
+                    "Expected all results to contain the factor 5");
+            Assertions.assertTrue((id/100) == 2);
+        }
+
+        Assertions.assertEquals(2, actual.length,
+                "Expected 2 results");
+        Assertions.assertEquals(2,
+                Arrays.stream(actual).boxed().distinct().count(),
+                "Results not unique");
 
         Assertions.assertArrayEquals(ids, actual);
     }
@@ -200,26 +284,27 @@ public class IndexQueryServiceIntegrationSmokeTest {
         var rsp = queryService.justQuery(
                 SearchSpecification.builder()
                         .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
-                        .quality(SpecificationLimit.none())
                         .year(SpecificationLimit.equals(1998))
-                        .size(SpecificationLimit.none())
-                        .rank(SpecificationLimit.none())
                         .queryStrategy(QueryStrategy.SENTENCE)
                         .searchSetIdentifier("NONE")
                         .rankingParams(ResultRankingParameters.sensibleDefaults())
                         .query(
-                                new SearchQuery("4", List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), Collections.emptyList())
+                                SearchQuery.builder()
+                                        .compiledQuery("4")
+                                        .include("4")
+                                        .phraseConstraint(new SearchPhraseConstraint.Full("4"))
+                                        .build()
                         ).build());
 
         Set<Integer> years = new HashSet<>();
 
-        for (var res : rsp.results) {
-            years.add(DocumentMetadata.decodeYear(res.rawIndexResult.encodedDocMetadata));
+        for (var res : rsp) {
+            years.add(DocumentMetadata.decodeYear(res.getRawItem().getEncodedDocMetadata()));
        }
 
         assertEquals(Set.of(1998), years);
-        assertEquals(rsp.results.size(), 10);
+        assertEquals(10, rsp.size());
     }
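The year filter in the hunk above round-trips a publication year through the packed document metadata. A small sketch under the same conventions the test fixtures use (constructor arguments as they appear in the tests; treat the exact bit packing as an implementation detail):

```java
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;

import java.util.EnumSet;

// Encode a document dated 1998, then decode the year back out,
// which is what the year assertion above relies on.
long encoded = new DocumentMetadata(2, PubDate.toYearByte(1998), 14,
        EnumSet.noneOf(DocumentFlags.class)).encode();

assert DocumentMetadata.decodeYear(encoded) == 1998;
```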
ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.NEXT); + Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService); Path tmpDir = workDir.resolve("tmp"); if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir); - new ReverseIndexConstructor(outputFileDocs, outputFileWords, IndexJournalReader::singleFile, DocIdRewriter.identity(), tmpDir) - .createReverseIndex(new FakeProcessHeartbeat(), "name", workDir); + var constructor = new FullIndexConstructor( + outputFileDocs, + outputFileWords, + outputFilePositions, + DocIdRewriter.identity(), + tmpDir); + + constructor.createReverseIndex(new FakeProcessHeartbeat(), "name", workDir); } private void createPrioReverseIndex() throws SQLException, IOException { Path outputFileDocs = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.NEXT); Path outputFileWords = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.NEXT); + Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService); Path tmpDir = workDir.resolve("tmp"); if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir); - new ReverseIndexConstructor(outputFileDocs, outputFileWords, IndexJournalReader::singleFile, DocIdRewriter.identity(), tmpDir) - .createReverseIndex(new FakeProcessHeartbeat(), "name", workDir); + var constructor = new PrioIndexConstructor( + outputFileDocs, + outputFileWords, + DocIdRewriter.identity(), + tmpDir); + + constructor.createReverseIndex(new FakeProcessHeartbeat(), "name", workDir); } private void createForwardIndex() throws SQLException, IOException { Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService); Path outputFileDocsId = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT); + Path outputFileSpansData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.NEXT); Path outputFileDocsData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT); ForwardIndexConverter converter = new ForwardIndexConverter(processHeartbeat, - IndexJournalReader.paging(workDir), outputFileDocsId, outputFileDocsData, + outputFileSpansData, + IndexJournal.findJournal(workDir).orElseThrow(), domainRankings ); @@ -277,7 +378,6 @@ public class IndexQueryServiceIntegrationSmokeTest { return UrlIdCodec.encodeId((32 - (id % 32)), id); } - MurmurHash3_128 hasher = new MurmurHash3_128(); @SneakyThrows public void loadData(DocumentDbWriter ldbw, int id) { int[] factors = IntStream @@ -285,43 +385,82 @@ public class IndexQueryServiceIntegrationSmokeTest { .filter(v -> (id % v) == 0) .toArray(); + System.out.println("id:" + id + " factors: " + Arrays.toString(factors)); + long fullId = fullId(id); - var header = new IndexJournalEntryHeader(factors.length, 0, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode()); - - long[] data = new long[factors.length * 2]; - for (int i = 0; i < factors.length; i++) { - 
data[2 * i] = hasher.hashNearlyASCII(Integer.toString(factors[i])); - data[2 * i + 1] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode(); - } - ldbw.add(new DocdbUrlDetail( fullId, new EdgeUrl("https://www.example.com/"+id), - "test", "test", 0., "HTML5", 0, null, 0, 10 + "test", "test", 0., "HTML5", 0, null, fullId, 10 )); - indexJournalWriter.put(header, new IndexJournalEntryData(data)); + List keywords = IntStream.of(factors).mapToObj(Integer::toString).toList(); + byte[] metadata = new byte[factors.length]; + for (int i = 0; i < factors.length; i++) { + metadata[i] = WordFlags.Title.asBit(); + } + + List positions = new ArrayList<>(); + + ByteBuffer wa = ByteBuffer.allocate(32); + for (int i = 0; i < factors.length; i++) { + positions.add(VarintCodedSequence.generate(factors)); + } + + indexJournalWriter.put(fullId, + new SlopDocumentRecord.KeywordsProjection( + "", + -1, + 0, + new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode(), + 100, + keywords, + metadata, + positions, + new byte[0], + List.of() + )); + } @SneakyThrows public void loadDataWithDomain(DocumentDbWriter ldbw, int domain, int id) { int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray(); long fullId = UrlIdCodec.encodeId(domain, id); - var header = new IndexJournalEntryHeader(factors.length, 0, fullId, DocumentMetadata.defaultValue()); - - long[] data = new long[factors.length*2]; - for (int i = 0; i < factors.length; i++) { - data[2*i] = hasher.hashNearlyASCII(Integer.toString(factors[i])); - data[2*i + 1] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode(); - } ldbw.add(new DocdbUrlDetail( fullId, new EdgeUrl("https://www.example.com/"+id), - "test", "test", 0., "HTML5", 0, null, 0, 10 + "test", "test", 0., "HTML5", 0, null, id, 10 )); - indexJournalWriter.put(header, new IndexJournalEntryData(data)); + List keywords = IntStream.of(factors).mapToObj(Integer::toString).toList(); + byte[] metadata = new byte[factors.length]; + for (int i = 0; i < factors.length; i++) { + metadata[i] = WordFlags.Title.asBit(); + } + + List positions = new ArrayList<>(); + + ByteBuffer wa = ByteBuffer.allocate(32); + for (int i = 0; i < factors.length; i++) { + positions.add(VarintCodedSequence.generate(i + 1)); + } + + indexJournalWriter.put(fullId, + new SlopDocumentRecord.KeywordsProjection( + "", + -1, + 0, + new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode(), + 100, + keywords, + metadata, + positions, + new byte[0], + List.of() + )); + } } diff --git a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java index e29f8751..87f53cf3 100644 --- a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java +++ b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java @@ -2,21 +2,22 @@ package nu.marginalia.index; import com.google.inject.Guice; import com.google.inject.Inject; +import it.unimi.dsi.fastutil.ints.IntList; import nu.marginalia.IndexLocations; -import nu.marginalia.api.searchquery.model.query.SearchSpecification; +import nu.marginalia.api.searchquery.model.query.SearchPhraseConstraint; import nu.marginalia.api.searchquery.model.query.SearchQuery; +import nu.marginalia.api.searchquery.model.query.SearchSpecification; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; -import nu.marginalia.index.index.StatefulIndex; -import nu.marginalia.storage.FileStorageService; import 
nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.index.construction.DocIdRewriter; -import nu.marginalia.index.construction.ReverseIndexConstructor; -import nu.marginalia.index.forward.ForwardIndexConverter; +import nu.marginalia.index.construction.full.FullIndexConstructor; +import nu.marginalia.index.construction.prio.PrioIndexConstructor; +import nu.marginalia.index.domainrankings.DomainRankings; import nu.marginalia.index.forward.ForwardIndexFileNames; -import nu.marginalia.index.journal.model.IndexJournalEntryData; -import nu.marginalia.index.journal.model.IndexJournalEntryHeader; -import nu.marginalia.index.journal.reader.IndexJournalReader; -import nu.marginalia.index.journal.writer.IndexJournalWriter; +import nu.marginalia.index.forward.construction.ForwardIndexConverter; +import nu.marginalia.index.index.StatefulIndex; +import nu.marginalia.index.journal.IndexJournal; +import nu.marginalia.index.journal.IndexJournalSlopWriter; import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.SpecificationLimit; @@ -29,12 +30,13 @@ import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.WordFlags; -import nu.marginalia.model.idx.WordMetadata; +import nu.marginalia.model.processed.SlopDocumentRecord; import nu.marginalia.process.control.FakeProcessHeartbeat; import nu.marginalia.process.control.ProcessHeartbeat; -import nu.marginalia.index.domainrankings.DomainRankings; +import nu.marginalia.sequence.VarintCodedSequence; import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.service.server.Initialization; +import nu.marginalia.storage.FileStorageService; import org.apache.logging.log4j.util.Strings; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; @@ -71,7 +73,7 @@ public class IndexQueryServiceIntegrationTest { ServiceHeartbeat heartbeat; @Inject - IndexJournalWriter indexJournalWriter; + IndexJournalSlopWriter indexJournalWriter; @Inject FileStorageService fileStorageService; @@ -172,37 +174,13 @@ public class IndexQueryServiceIntegrationTest { List.of(), List.of(), List.of(), - List.of(List.of("missing", "hello")) + List.of(SearchPhraseConstraint.mandatory(List.of("missing", "hello"))) ))); executeSearch(queryMissingCoherence) .expectCount(0); } - @Test - public void testPositions() throws Exception { - - // Test position rules - new MockData() - .add( // Case 1: Both words have a position set, should be considered - d(1, 1), - new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))), - w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()), - w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()) - ).add( // Case 2: Only one of the words have a position set, should not be considered - d(2, 2), - new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))), - w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()), - w("world", new WordMetadata(0L, EnumSet.noneOf(WordFlags.class)).encode()) - ).load(); - - - var query = basicQuery(builder -> builder.query(justInclude("hello", "world"))); - - executeSearch(query) - .expectDocumentsInOrder(d(1,1)); - } - @Test public void testYear() throws Exception { @@ -211,19 +189,19 @@ public class IndexQueryServiceIntegrationTest { .add( // Case 1: Document is dated 1999 
d(1, 1), new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(1999), 14, EnumSet.noneOf(DocumentFlags.class))), - w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()), - w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()) + w("hello", EnumSet.noneOf(WordFlags.class), 1), + w("world", EnumSet.noneOf(WordFlags.class), 1) ).add( // Case 2: Document is dated 2000 d(2, 2), new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(2000), 14, EnumSet.noneOf(DocumentFlags.class))), - w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()), - w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()) + w("hello", EnumSet.noneOf(WordFlags.class), 1), + w("world", EnumSet.noneOf(WordFlags.class), 1) ) .add( // Case 2: Document is dated 2001 d(3, 3), new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(2001), 14, EnumSet.noneOf(DocumentFlags.class))), - w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()), - w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()) + w("hello", EnumSet.noneOf(WordFlags.class), 1), + w("world", EnumSet.noneOf(WordFlags.class), 1) ) .load(); @@ -266,26 +244,26 @@ public class IndexQueryServiceIntegrationTest { .add( d(1, 1), new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(1999), 14, EnumSet.noneOf(DocumentFlags.class))), - w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()), - w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()) + w("hello", EnumSet.noneOf(WordFlags.class), 1), + w("world", EnumSet.noneOf(WordFlags.class), 1) ).add( d(1, 2), new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(2000), 14, EnumSet.noneOf(DocumentFlags.class))), - w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()), - w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()) + w("hello", EnumSet.noneOf(WordFlags.class), 1), + w("world", EnumSet.noneOf(WordFlags.class), 1) ) // docs from domain 2 .add( d(2, 1), new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(2001), 14, EnumSet.noneOf(DocumentFlags.class))), - w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()), - w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()) + w("hello", EnumSet.noneOf(WordFlags.class), 1), + w("world", EnumSet.noneOf(WordFlags.class), 1) ) .add( d(2, 2), new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(2001), 14, EnumSet.noneOf(DocumentFlags.class))), - w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()), - w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()) + w("hello", EnumSet.noneOf(WordFlags.class), 1), + w("world", EnumSet.noneOf(WordFlags.class), 1) ) .load(); @@ -319,13 +297,13 @@ public class IndexQueryServiceIntegrationTest { .add( // Case 1: The required include is present, exclude is absent; should be a result d(1, 1), new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))), - w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()), - w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()) + w("hello", EnumSet.noneOf(WordFlags.class), 1), + w("world", EnumSet.noneOf(WordFlags.class), 1) ).add( // Case 2: The required include is present, excluded term is absent; should not be a result d(2, 2), new MockDocumentMeta(0, new 
DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))), - w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()), - w("my_darling", new WordMetadata(0L, EnumSet.noneOf(WordFlags.class)).encode()) + w("hello", EnumSet.noneOf(WordFlags.class), 1), + w("my_darling", EnumSet.noneOf(WordFlags.class), 1) ).load(); var query = basicQuery(builder -> @@ -368,8 +346,8 @@ public class IndexQueryServiceIntegrationTest { System.out.println(rsp); - for (var result : rsp.results) { - long docId = result.rawIndexResult.getDocumentId(); + for (var result : rsp) { + long docId = result.getRawItem().getCombinedId(); actual.add(new MockDataDocument(UrlIdCodec.getDomainId(docId), UrlIdCodec.getDocumentOrdinal(docId))); } @@ -386,14 +364,14 @@ public class IndexQueryServiceIntegrationTest { .add( // Case 1: Both positions overlap; should be included d(1, 1), new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))), - w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()), - w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()) + w("hello", EnumSet.noneOf(WordFlags.class), 1), + w("world", EnumSet.noneOf(WordFlags.class), 1) ) .add( // Case 2: Positions do not overlap, do not include d(2, 2), new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))), - w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()), - w("world", new WordMetadata(2L, EnumSet.noneOf(WordFlags.class)).encode()) + w("hello", EnumSet.noneOf(WordFlags.class), 1), + w("world", EnumSet.noneOf(WordFlags.class), 2) ) .load(); @@ -403,9 +381,9 @@ public class IndexQueryServiceIntegrationTest { includeAndCohere("hello", "world") ))); - assertEquals(1, rsp.results.size()); - assertEquals(d(1,1).docId(), - rsp.results.get(0).rawIndexResult.getDocumentId()); + assertEquals(1, rsp.size()); + assertEquals(d(2,2).docId(), + rsp.get(0).getRawItem().getCombinedId()); } SearchSpecification basicQuery(Function mutator) @@ -464,7 +442,7 @@ public class IndexQueryServiceIntegrationTest { List.of(), List.of(), List.of(), - List.of(List.of(includes)) + List.of(SearchPhraseConstraint.mandatory(List.of(includes))) ); } private MockDataDocument d(int domainId, int ordinal) { @@ -482,13 +460,21 @@ public class IndexQueryServiceIntegrationTest { Path outputFileDocs = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.NEXT); Path outputFileWords = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.NEXT); + Path outputFilePositions = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.NEXT); + Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService); Path tmpDir = workDir.resolve("tmp"); if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir); - new ReverseIndexConstructor(outputFileDocs, outputFileWords, IndexJournalReader::singleFile, DocIdRewriter.identity(), tmpDir) - .createReverseIndex(new FakeProcessHeartbeat(), "name", workDir); + var constructor = + new FullIndexConstructor( + outputFileDocs, + outputFileWords, + outputFilePositions, + DocIdRewriter.identity(), + tmpDir); + constructor.createReverseIndex(new 
FakeProcessHeartbeat(), "name", workDir); } private void createPrioReverseIndex() throws SQLException, IOException { @@ -500,20 +486,27 @@ public class IndexQueryServiceIntegrationTest { if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir); - new ReverseIndexConstructor(outputFileDocs, outputFileWords, IndexJournalReader::singleFile, DocIdRewriter.identity(), tmpDir) - .createReverseIndex(new FakeProcessHeartbeat(), "name", workDir); + var constructor = new PrioIndexConstructor( + outputFileDocs, + outputFileWords, + DocIdRewriter.identity(), + tmpDir); + + constructor.createReverseIndex(new FakeProcessHeartbeat(), "name", workDir); } private void createForwardIndex() throws SQLException, IOException { Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService); Path outputFileDocsId = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT); + Path outputFileSpansData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.NEXT); Path outputFileDocsData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT); ForwardIndexConverter converter = new ForwardIndexConverter(processHeartbeat, - IndexJournalReader.paging(workDir), outputFileDocsId, outputFileDocsData, + outputFileSpansData, + IndexJournal.findJournal(workDir).orElseThrow(), domainRankings ); @@ -543,19 +536,31 @@ public class IndexQueryServiceIntegrationTest { var meta = metaByDoc.get(doc); - var header = new IndexJournalEntryHeader( - doc, - meta.features, - meta.documentMetadata.encode() - ); + List keywords = words.stream().map(w -> w.keyword).toList(); - long[] dataArray = new long[words.size() * 2]; + byte[] metadata = new byte[keywords.size()]; for (int i = 0; i < words.size(); i++) { - dataArray[2*i] = hasher.hashNearlyASCII(words.get(i).keyword); - dataArray[2*i+1] = words.get(i).termMetadata; + metadata[i] = (byte) words.get(i).termMetadata; } - var entry = new IndexJournalEntryData(dataArray); - indexJournalWriter.put(header, entry); + + List positions = new ArrayList<>(); + for (int i = 0; i < words.size(); i++) { + positions.add(VarintCodedSequence.generate(words.get(i).positions)); + } + + indexJournalWriter.put(doc, + new SlopDocumentRecord.KeywordsProjection( + "", + -1, + meta.features, + meta.documentMetadata.encode(), + 100, + keywords, + metadata, + positions, + new byte[0], + List.of() + )); }); var linkdbWriter = new DocumentDbWriter( @@ -571,7 +576,7 @@ public class IndexQueryServiceIntegrationTest { "HTML5", 0, null, - 0, + key.hashCode(), 5 )); } @@ -595,9 +600,11 @@ public class IndexQueryServiceIntegrationTest { this(features, new DocumentMetadata(encoded)); } } - record MockDataKeyword(String keyword, long termMetadata) {} + record MockDataKeyword(String keyword, long termMetadata, IntList positions) {} - public MockDataKeyword w(String keyword, long termMetadata) { return new MockDataKeyword(keyword, termMetadata); } - public MockDataKeyword w(String keyword) { return new MockDataKeyword(keyword, 0L); } - public MockDataKeyword w(String keyword, WordFlags flags) { return new MockDataKeyword(keyword, new WordMetadata(0L, EnumSet.of(flags)).encode()); } + public MockDataKeyword w(String keyword, EnumSet wordFlags, int... 
positions) { + return new MockDataKeyword(keyword, WordFlags.encode(wordFlags), IntList.of(positions)); + } + public MockDataKeyword w(String keyword) { return new MockDataKeyword(keyword, 0L, IntList.of()); } + public MockDataKeyword w(String keyword, WordFlags flags) { return new MockDataKeyword(keyword, flags.asBit(), IntList.of()); } } diff --git a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTestModule.java b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTestModule.java index e61c42d7..e2438709 100644 --- a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTestModule.java +++ b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTestModule.java @@ -2,21 +2,23 @@ package nu.marginalia.index; import com.google.inject.AbstractModule; import nu.marginalia.IndexLocations; +import nu.marginalia.index.domainrankings.DomainRankings; +import nu.marginalia.index.journal.IndexJournal; +import nu.marginalia.index.journal.IndexJournalSlopWriter; import nu.marginalia.index.searchset.SearchSetAny; import nu.marginalia.index.searchset.SearchSetsService; -import nu.marginalia.index.util.TestUtil; -import nu.marginalia.storage.FileStorageService; -import nu.marginalia.storage.model.FileStorageBase; -import nu.marginalia.storage.model.FileStorageBaseType; -import nu.marginalia.index.journal.writer.IndexJournalWriter; -import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl; import nu.marginalia.linkdb.docs.DocumentDbReader; import nu.marginalia.process.control.FakeProcessHeartbeat; import nu.marginalia.process.control.ProcessHeartbeat; -import nu.marginalia.index.domainrankings.DomainRankings; -import nu.marginalia.service.control.*; import nu.marginalia.service.ServiceId; +import nu.marginalia.service.control.FakeServiceHeartbeat; +import nu.marginalia.service.control.ServiceEventLog; +import nu.marginalia.service.control.ServiceHeartbeat; import nu.marginalia.service.module.ServiceConfiguration; +import nu.marginalia.storage.FileStorageService; +import nu.marginalia.storage.model.FileStorageBase; +import nu.marginalia.storage.model.FileStorageBaseType; +import nu.marginalia.test.TestUtil; import org.mockito.Mockito; import java.io.IOException; @@ -41,8 +43,10 @@ public class IndexQueryServiceIntegrationTestModule extends AbstractModule { slowDir = workDir.resolve("slow"); fastDir = workDir.resolve("fast"); + Files.createDirectory(slowDir); Files.createDirectory(fastDir); + Files.createDirectory(fastDir.resolve("iw")); } public void cleanUp() { @@ -75,9 +79,7 @@ public class IndexQueryServiceIntegrationTestModule extends AbstractModule { bind(ServiceEventLog.class).toInstance(Mockito.mock(ServiceEventLog.class)); - bind(IndexJournalWriter.class).toInstance(new IndexJournalWriterPagingImpl( - IndexLocations.getIndexConstructionArea(fileStorageServiceMock) - )); + bind(IndexJournalSlopWriter.class).toInstance(new IndexJournalSlopWriter(IndexJournal.allocateName(fastDir.resolve("iw")), 0)); bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration( ServiceId.Index, diff --git a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java index f4740e31..af071e24 100644 --- a/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java +++ b/code/index/test/nu/marginalia/index/results/IndexResultDomainDeduplicatorTest.java @@ -25,7 +25,7 @@ class IndexResultDomainDeduplicatorTest { } 
SearchResultItem forId(int domain, int ordinal) { - return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(), 4, false, Double.NaN); + return new SearchResultItem(UrlIdCodec.encodeId(domain, ordinal), 0, 0, List.of(), false, 0L, null, Double.NaN); } } \ No newline at end of file diff --git a/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java b/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java deleted file mode 100644 index 41906904..00000000 --- a/code/index/test/nu/marginalia/ranking/results/ResultValuatorTest.java +++ /dev/null @@ -1,100 +0,0 @@ -package nu.marginalia.ranking.results; - -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; -import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; -import nu.marginalia.api.searchquery.model.compiled.CqDataInt; -import nu.marginalia.api.searchquery.model.results.ResultRankingContext; -import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; -import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; -import nu.marginalia.model.idx.DocumentFlags; -import nu.marginalia.model.idx.WordFlags; -import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.model.idx.DocumentMetadata; -import nu.marginalia.model.idx.WordMetadata; -import nu.marginalia.ranking.results.factors.*; -import nu.marginalia.term_frequency_dict.TermFrequencyDict; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.mockito.Mockito; - -import java.util.*; - -import static org.mockito.Mockito.when; - -class ResultValuatorTest { - - TermFrequencyDict dict; - ResultValuator valuator; - - @BeforeEach - public void setUp() { - - dict = Mockito.mock(TermFrequencyDict.class); - when(dict.docCount()).thenReturn(100_000); - - valuator = new ResultValuator( - new TermCoherenceFactor() - ); - - } - - CqDataInt frequencyData = new CqDataInt(new int[] { 10 }); - - CompiledQueryLong titleOnlyLowCountSet = CompiledQuery.just( - new SearchResultKeywordScore("bob", 1, - wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title))) - ).mapToLong(SearchResultKeywordScore::encodedWordMetadata); - - CompiledQueryLong highCountNoTitleSet = CompiledQuery.just( - new SearchResultKeywordScore("bob", 1, - wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh))) - ).mapToLong(SearchResultKeywordScore::encodedWordMetadata);; - - CompiledQueryLong highCountSubjectSet = CompiledQuery.just( - new SearchResultKeywordScore("bob", 1, - wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects))) - ).mapToLong(SearchResultKeywordScore::encodedWordMetadata);; - - - @Test - void evaluateTerms() { - - when(dict.getTermFreq("bob")).thenReturn(10); - ResultRankingContext context = new ResultRankingContext(100000, - ResultRankingParameters.sensibleDefaults(), - new BitSet(), - new BitSet(), - frequencyData, - frequencyData); - - long docMeta = docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class)); - int features = 0; - - double titleOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, docMeta, features, 10_000, context, null); - double titleLongOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, docMeta, features, 10_000, context, null); - double highCountNoTitle = valuator.calculateSearchResultValue(highCountNoTitleSet, docMeta, features, 10_000, context, null); - double highCountSubject = 
valuator.calculateSearchResultValue(highCountSubjectSet, docMeta, features, 10_000, context, null); - - System.out.println(titleOnlyLowCount); - System.out.println(titleLongOnlyLowCount); - System.out.println(highCountNoTitle); - System.out.println(highCountSubject); - } - - private long docMetadata(int topology, - int year, - int quality, - EnumSet flags) { - return new DocumentMetadata(topology, PubDate.toYearByte(year), quality, flags).encode(); - } - - private long wordMetadata(Set positions, Set wordFlags) { - long posBits = positions.stream() - .mapToLong(i -> ((1L << i) & 0xFF_FFFF_FFFF_FFFFL)) - .reduce((a,b) -> a|b) - .orElse(0L); - - return new WordMetadata(posBits, wordFlags).encode(); - } - -} \ No newline at end of file diff --git a/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java b/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java deleted file mode 100644 index 5d2b47c9..00000000 --- a/code/index/test/nu/marginalia/ranking/results/factors/TermCoherenceFactorTest.java +++ /dev/null @@ -1,107 +0,0 @@ -package nu.marginalia.ranking.results.factors; - -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; -import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates; -import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore; -import nu.marginalia.bbpc.BrailleBlockPunchCards; -import nu.marginalia.model.idx.WordMetadata; -import org.junit.jupiter.api.Test; - -import java.util.ArrayList; -import java.util.List; - -import static org.junit.jupiter.api.Assertions.*; - -class TermCoherenceFactorTest { - - TermCoherenceFactor termCoherenceFactor = new TermCoherenceFactor(); - @Test - public void testAllBitsSet() { - var allPositionsSet = createSet( - ~0L, - ~0L - ); - - long mask = CompiledQueryAggregates.longBitmaskAggregate( - allPositionsSet, - SearchResultKeywordScore::positions - ); - - assertEquals(1.0, termCoherenceFactor.bitsSetFactor(mask), 0.01); - - assertEquals(1.0, - termCoherenceFactor.calculateOverlap( - allPositionsSet.mapToLong(SearchResultKeywordScore::encodedWordMetadata) - ) - ); - - } - - @Test - public void testNoBitsSet() { - var allPositionsSet = createSet( - 0, 0 - ); - - long mask = CompiledQueryAggregates.longBitmaskAggregate(allPositionsSet, score -> score.positions() & WordMetadata.POSITIONS_MASK); - - assertEquals(0, termCoherenceFactor.bitsSetFactor(mask), 0.01); - - assertEquals(0, termCoherenceFactor.calculateOverlap(allPositionsSet.mapToLong(SearchResultKeywordScore::encodedWordMetadata))); - } - - @Test @SuppressWarnings("unchecked") - public void testLowPosMatches() { - var positions = createSet( - List.of(0, 1, 2, 3), List.of(0, 1, 2, 3) - ); - - long mask = CompiledQueryAggregates.longBitmaskAggregate(positions, score -> score.positions() & WordMetadata.POSITIONS_MASK); - printMask(mask); - - } - - @Test @SuppressWarnings("unchecked") - public void testHiPosMatches() { - var positions = createSet( - List.of(55, 54, 53, 52), List.of(55, 54, 53, 52) - ); - - long mask = CompiledQueryAggregates.longBitmaskAggregate(positions, score -> score.positions() & WordMetadata.POSITIONS_MASK); - printMask(mask); - } - - @Test - public void testBitMatchScaling() { - for (int i = 1; i < 48; i++) { - System.out.println(i + ":" + termCoherenceFactor.bitsSetFactor((1L << i) - 1)); - } - } - - void printMask(long mask) { - System.out.println(BrailleBlockPunchCards.printBits(mask, 48)); - } - - CompiledQuery createSet(List... 
maskPositions) {
-        long[] positions = new long[maskPositions.length];
-
-        for (int i = 0; i < maskPositions.length; i++) {
-            for (long pos : maskPositions[i]) {
-                positions[i] |= (1L<<pos);
-            }
-        }
-
-        return createSet(positions);
-    }
-
-    CompiledQuery<SearchResultKeywordScore> createSet(long... positionMasks) {
-        List<SearchResultKeywordScore> keywords = new ArrayList<>();
-
-        for (int i = 0; i < positionMasks.length; i++) {
-            keywords.add(new SearchResultKeywordScore("", 0,
-                    new WordMetadata(positionMasks[i] & WordMetadata.POSITIONS_MASK, (byte) 0).encode()));
-        }
-
-        return CompiledQuery.just(keywords.toArray(SearchResultKeywordScore[]::new));
-    }
-}
\ No newline at end of file
diff --git a/code/libraries/array/build.gradle b/code/libraries/array/build.gradle
index 4c88a870..862f3a69 100644
--- a/code/libraries/array/build.gradle
+++ b/code/libraries/array/build.gradle
@@ -26,6 +26,8 @@ dependencies {
     testImplementation libs.bundles.slf4j.test
     testImplementation libs.bundles.junit
     testImplementation libs.mockito
+
+    testImplementation project(':code:libraries:test-helpers')
 }
 
 jmh {
diff --git a/code/libraries/array/cpp/resources/.gitignore b/code/libraries/array/cpp/resources/.gitignore
index 82ac343f..f1fe8d1e 100644
--- a/code/libraries/array/cpp/resources/.gitignore
+++ b/code/libraries/array/cpp/resources/.gitignore
@@ -1 +1 @@
-libcpp.so
\ No newline at end of file
+*.so
\ No newline at end of file
diff --git a/code/libraries/array/java/nu/marginalia/array/algo/LongArrayBase.java b/code/libraries/array/java/nu/marginalia/array/algo/LongArrayBase.java
index b5ef03da..5ce59973 100644
--- a/code/libraries/array/java/nu/marginalia/array/algo/LongArrayBase.java
+++ b/code/libraries/array/java/nu/marginalia/array/algo/LongArrayBase.java
@@ -108,4 +108,5 @@ public interface LongArrayBase extends BulkTransferArray {
     void write(Path file) throws IOException;
 
     void transferFrom(FileChannel source, long sourceStart, long arrayStart, long arrayEnd) throws IOException;
+    void transferFrom(LongArray source, long sourceStart, long arrayStart, long arrayEnd) throws IOException;
 }
diff --git a/code/libraries/array/java/nu/marginalia/array/page/LongQueryBuffer.java b/code/libraries/array/java/nu/marginalia/array/page/LongQueryBuffer.java
index 1a270af7..ba1bd2b3 100644
--- a/code/libraries/array/java/nu/marginalia/array/page/LongQueryBuffer.java
+++ b/code/libraries/array/java/nu/marginalia/array/page/LongQueryBuffer.java
@@ -3,6 +3,7 @@ package nu.marginalia.array.page;
 import nu.marginalia.array.LongArray;
 import nu.marginalia.array.LongArrayFactory;
 
+import java.nio.ByteBuffer;
 import java.util.Arrays;
 
 /** A buffer for long values that can be used to filter and manipulate the data.
@@ -164,6 +165,11 @@ public class LongQueryBuffer {
         finalizeFiltering();
     }
 
+    @SuppressWarnings("preview")
+    public ByteBuffer asByteBuffer() {
+        return data.getMemorySegment().asByteBuffer();
+    }
+
     public String toString() {
         return getClass().getSimpleName() + "[" +
                 "read = " + read +
diff --git a/code/libraries/array/java/nu/marginalia/array/page/SegmentLongArray.java b/code/libraries/array/java/nu/marginalia/array/page/SegmentLongArray.java
index ac420de9..c87b16f0 100644
--- a/code/libraries/array/java/nu/marginalia/array/page/SegmentLongArray.java
+++ b/code/libraries/array/java/nu/marginalia/array/page/SegmentLongArray.java
@@ -167,6 +167,28 @@ public class SegmentLongArray implements LongArray {
         }
     }
 
+
+    @Override
+    public void transferFrom(LongArray source,
+                             long sourceStartL,
+                             long destStartL,
+                             long destEndL)
+    {
+        if (destStartL > destEndL)
+            throw new IndexOutOfBoundsException("Destination start after end");
+
+        if (sourceStartL + (destEndL - destStartL) > source.size())
+            throw new IndexOutOfBoundsException("Source array too small");
+        if (destEndL > size())
+            throw new IndexOutOfBoundsException("Destination array too small");
+
+        MemorySegment.copy(
+                source.getMemorySegment(), JAVA_LONG, sourceStartL * JAVA_LONG.byteSize(),
+                segment, JAVA_LONG, destStartL * JAVA_LONG.byteSize(),
+                destEndL - destStartL
+        );
+
+    }
 
     @Override
     public MemorySegment getMemorySegment() {
diff --git a/code/libraries/array/java/nu/marginalia/array/page/UnsafeLongArray.java b/code/libraries/array/java/nu/marginalia/array/page/UnsafeLongArray.java
index 04ea42d4..f4c47dd4 100644
--- a/code/libraries/array/java/nu/marginalia/array/page/UnsafeLongArray.java
+++ b/code/libraries/array/java/nu/marginalia/array/page/UnsafeLongArray.java
@@ -269,4 +269,26 @@ public class UnsafeLongArray implements LongArray {
         }
     }
 
+    @Override
+    public void transferFrom(LongArray source,
+                             long sourceStartL,
+                             long destStartL,
+                             long destEndL)
+    {
+        if (destStartL > destEndL)
+            throw new IndexOutOfBoundsException("Destination start after end");
+
+        if (sourceStartL + (destEndL - destStartL) > source.size())
+            throw new IndexOutOfBoundsException("Source array too small");
+        if (destEndL > size())
+            throw new IndexOutOfBoundsException("Destination array too small");
+
+        MemorySegment.copy(
+                source.getMemorySegment(), JAVA_LONG, sourceStartL * JAVA_LONG.byteSize(),
+                segment, JAVA_LONG, destStartL * JAVA_LONG.byteSize(),
+                destEndL - destStartL
+        );
+
+    }
+
 }
diff --git a/code/libraries/array/readme.md b/code/libraries/array/readme.md
index f656b3e1..073be1eb 100644
--- a/code/libraries/array/readme.md
+++ b/code/libraries/array/readme.md
@@ -36,7 +36,7 @@ try (var array = LongArrayFactory.mmapForWritingConfined(Path.of("/tmp/test"), 1
 
 ## Query Buffers
 
-The class and [LongQueryBuffer](java/nu/marginalia/array/buffer/LongQueryBuffer.java) is used heavily in the search engine's query processing.
+The [LongQueryBuffer](java/nu/marginalia/array/page/LongQueryBuffer.java) class is used heavily in the search engine's query processing.
 
 It is a dual-pointer buffer that offers tools for filtering data.
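Since the readme stops short of showing the dual-pointer idiom in code, here is a minimal sketch of a filtering pass. `finalizeFiltering()` appears in the hunk above; the other method names follow the buffer's filtering API as used by the query code, and the constructor arguments are illustrative assumptions:

```java
// Fill a buffer with candidate values, then filter it in place: the read
// pointer scans ahead while the write pointer compacts retained values
// at the front of the buffer.
LongQueryBuffer buffer = new LongQueryBuffer(new long[] { 2, 3, 4, 5, 6 }, 5);

while (buffer.hasMore()) {
    if (buffer.currentValue() % 2 == 0)
        buffer.retainAndAdvance();   // keep even values
    else
        buffer.rejectAndAdvance();   // drop odd values
}

buffer.finalizeFiltering();          // compact retained data, reset pointers
```

The point of the dual pointers is that filtering happens in place, with no second buffer allocated.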
diff --git a/code/libraries/array/test/nu/marginalia/array/algo/LongArraySortNTest.java b/code/libraries/array/test/nu/marginalia/array/algo/LongArraySortNTest.java index dfbf555e..a866264d 100644 --- a/code/libraries/array/test/nu/marginalia/array/algo/LongArraySortNTest.java +++ b/code/libraries/array/test/nu/marginalia/array/algo/LongArraySortNTest.java @@ -4,7 +4,7 @@ import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap; import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArrayFactory; -import nu.marginalia.util.test.TestUtil; +import nu.marginalia.test.TestUtil; import org.apache.commons.lang3.ArrayUtils; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Tag; @@ -13,7 +13,7 @@ import org.junit.jupiter.api.Test; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.util.*; +import java.util.Random; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; diff --git a/code/libraries/array/test/nu/marginalia/array/algo/LongArraySortTest.java b/code/libraries/array/test/nu/marginalia/array/algo/LongArraySortTest.java index 2cfde5a7..4619b6a9 100644 --- a/code/libraries/array/test/nu/marginalia/array/algo/LongArraySortTest.java +++ b/code/libraries/array/test/nu/marginalia/array/algo/LongArraySortTest.java @@ -3,7 +3,7 @@ package nu.marginalia.array.algo; import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArrayFactory; -import nu.marginalia.util.test.TestUtil; +import nu.marginalia.test.TestUtil; import org.apache.commons.lang3.ArrayUtils; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Tag; diff --git a/code/libraries/array/test/nu/marginalia/util/test/TestUtil.java b/code/libraries/array/test/nu/marginalia/util/test/TestUtil.java deleted file mode 100644 index e3defec1..00000000 --- a/code/libraries/array/test/nu/marginalia/util/test/TestUtil.java +++ /dev/null @@ -1,43 +0,0 @@ -package nu.marginalia.util.test; - - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Arrays; - -public class TestUtil { - public static void clearTempDir(Path dir) { - if (Files.isDirectory(dir)) { - for (File f : dir.toFile().listFiles()) { - File[] files = f.listFiles(); - if (files != null) { - Arrays.stream(files).map(File::toPath).forEach(TestUtil::clearTempDir); - } - System.out.println("Deleting " + f + " (" + fileSize(f.toPath()) + ")"); - f.delete(); - } - } - System.out.println("Deleting " + dir); - dir.toFile().delete(); - } - - private static String fileSize(Path path) { - try { - long sizeBytes = Files.size(path); - - if (sizeBytes > 1024 * 1024 * 1024) return round(sizeBytes / 1073741824.) + "Gb"; - if (sizeBytes > 1024 * 1024) return round(sizeBytes / 1048576.) + "Mb"; - if (sizeBytes > 1024) return round(sizeBytes / 1024.) 
+ "Kb"; - return sizeBytes + "b"; - } - catch (IOException ex) { - throw new RuntimeException(ex); - } - } - - private static String round(double d) { - return String.format("%.2f", d); - } -} diff --git a/code/libraries/btree/build.gradle b/code/libraries/btree/build.gradle index bdfb803d..39479864 100644 --- a/code/libraries/btree/build.gradle +++ b/code/libraries/btree/build.gradle @@ -12,7 +12,6 @@ apply from: "$rootProject.projectDir/srcsets.gradle" dependencies { implementation project(':code:libraries:array') - implementation project(':code:libraries:next-prime') implementation libs.bundles.slf4j diff --git a/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithIndexTest.java b/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithIndexTest.java index d847e3ad..b4bad4c1 100644 --- a/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithIndexTest.java +++ b/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithIndexTest.java @@ -4,7 +4,6 @@ import nu.marginalia.array.LongArray; import nu.marginalia.array.page.LongQueryBuffer; import nu.marginalia.btree.model.BTreeBlockSize; import nu.marginalia.btree.model.BTreeContext; -import nu.marginalia.util.NextPrimeUtil; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; diff --git a/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithoutIndexTest.java b/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithoutIndexTest.java index db8d9460..c64658d5 100644 --- a/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithoutIndexTest.java +++ b/code/libraries/btree/test/nu/marginalia/btree/BTreeReaderRejectRetainWithoutIndexTest.java @@ -4,7 +4,6 @@ import nu.marginalia.array.LongArray; import nu.marginalia.array.page.LongQueryBuffer; import nu.marginalia.btree.model.BTreeBlockSize; import nu.marginalia.btree.model.BTreeContext; -import nu.marginalia.util.NextPrimeUtil; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; diff --git a/code/libraries/next-prime/java/nu/marginalia/util/NextPrimeUtil.java b/code/libraries/btree/test/nu/marginalia/btree/NextPrimeUtil.java similarity index 96% rename from code/libraries/next-prime/java/nu/marginalia/util/NextPrimeUtil.java rename to code/libraries/btree/test/nu/marginalia/btree/NextPrimeUtil.java index 183344b7..656f912e 100644 --- a/code/libraries/next-prime/java/nu/marginalia/util/NextPrimeUtil.java +++ b/code/libraries/btree/test/nu/marginalia/btree/NextPrimeUtil.java @@ -1,4 +1,4 @@ -package nu.marginalia.util; +package nu.marginalia.btree; public class NextPrimeUtil { diff --git a/code/libraries/next-prime/build.gradle b/code/libraries/coded-sequence/build.gradle similarity index 50% rename from code/libraries/next-prime/build.gradle rename to code/libraries/coded-sequence/build.gradle index 425d2c12..9de183f0 100644 --- a/code/libraries/next-prime/build.gradle +++ b/code/libraries/coded-sequence/build.gradle @@ -1,5 +1,6 @@ plugins { id 'java' + id "me.champeau.jmh" version "0.6.6" } java { @@ -13,8 +14,7 @@ apply from: "$rootProject.projectDir/srcsets.gradle" dependencies { implementation libs.bundles.slf4j - implementation libs.notnull - + implementation libs.slop implementation libs.fastutil testImplementation libs.bundles.slf4j.test @@ -25,3 +25,15 @@ dependencies { test { useJUnitPlatform() } + +jmh { + jvmArgs = [ "--enable-preview" ] +} +tasks.withType(me.champeau.jmh.WithJavaToolchain).configureEach { + 
javaLauncher.set(javaToolchains.launcherFor {
+        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
+    })
+}
+tasks.withType(me.champeau.jmh.JmhBytecodeGeneratorTask).configureEach {
+    jvmArgs = ["--enable-preview"]
+}
\ No newline at end of file
diff --git a/code/libraries/coded-sequence/java/nu/marginalia/sequence/CodedSequence.java b/code/libraries/coded-sequence/java/nu/marginalia/sequence/CodedSequence.java
new file mode 100644
index 00000000..493e87ee
--- /dev/null
+++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/CodedSequence.java
@@ -0,0 +1,23 @@
+package nu.marginalia.sequence;
+
+import it.unimi.dsi.fastutil.ints.IntIterator;
+import it.unimi.dsi.fastutil.ints.IntList;
+
+import java.nio.ByteBuffer;
+
+public interface CodedSequence {
+    byte[] bytes();
+
+    IntIterator iterator();
+
+    IntIterator offsetIterator(int offset);
+
+    IntList values();
+
+    ByteBuffer buffer();
+
+    int bufferSize();
+
+    int valueCount();
+
+}
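Before the `GammaCodedSequence` implementation below, a worked sketch of what the coding does may help. The Elias gamma code writes a value n ≥ 1 as ⌊log₂ n⌋ zero bits followed by n in binary, and the sequence is stored as a gamma-coded count header followed by gamma-coded deltas between consecutive values. This toy encoder mirrors that scheme for illustration only; the real bit layout is whatever `BitWriter.putGamma` emits:

```java
class ToyGamma {
    // gamma(n): floor(log2 n) zero bits, then n in binary; gamma(5) = "00101"
    static String gamma(int n) {
        String bin = Integer.toBinaryString(n);
        return "0".repeat(bin.length() - 1) + bin;
    }

    // Mirrors the documented scheme: a count header of (size + 1),
    // followed by the deltas between consecutive values.
    static String encodeToy(int... values) {
        StringBuilder bits = new StringBuilder(gamma(values.length + 1));
        int last = 0;
        for (int v : values) {
            bits.append(gamma(v - last)); // deltas must be strictly positive
            last = v;
        }
        return bits.toString();
    }

    public static void main(String[] args) {
        // [1, 3, 8] -> header gamma(4) = 00100, deltas 1,2,5 -> 1, 010, 00101
        System.out.println(encodeToy(1, 3, 8)); // prints 00100101000101
    }
}
```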
*/ + @Override + public byte[] bytes() { + if (raw.hasArray()) { + return raw.array(); + } + else { + byte[] bytes = new byte[raw.capacity()]; + raw.get(0, bytes, 0, bytes.length); + return bytes; + } + } + + @Override + public IntIterator iterator() { + raw.position(startPos); + raw.limit(startLimit); + + return new EliasGammaSequenceIterator(raw); + } + + /** Return an iterator over the sequence with a constant offset applied to each value. + * This is useful for comparing sequences with different offsets, and adds zero + * extra cost to the decoding process which is already based on adding + * relative differences. + * */ + public IntIterator offsetIterator(int offset) { + raw.position(startPos); + raw.limit(startLimit); + + return new EliasGammaSequenceIterator(raw, offset); + } + + public IntList values() { + var intItr = new EliasGammaSequenceIterator(buffer()); + IntArrayList ret = new IntArrayList(intItr.rem); + while (intItr.hasNext()) { + ret.add(intItr.nextInt()); + } + return ret; + } + + public int hashCode() { + return values().hashCode(); + } + + public boolean equals(Object other) { + if (other instanceof CodedSequence cs) { + return Objects.equals(values(), cs.values()); + } + return false; + } + + public String toString() { + StringJoiner sj = new StringJoiner(", ", "[", "]"); + for (Integer i : this) { + sj.add(i.toString()); + } + return sj.toString(); + } + + /** Return the backing ByteBuffer of the sequence, configured with a position and limit + * that is equal to the relevant data range + */ + public ByteBuffer buffer() { + raw.position(startPos); + raw.limit(startLimit); + + return raw; + } + + /** Return the number of bytes used by the sequence in the buffer */ + public int bufferSize() { + return startLimit - startPos; + } + + /** Return the number of items in the sequence */ + public int valueCount() { + if (startPos == startLimit) + return 0; + + return EliasGammaSequenceIterator.readCount(buffer()); + } + + + /** Encode a sequence of integers into a ByteBuffer using the Elias Gamma code. + * The sequence must be strictly increasing and may not contain values less than + * or equal to zero. + */ + public static ByteBuffer encode(ByteBuffer workArea, IntList sequence) { + var writer = new BitWriter(workArea); + + writer.putGamma(sequence.size() + 1); + + int last = 0; + + for (var iter = sequence.iterator(); iter.hasNext(); ) { + int i = iter.nextInt(); + int delta = i - last; + last = i; + + // can't encode zeroes + assert delta > 0 : "Sequence must be strictly increasing and may not contain zeroes or negative values; was " + sequence; + + writer.putGamma(delta); + } + + // Finish the writer and return the work buffer, positioned and limited around + // the relevant data + + var buffer = writer.finish(); + + // Copy the contents of the writer's internal buffer to a new ByteBuffer that is correctly sized, + // this lets us re-use the internal buffer for subsequent calls to encode without worrying about + // accidentally overwriting the previous data. + + var outBuffer = ByteBuffer.allocate(buffer.limit()); + outBuffer.put(buffer); + outBuffer.flip(); + + return outBuffer; + } + + /** Encode a sequence of integers into a ByteBuffer using the Elias Gamma code. + * The sequence must be strictly increasing and may not contain values less than + * or equal to zero. 
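+     * This invariant is only checked via assertions, so callers are expected to pass pre-sorted, strictly positive input.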
+ */ + public static ByteBuffer encode(ByteBuffer workArea, int[] sequence) { + return encode(workArea, IntList.of(sequence)); + } + + /** Iterator that implements decoding of sequences of integers using the Elias Gamma code. + * The sequence is prefixed by the number of integers in the sequence, then the delta between + * each integer in the sequence is encoded using the Elias Gamma code. + *
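+     * As a worked example, the sequence (1, 3, 5) is stored as the gamma coded count 3+1 = 4 (00100), followed by the gamma coded deltas 1, 2 and 2 (1, 010, 010).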

+ * https://en.wikipedia.org/wiki/Elias_gamma_coding + * */ + public static class EliasGammaSequenceIterator implements IntIterator { + + private final BitReader reader; + int rem = 0; + private int last; + private int next = Integer.MIN_VALUE; + + public EliasGammaSequenceIterator(ByteBuffer buffer, int zero) { + if (zero == Integer.MIN_VALUE) { + throw new IllegalArgumentException("Integer.MIN_VALUE is a reserved offset that may not be used as zero point"); + } + + reader = new BitReader(buffer); + + last = zero; + rem = reader.getGamma() - 1; + } + + public EliasGammaSequenceIterator(ByteBuffer buffer) { + this(buffer, 0); + } + + public static int readCount(ByteBuffer buffer) { + var reader = new BitReader(buffer); + + return reader.getGamma() - 1; + } + + + + // This is BitWriter.getGamma with more checks in place for streaming iteration + @Override + public boolean hasNext() { + if (next != Integer.MIN_VALUE) return true; + if (!reader.hasMore() || --rem < 0) return false; + + int bits = 1 + reader.takeWhileZero(); + + if (reader.hasMore()) { + int delta = reader.get(bits); + + last += delta; + next = last; + + return true; + } + + return false; + } + + @Override + public int nextInt() { + if (hasNext()) { + int ret = next; + next = Integer.MIN_VALUE; + return ret; + } + throw new ArrayIndexOutOfBoundsException("No more data to read"); + } + + + } + + +} diff --git a/code/libraries/coded-sequence/java/nu/marginalia/sequence/SequenceOperations.java b/code/libraries/coded-sequence/java/nu/marginalia/sequence/SequenceOperations.java new file mode 100644 index 00000000..76a0be27 --- /dev/null +++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/SequenceOperations.java @@ -0,0 +1,230 @@ +package nu.marginalia.sequence; + +import it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.ints.IntIterator; +import it.unimi.dsi.fastutil.ints.IntList; + +public class SequenceOperations { + + /** Return true if the sequences intersect, false otherwise. + * */ + public static boolean intersectSequences(IntIterator... sequences) { + + if (sequences.length <= 1) + return true; + + // Initialize values and find the maximum value + int[] values = new int[sequences.length]; + + for (int i = 0; i < sequences.length; i++) { + if (sequences[i].hasNext()) + values[i] = sequences[i].nextInt(); + else + return false; + } + + // Intersect the sequences by advancing all values smaller than the maximum seen so far + // until they are equal to the maximum value, or until the end of the sequence is reached + int max = Integer.MIN_VALUE; + int successes = 0; + for (int i = 0; successes < sequences.length; i = (i + 1) % sequences.length) + { + if (values[i] == max) { + successes++; + } else { + successes = 1; + + // Discard values until we reach the maximum value seen so far, + // or until the end of the sequence is reached + while (values[i] < max) { + if (sequences[i].hasNext()) + values[i] = sequences[i].nextInt(); + else + return false; + } + + // Update the maximum value, if necessary + max = Math.max(max, values[i]); + } + } + + return true; + } + + /** Find any intersections between the given positions lists, and return the list of intersections. + * If any of the lists are empty, return an empty list. + *
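+     * For example, findIntersections(IntList.of(1, 5, 8), IntList.of(2, 5, 8)) returns (5, 8).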

+ */ + public static IntList findIntersections(IntList... positions) { + return findIntersections(positions, new int[positions.length]); + } + + /** Find any intersections between the given positions lists, and return the list of intersections. + * If any of the lists are empty, return an empty list. + *

+ * A constant offset can be applied to each position list by providing an array of offsets. + * + * @param positions the positions lists to compare - each list must be sorted in ascending order + * and contain unique values. + * @param offsets constant offsets to apply to each position + * */ + public static IntList findIntersections(IntList[] positions, int[] offsets) { + + if (positions.length < 1) + return IntList.of(); + + int[] indexes = new int[positions.length]; + // Initialize values and find the maximum value + int[] values = new int[positions.length]; + + for (int i = 0; i < positions.length; i++) { + if (indexes[i] < positions[i].size()) + values[i] = positions[i].getInt(indexes[i]++) + offsets[i]; + else + return IntList.of(); + } + + // Intersect the sequences by advancing all values smaller than the maximum seen so far + // until they are equal to the maximum value, or until the end of the sequence is reached + int max = Integer.MIN_VALUE; + int successes = 0; + + IntList ret = new IntArrayList(); + + outer: + for (int i = 0;; i = (i + 1) % positions.length) + { + if (successes == positions.length) { + ret.add(max); + successes = 1; + + if (indexes[i] < positions[i].size()) { + values[i] = positions[i].getInt(indexes[i]++) + offsets[i]; + + // Update the maximum value, if necessary + max = Math.max(max, values[i]); + } else { + break; + } + } else if (values[i] == max) { + successes++; + } else { + successes = 1; + + // Discard values until we reach the maximum value seen so far, + // or until the end of the sequence is reached + while (values[i] < max) { + if (indexes[i] < positions[i].size()) { + values[i] = positions[i].getInt(indexes[i]++) + offsets[i]; + } else { + break outer; + } + } + + // Update the maximum value, if necessary + max = Math.max(max, values[i]); + } + } + + return ret; + } + + + /** Given each set of positions, one from each list, find the set with the smallest distance between them + * and return that distance. If any of the lists are empty, return 0. + * */ + public static int minDistance(IntList[] positions) { + return minDistance(positions, new int[positions.length]); + } + + /** Given each set of positions, one from each list, find the set with the smallest distance between them + * and return that distance. If any of the lists are empty, return 0. 
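+     * For example, given the lists (11, 80, 160), (20, 50, 100) and (30, 60, 90) with zero offsets, the closest picks are 11, 20 and 30, giving a minimum distance of 19.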
+ * + * @param positions the positions lists to compare - each list must be sorted in ascending order + * @param offsets the offsets to apply to each position + */ + public static int minDistance(IntList[] positions, int[] offsets) { + if (positions.length <= 1) + return 0; + + int[] values = new int[positions.length]; + int[] indexes = new int[positions.length]; + + for (int i = 0; i < positions.length; i++) { + if (indexes[i] < positions[i].size()) + values[i] = positions[i].getInt(indexes[i]++) + offsets[i]; + else + return 0; + } + + int minDist = Integer.MAX_VALUE; + int maxVal = Integer.MIN_VALUE; + + int maxI = 0; + + // Find the maximum value in values[] and its index in positions[] + for (int i = 0; i < positions.length; i++) { + if (values[i] > maxVal) { + maxVal = values[i]; + maxI = i; + } + } + + for (;;) { + // For all the other indexes except maxI, update values[] with the largest value smaller than maxVal + for (int idx = 0; idx < positions.length - 1; idx++) { + int i = (maxI + idx) % positions.length; + + // Update values[i] until it is the largest value smaller than maxVal + + int len = positions[i].size(); + int offset = offsets[i]; + int prevValue = values[i]; + int value = prevValue; + + while (indexes[i] < len) { + prevValue = value; + value = positions[i].getInt(indexes[i]++) + offset; + if (value >= maxVal) { + indexes[i]--; // correct for overshooting the largest value smaller than maxVal + break; + } + } + + values[i] = prevValue; + } + + // Calculate minVal and update minDist + int minVal = Integer.MAX_VALUE; + for (int val : values) { + minVal = Math.min(minVal, val); + } + minDist = Math.min(minDist, maxVal - minVal); + + + // Find the next maximum value and its index. We look for the largest value smaller than the current maxVal, + // which is the next target value + maxVal = Integer.MAX_VALUE; + + for (int i = 0; i < positions.length; i++) { + int index = indexes[i]; + if (index >= positions[i].size()) { // no more values in this list, skip + continue; + } + + int value = positions[i].getInt(index) + offsets[i]; + if (value < maxVal) { + maxVal = value; + maxI = i; + } + } + + if (maxVal != Integer.MAX_VALUE) { + indexes[maxI]++; + } + else { + return minDist; + } + } + } +} diff --git a/code/libraries/coded-sequence/java/nu/marginalia/sequence/VarintCodedSequence.java b/code/libraries/coded-sequence/java/nu/marginalia/sequence/VarintCodedSequence.java new file mode 100644 index 00000000..da98d4ce --- /dev/null +++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/VarintCodedSequence.java @@ -0,0 +1,257 @@ +package nu.marginalia.sequence; + +import it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.ints.IntIterator; +import it.unimi.dsi.fastutil.ints.IntList; + +import java.nio.ByteBuffer; +import java.util.Objects; + +public class VarintCodedSequence implements CodedSequence { + + private final ByteBuffer raw; + + private final int startPos; + private final int startLimit; + + public VarintCodedSequence(ByteBuffer buffer) { + this.raw = buffer; + + this.startPos = buffer.position(); + this.startLimit = buffer.limit(); + } + + public VarintCodedSequence(ByteBuffer buffer, int startPos, int startLimit) { + this.raw = buffer; + + this.startPos = startPos; + this.startLimit = startLimit; + } + + private static int requiredBufferSize(int[] values) { + int prev = 0; + int size = 0; + + for (int value : values) { + size += varintSize(value - prev); + prev = value; + } + + return size + varintSize(size + 1); + } + + private static int 
requiredBufferSize(IntList values) { + int prev = 0; + int size = 0; + + for (int i = 0; i < values.size(); i++) { + int value = values.getInt(i); + size += varintSize(value - prev); + prev = value; + } + + return size + varintSize(size + 1); + } + + private static int varintSize(int value) { + int bits = 32 - Integer.numberOfLeadingZeros(value); + return (bits + 6) / 7; + } + + public static VarintCodedSequence generate(IntList values) { + int bufferSize = requiredBufferSize(values); + ByteBuffer buffer = ByteBuffer.allocate(bufferSize); + + int prev = 0; + + encodeValue(buffer, values.size() + 1); + + for (int i = 0; i < values.size(); i++) { + int value = values.getInt(i); + int toEncode = value - prev; + assert toEncode > 0 : "Values must be strictly increasing"; + + encodeValue(buffer, toEncode); + + prev = value; + } + + buffer.flip(); + + return new VarintCodedSequence(buffer); + } + + public static VarintCodedSequence generate(int... values) { + int bufferSize = requiredBufferSize(values); + ByteBuffer buffer = ByteBuffer.allocate(bufferSize); + + int prev = 0; + + encodeValue(buffer, values.length + 1); + + for (int value : values) { + int toEncode = value - prev; + assert toEncode > 0 : "Values must be strictly increasing"; + + encodeValue(buffer, toEncode); + + prev = value; + } + + buffer.flip(); + + return new VarintCodedSequence(buffer); + } + + private static void encodeValue(ByteBuffer buffer, int value) { + if (value < (1<<7)) { + buffer.put((byte) value); + } + else if (value < (1<<14)) { + buffer.put((byte) (value >>> (7) | 0x80)); + buffer.put((byte) (value & 0x7F)); + } + else if (value < (1<<21)) { + buffer.put((byte) (value >>> (14) | 0x80)); + buffer.put((byte) (value >>> (7) | 0x80)); + buffer.put((byte) (value & 0x7F)); + } + else if (value < (1<<28)) { + buffer.put((byte) ((value >>> 21) | 0x80)); + buffer.put((byte) ((value >>> 14) | 0x80)); + buffer.put((byte) ((value >>> 7) | 0x80)); + buffer.put((byte) (value & 0x7F)); + } + else { + throw new IllegalArgumentException("Value too large to encode"); + } + } + + @Override + public byte[] bytes() { + return raw.array(); + } + + @Override + public IntIterator iterator() { + return new VarintSequenceIterator(buffer()); + } + + @Override + public IntIterator offsetIterator(int offset) { + return new VarintSequenceIterator(buffer(), offset); + } + + @Override + public IntList values() { + var buffer = buffer(); + + int val = 0; + int count = decodeValue(buffer) - 1; + + IntArrayList list = new IntArrayList(count); + + while (buffer.hasRemaining()) { + val += decodeValue(buffer); + list.add(val); + } + + return list; + } + + @Override + public ByteBuffer buffer() { + raw.position(startPos); + raw.limit(startLimit); + + return raw; + } + + @Override + public int bufferSize() { + return raw.capacity(); + } + + @Override + public int valueCount() { + var buffer = buffer(); + return decodeValue(buffer) - 1; + } + + private static int decodeValue(ByteBuffer buffer) { + // most common case gets a fast path, this is a fairly large performance win + // on average, something like 10-20% faster than not having this check + byte b = buffer.get(); + if ((b & 0x80) == 0) { + return b; + } + + int value = b & 0x7F; + do { + b = buffer.get(); + value = (value << 7) | (b & 0x7F); + } while ((b & 0x80) != 0); + + + return value; + } + + public static class VarintSequenceIterator implements IntIterator { + + private final ByteBuffer buffer; + int rem = 0; + private int last; + private int next = Integer.MIN_VALUE; + + public 
VarintSequenceIterator(ByteBuffer buffer, int zero) { + this.buffer = buffer; + if (zero == Integer.MIN_VALUE) { + throw new IllegalArgumentException("Integer.MIN_VALUE is a reserved offset that may not be used as zero point"); + } + + last = zero; + rem = decodeValue(buffer) - 1; + } + + public VarintSequenceIterator(ByteBuffer buffer) { + this(buffer, 0); + } + + // This is BitWriter.getGamma with more checks in place for streaming iteration + @Override + public boolean hasNext() { + if (next != Integer.MIN_VALUE) return true; + if (--rem < 0) return false; + + int delta = decodeValue(buffer); + + last += delta; + next = last; + + return true; + } + + @Override + public int nextInt() { + if (hasNext()) { + int ret = next; + next = Integer.MIN_VALUE; + return ret; + } + throw new ArrayIndexOutOfBoundsException("No more data to read"); + } + + + } + + public int hashCode() { + return values().hashCode(); + } + + public boolean equals(Object other) { + if (other instanceof CodedSequence cs) { + return Objects.equals(values(), cs.values()); + } + return false; + } +} diff --git a/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitReader.java b/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitReader.java new file mode 100644 index 00000000..6c5c4759 --- /dev/null +++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitReader.java @@ -0,0 +1,179 @@ +package nu.marginalia.sequence.io; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.nio.ByteBuffer; + +/** A utility class for reading bits from a ByteBuffer + * out of alignment with octet boundaries + */ +public class BitReader { + private final ByteBuffer underlying; + private final Runnable refillCallback; + + private static final Logger logger = LoggerFactory.getLogger(BitReader.class); + + /** The current value being decoded */ + private long currentValue; + + /** Bit index in the current value */ + private int bitPosition; + + + /** Create a new BitReader for the given buffer. The supplied callback will be + * invoked when the underlying buffer is out of data. The callback should + * refill the buffer with more data. 
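+     * A hypothetical usage, assuming a caller-supplied refill() that compacts the buffer and reads more bytes from the data source: new BitReader(buf, () -> refill(buf)).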
+ */ + public BitReader(ByteBuffer buffer, Runnable refillCallback) { + this.underlying = buffer; + this.refillCallback = refillCallback; + this.bitPosition = 0; + this.currentValue = 0; + } + + /** Create a new BitReader for the given buffer */ + public BitReader(ByteBuffer buffer) { + this(buffer, () -> { throw new IllegalStateException("No more data to read and no re-fill callback provided"); }); + } + + /** Read the next bit from the buffer */ + public boolean getBit() { + if (bitPosition <= 0) { + readNext(); + } + + // Return the bit at the current position, then decrement the position + return (currentValue & (1L << (--bitPosition))) != 0; + } + + /** Read the next width bits from the buffer */ + public int get(int width) { + // Fast path for reading a full integer from the current value + if (bitPosition >= width) { + // We have enough bits in the current value to satisfy the request + int result = (int)(currentValue >>> (bitPosition - width)) & ~-(1<= 0) { // We have enough bits in the current value to satisfy the request + result |= ((int)(currentValue >>> dw)) & ~-(1< 0); + + return result; + } + + /** Read bits until a 1 is encountered */ + public int takeWhileZero() { + int result = 0; + + do { + // Ensure we have bits to read + if (bitPosition <= 0) { + readNext(); + } + + // Count the number of leading zeroes in the current value + int zeroes = Long.numberOfLeadingZeros(currentValue << (64 - bitPosition)); + + // Add the number of zeroes to the result, but cap it at the + // current bit position to avoid counting padding bits as zeroes + result += Math.min(bitPosition, zeroes); + + // Subtract the number of bits read from the current position + bitPosition -= zeroes; + + // If bit position is not positive, we've found a 1 and can stop + } while (bitPosition <= 0); + + return result; + } + + public int getGamma() { + int bits = takeWhileZero(); + int ret = get(bits + 1); + + // The highest bit in the gamma coded value must be set, we can use this invariant + // to detect data corruption early + assert (ret & (1 << bits)) != 0 : "Highest bit in gamma coded return value not set"; + + return ret; + } + + public int getDelta() { + int bits = getGamma(); + int ret = get(bits); + + // The highest bit in the delta coded value must be set, we can use this invariant + // to detect data corruption early + assert (ret & (1 << (bits-1))) != 0 : "Highest bit in delta coded return value not set"; + + return ret; + } + + public boolean hasMore() { + return bitPosition > 0 || underlying.hasRemaining(); + } + + private void readNext() { + int remainingCapacity = underlying.remaining(); + + if (remainingCapacity >= 8) { + currentValue = underlying.getLong(); + bitPosition = 64; + } + else if (remainingCapacity >= 4) { + currentValue = underlying.getInt() & 0xFFFF_FFFFL; + bitPosition = 32; + } + else if (remainingCapacity >= 2) { + currentValue = underlying.getShort() & 0xFFFF; + bitPosition = 16; + } + else if (remainingCapacity == 1) { + currentValue = underlying.get() & 0xFF; + bitPosition = 8; + } + else { // There's no more data to read! 
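+            // give the refill callback one chance to provide more data before giving up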
+ refillCallback.run(); + if (underlying.hasRemaining()) { + readNext(); + } + else { + // We've attempted to re-fill the buffer, but there's still no data to read, so we fail to avoid + // blowing up the stack with recursion + throw new IllegalStateException("No more data to read after attempted re-fill of underlying buffer"); + } + + } + } +} diff --git a/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitWriter.java b/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitWriter.java new file mode 100644 index 00000000..57455541 --- /dev/null +++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitWriter.java @@ -0,0 +1,139 @@ +package nu.marginalia.sequence.io; + +import java.nio.ByteBuffer; + +/** A utility class for writing bits to a ByteBuffer + * out of alignment with octet boundaries + */ +public class BitWriter { + private final ByteBuffer underlying; + + /** The current value being encoded */ + private long currentValue; + /** Bit index in the current value */ + private int bitPosition; + + /** The total number of significant bytes that have been written to the buffer, + * the actual number of bytes may be larger than this value, but the trailing + * values should be ignored */ + private int totalMeaningfulBytes; + + public BitWriter(ByteBuffer workBuffer) { + this.underlying = workBuffer; + this.bitPosition = 0; + this.currentValue = 0; + this.totalMeaningfulBytes = 0; + + underlying.clear(); + } + + public void putBit(boolean value) { + if (value) { + currentValue = 1 | (currentValue << 1); + } + else { + currentValue <<= 1; + } + + // If we've exceeded the integer size, write it to the buffer + // and start over with the next integer + + if (++bitPosition == 64) { + underlying.putLong(currentValue); + totalMeaningfulBytes+=8; + + bitPosition = 0; + currentValue = 0; + } + } + + /** Write the lowest width bits of the value to the buffer */ + public void putBits(int value, int width) { + assert width <= 32 : "Attempting to write more than 32 bits from a single integer"; + + int rem = (64 - bitPosition); + + if (rem < width) { // The value is split between two integers + // write the first part of the byte + currentValue = (currentValue << rem) | (value >>> (width - rem)); + + // switch to the next integer + underlying.putLong(currentValue); + totalMeaningfulBytes+=8; + + // write the remaining part to currentValue + currentValue = value & ((1L << (width - rem)) - 1); + bitPosition = width - rem; + } + else { // The entire value fits in the current integer + currentValue <<= width; + currentValue |= (value & ((1L << width) - 1)); + bitPosition += width; + } + } + + static int numberOfSignificantBits(int value) { + // we could also do 1 + Integer.numberOfTrailingZeros(Integer.highestOneBit(value)) + // but it's doubtful it makes much of a difference either way + + return Integer.SIZE - Integer.numberOfLeadingZeros(value); + } + + /** Write the provided value in a Elias gamma-coded format, + * e.g. by first finding the number of significant bits, + * then writing that many zeroes, then the bits themselves + */ + public void putGamma(int value) { + assert value > 0 : "Attempting to write an Elias gamma coded value less than or equal to zero"; + + int bits = numberOfSignificantBits(value); + + putBits(0, bits - 1); + putBits(value, bits); + } + + /** Write the provided value in an Elias delta-coded format, + * e.g. 
by first writing the number of significant bits as a gamma coded value, + * then writing the bits themselves + */ + public void putDelta(int value) { + assert value > 0 : "Attempting to write an Elias delta coded value less than or equal to zero"; + + int bits = numberOfSignificantBits(value); + + assert bits >= 1; // invariant + + putGamma(bits); + putBits(value, bits); + } + + /** Flush the changes to the writer's internal buffer and + * return the buffer, ready for reading. If the internal buffer + * is intended to be re-used, the returned value should be copied + * to a new buffer by the caller. + */ + public ByteBuffer finish() { + finishLastByte(); + + underlying.position(0); + underlying.limit(totalMeaningfulBytes); + + return underlying; + } + + + /** Finish writing any partially written bit fields to the buffer */ + public void finishLastByte() { + // It's possible we have a few bits left over that have yet to be written + // to the underlying buffer. We need to write them out now. + + if (bitPosition > 0) { + totalMeaningfulBytes += bitPosition / 8 + ((bitPosition % 8 == 0) ? 0 : 1); + underlying.putLong(currentValue << (64 - bitPosition)); + } + + // Reset the bit position to reflect that we've written the last byte + bitPosition = 0; + } + +} diff --git a/code/libraries/coded-sequence/java/nu/marginalia/sequence/slop/GammaCodedSequenceArrayColumn.java b/code/libraries/coded-sequence/java/nu/marginalia/sequence/slop/GammaCodedSequenceArrayColumn.java new file mode 100644 index 00000000..ba31564e --- /dev/null +++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/slop/GammaCodedSequenceArrayColumn.java @@ -0,0 +1,154 @@ +package nu.marginalia.sequence.slop; + +import nu.marginalia.sequence.GammaCodedSequence; +import nu.marginalia.slop.column.AbstractColumn; +import nu.marginalia.slop.column.AbstractObjectColumn; +import nu.marginalia.slop.column.ObjectColumnReader; +import nu.marginalia.slop.column.ObjectColumnWriter; +import nu.marginalia.slop.column.dynamic.VarintColumn; +import nu.marginalia.slop.desc.ColumnFunction; +import nu.marginalia.slop.desc.StorageType; + +import java.io.IOException; +import java.net.URI; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +/** Slop column extension for storing GammaCodedSequence objects. 
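+ * Each row is a list of sequences, stored as a group length in a varint column alongside the flattened sequence data.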
*/ +public class GammaCodedSequenceArrayColumn extends AbstractObjectColumn, GammaCodedSequenceArrayColumn.Reader, GammaCodedSequenceArrayColumn.Writer> { + + private final VarintColumn groupsColumn; + private final GammaCodedSequenceColumn dataColumn; + + public GammaCodedSequenceArrayColumn(String name) { + this(name, StorageType.PLAIN); + } + + public GammaCodedSequenceArrayColumn(String name, StorageType storageType) { + super(name, + "gcs[]", + ByteOrder.nativeOrder(), + ColumnFunction.DATA, + storageType); + + groupsColumn = new VarintColumn(name, ColumnFunction.GROUP_LENGTH, storageType); + dataColumn = new GammaCodedSequenceColumn(name); + } + + public Writer createUnregistered(Path path, int page) throws IOException { + return new Writer( + dataColumn.createUnregistered(path, page), + groupsColumn.createUnregistered(path, page) + ); + } + + public Reader openUnregistered(URI uri, int page) throws IOException { + return new Reader( + dataColumn.openUnregistered(uri, page), + groupsColumn.openUnregistered(uri, page) + ); + } + + + public class Writer implements ObjectColumnWriter> { + private final VarintColumn.Writer groupsWriter; + private final GammaCodedSequenceColumn.Writer dataWriter; + + Writer(GammaCodedSequenceColumn.Writer dataWriter, VarintColumn.Writer groupsWriter) + { + this.groupsWriter = groupsWriter; + this.dataWriter = dataWriter; + } + + @Override + public AbstractColumn columnDesc() { + return GammaCodedSequenceArrayColumn.this; + } + + @Override + public void put(List sequences) throws IOException { + groupsWriter.put(sequences.size()); + for (GammaCodedSequence sequence : sequences) { + dataWriter.put(sequence); + } + } + + public long position() { + return groupsWriter.position(); + } + + public void close() throws IOException { + dataWriter.close(); + groupsWriter.close(); + } + } + + public class Reader implements ObjectColumnReader> { + private final GammaCodedSequenceColumn.Reader dataReader; + private final VarintColumn.Reader groupsReader; + + public Reader(GammaCodedSequenceColumn.Reader dataReader, VarintColumn.Reader groupsReader) { + this.dataReader = dataReader; + this.groupsReader = groupsReader; + } + + @Override + public AbstractColumn columnDesc() { + return GammaCodedSequenceArrayColumn.this; + } + + @Override + public void skip(long positions) throws IOException { + int toSkip = 0; + for (int i = 0; i < positions; i++) { + toSkip += groupsReader.get(); + } + dataReader.skip(toSkip); + } + + @Override + public boolean hasRemaining() throws IOException { + return groupsReader.hasRemaining(); + } + + public long position() throws IOException { + return groupsReader.position(); + } + + @Override + public List get() throws IOException { + int count = groupsReader.get(); + var ret = new ArrayList(count); + + for (int i = 0; i < count; i++) { + ret.add(dataReader.get()); + } + + return ret; + } + + public List getData(ByteBuffer workArea) throws IOException { + int count = groupsReader.get(); + var ret = new ArrayList(count); + + for (int i = 0; i < count; i++) { + int start = workArea.position(); + dataReader.getData(workArea); + var slice = workArea.slice(start, workArea.position() - start); + ret.add(slice); + } + + return ret; + } + + + public void close() throws IOException { + dataReader.close(); + groupsReader.close(); + } + + } +} diff --git a/code/libraries/coded-sequence/java/nu/marginalia/sequence/slop/GammaCodedSequenceColumn.java b/code/libraries/coded-sequence/java/nu/marginalia/sequence/slop/GammaCodedSequenceColumn.java new file mode 
100644 index 00000000..b94180ca --- /dev/null +++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/slop/GammaCodedSequenceColumn.java @@ -0,0 +1,148 @@ +package nu.marginalia.sequence.slop; + +import nu.marginalia.sequence.GammaCodedSequence; +import nu.marginalia.slop.column.AbstractColumn; +import nu.marginalia.slop.column.AbstractObjectColumn; +import nu.marginalia.slop.column.ObjectColumnReader; +import nu.marginalia.slop.column.ObjectColumnWriter; +import nu.marginalia.slop.column.dynamic.VarintColumn; +import nu.marginalia.slop.desc.ColumnFunction; +import nu.marginalia.slop.desc.StorageType; +import nu.marginalia.slop.storage.Storage; +import nu.marginalia.slop.storage.StorageReader; +import nu.marginalia.slop.storage.StorageWriter; + +import java.io.IOException; +import java.net.URI; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.file.Path; + +/** Slop column extension for storing GammaCodedSequence objects. */ +public class GammaCodedSequenceColumn extends AbstractObjectColumn { + + private final VarintColumn indexColumn; + + public GammaCodedSequenceColumn(String name) { + this(name, StorageType.PLAIN); + } + + public GammaCodedSequenceColumn(String name, StorageType storageType) { + super(name, + "gamma", + ByteOrder.nativeOrder(), + ColumnFunction.DATA, + storageType); + + indexColumn = new VarintColumn(name, ColumnFunction.DATA_LEN, StorageType.PLAIN); + } + + public Writer createUnregistered(Path path, int page) throws IOException { + return new Writer( + Storage.writer(path, this, page), + indexColumn.createUnregistered(path, page) + ); + } + + public Reader openUnregistered(URI uri, int page) throws IOException { + return new Reader( + Storage.reader(uri, this, page, false), + indexColumn.openUnregistered(uri, page) + ); + } + + public class Writer implements ObjectColumnWriter { + private final VarintColumn.Writer indexWriter; + private final StorageWriter storage; + + public Writer(StorageWriter storage, + VarintColumn.Writer indexWriter) + { + this.storage = storage; + + this.indexWriter = indexWriter; + } + + @Override + public AbstractColumn columnDesc() { + return GammaCodedSequenceColumn.this; + } + + @Override + public void put(GammaCodedSequence sequence) throws IOException { + var buffer = sequence.buffer(); + int length = buffer.remaining(); + + indexWriter.put(length); + storage.putBytes(buffer); + } + + public long position() { + return indexWriter.position(); + } + + public void close() throws IOException { + indexWriter.close(); + storage.close(); + } + } + + public class Reader implements ObjectColumnReader { + private final VarintColumn.Reader indexReader; + private final StorageReader storage; + + Reader(StorageReader reader, VarintColumn.Reader indexReader) throws IOException { + this.storage = reader; + this.indexReader = indexReader; + } + + @Override + public AbstractColumn columnDesc() { + return GammaCodedSequenceColumn.this; + } + + @Override + public void skip(long positions) throws IOException { + for (int i = 0; i < positions; i++) { + int size = indexReader.get(); + storage.skip(size, 1); + } + } + + @Override + public boolean hasRemaining() throws IOException { + return indexReader.hasRemaining(); + } + + public long position() throws IOException { + return indexReader.position(); + } + + @Override + public GammaCodedSequence get() throws IOException { + int size = indexReader.get(); + + ByteBuffer dest = ByteBuffer.allocate(size); + storage.getBytes(dest); + dest.flip(); + + return new 
GammaCodedSequence(dest); + } + + public void getData(ByteBuffer workArea) throws IOException { + int size = indexReader.get(); + + int oldLimit = workArea.limit(); + workArea.limit(workArea.position() + size); + storage.getBytes(workArea); + workArea.limit(oldLimit); + } + + + public void close() throws IOException { + indexReader.close(); + storage.close(); + } + + } +} diff --git a/code/libraries/coded-sequence/java/nu/marginalia/sequence/slop/VarintCodedSequenceArrayColumn.java b/code/libraries/coded-sequence/java/nu/marginalia/sequence/slop/VarintCodedSequenceArrayColumn.java new file mode 100644 index 00000000..1d8141d7 --- /dev/null +++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/slop/VarintCodedSequenceArrayColumn.java @@ -0,0 +1,154 @@ +package nu.marginalia.sequence.slop; + +import nu.marginalia.sequence.VarintCodedSequence; +import nu.marginalia.slop.column.AbstractColumn; +import nu.marginalia.slop.column.AbstractObjectColumn; +import nu.marginalia.slop.column.ObjectColumnReader; +import nu.marginalia.slop.column.ObjectColumnWriter; +import nu.marginalia.slop.column.dynamic.VarintColumn; +import nu.marginalia.slop.desc.ColumnFunction; +import nu.marginalia.slop.desc.StorageType; + +import java.io.IOException; +import java.net.URI; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; + +/** Slop column extension for storing GammaCodedSequence objects. */ +public class VarintCodedSequenceArrayColumn extends AbstractObjectColumn, VarintCodedSequenceArrayColumn.Reader, VarintCodedSequenceArrayColumn.Writer> { + + private final VarintColumn groupsColumn; + private final VarintCodedSequenceColumn dataColumn; + + public VarintCodedSequenceArrayColumn(String name) { + this(name, StorageType.PLAIN); + } + + public VarintCodedSequenceArrayColumn(String name, StorageType storageType) { + super(name, + "vcs[]", + ByteOrder.nativeOrder(), + ColumnFunction.DATA, + storageType); + + groupsColumn = new VarintColumn(name, ColumnFunction.GROUP_LENGTH, storageType); + dataColumn = new VarintCodedSequenceColumn(name); + } + + public Writer createUnregistered(Path path, int page) throws IOException { + return new Writer( + dataColumn.createUnregistered(path, page), + groupsColumn.createUnregistered(path, page) + ); + } + + public Reader openUnregistered(URI uri, int page) throws IOException { + return new Reader( + dataColumn.openUnregistered(uri, page), + groupsColumn.openUnregistered(uri, page) + ); + } + + + public class Writer implements ObjectColumnWriter> { + private final VarintColumn.Writer groupsWriter; + private final VarintCodedSequenceColumn.Writer dataWriter; + + Writer(VarintCodedSequenceColumn.Writer dataWriter, VarintColumn.Writer groupsWriter) + { + this.groupsWriter = groupsWriter; + this.dataWriter = dataWriter; + } + + @Override + public AbstractColumn columnDesc() { + return VarintCodedSequenceArrayColumn.this; + } + + @Override + public void put(List sequences) throws IOException { + groupsWriter.put(sequences.size()); + for (VarintCodedSequence sequence : sequences) { + dataWriter.put(sequence); + } + } + + public long position() { + return groupsWriter.position(); + } + + public void close() throws IOException { + dataWriter.close(); + groupsWriter.close(); + } + } + + public class Reader implements ObjectColumnReader> { + private final VarintCodedSequenceColumn.Reader dataReader; + private final VarintColumn.Reader groupsReader; + + public 
Reader(VarintCodedSequenceColumn.Reader dataReader, VarintColumn.Reader groupsReader) { + this.dataReader = dataReader; + this.groupsReader = groupsReader; + } + + @Override + public AbstractColumn columnDesc() { + return VarintCodedSequenceArrayColumn.this; + } + + @Override + public void skip(long positions) throws IOException { + int toSkip = 0; + for (int i = 0; i < positions; i++) { + toSkip += groupsReader.get(); + } + dataReader.skip(toSkip); + } + + @Override + public boolean hasRemaining() throws IOException { + return groupsReader.hasRemaining(); + } + + public long position() throws IOException { + return groupsReader.position(); + } + + @Override + public List get() throws IOException { + int count = groupsReader.get(); + var ret = new ArrayList(count); + + for (int i = 0; i < count; i++) { + ret.add(dataReader.get()); + } + + return ret; + } + + public List getData(ByteBuffer workArea) throws IOException { + int count = groupsReader.get(); + var ret = new ArrayList(count); + + for (int i = 0; i < count; i++) { + int start = workArea.position(); + dataReader.getData(workArea); + var slice = workArea.slice(start, workArea.position() - start); + ret.add(slice); + } + + return ret; + } + + + public void close() throws IOException { + dataReader.close(); + groupsReader.close(); + } + + } +} diff --git a/code/libraries/coded-sequence/java/nu/marginalia/sequence/slop/VarintCodedSequenceColumn.java b/code/libraries/coded-sequence/java/nu/marginalia/sequence/slop/VarintCodedSequenceColumn.java new file mode 100644 index 00000000..a4363fc3 --- /dev/null +++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/slop/VarintCodedSequenceColumn.java @@ -0,0 +1,148 @@ +package nu.marginalia.sequence.slop; + +import nu.marginalia.sequence.VarintCodedSequence; +import nu.marginalia.slop.column.AbstractColumn; +import nu.marginalia.slop.column.AbstractObjectColumn; +import nu.marginalia.slop.column.ObjectColumnReader; +import nu.marginalia.slop.column.ObjectColumnWriter; +import nu.marginalia.slop.column.dynamic.VarintColumn; +import nu.marginalia.slop.desc.ColumnFunction; +import nu.marginalia.slop.desc.StorageType; +import nu.marginalia.slop.storage.Storage; +import nu.marginalia.slop.storage.StorageReader; +import nu.marginalia.slop.storage.StorageWriter; + +import java.io.IOException; +import java.net.URI; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.file.Path; + +/** Slop column extension for storing GammaCodedSequence objects. 
*/ +public class VarintCodedSequenceColumn extends AbstractObjectColumn { + + private final VarintColumn indexColumn; + + public VarintCodedSequenceColumn(String name) { + this(name, StorageType.PLAIN); + } + + public VarintCodedSequenceColumn(String name, StorageType storageType) { + super(name, + "vcs", + ByteOrder.nativeOrder(), + ColumnFunction.DATA, + storageType); + + indexColumn = new VarintColumn(name, ColumnFunction.DATA_LEN, StorageType.PLAIN); + } + + public Writer createUnregistered(Path path, int page) throws IOException { + return new Writer( + Storage.writer(path, this, page), + indexColumn.createUnregistered(path, page) + ); + } + + public Reader openUnregistered(URI uri, int page) throws IOException { + return new Reader( + Storage.reader(uri, this, page, false), + indexColumn.openUnregistered(uri, page) + ); + } + + public class Writer implements ObjectColumnWriter { + private final VarintColumn.Writer indexWriter; + private final StorageWriter storage; + + public Writer(StorageWriter storage, + VarintColumn.Writer indexWriter) + { + this.storage = storage; + + this.indexWriter = indexWriter; + } + + @Override + public AbstractColumn columnDesc() { + return VarintCodedSequenceColumn.this; + } + + @Override + public void put(VarintCodedSequence sequence) throws IOException { + var buffer = sequence.buffer(); + int length = buffer.remaining(); + + indexWriter.put(length); + storage.putBytes(buffer); + } + + public long position() { + return indexWriter.position(); + } + + public void close() throws IOException { + indexWriter.close(); + storage.close(); + } + } + + public class Reader implements ObjectColumnReader { + private final VarintColumn.Reader indexReader; + private final StorageReader storage; + + Reader(StorageReader reader, VarintColumn.Reader indexReader) throws IOException { + this.storage = reader; + this.indexReader = indexReader; + } + + @Override + public AbstractColumn columnDesc() { + return VarintCodedSequenceColumn.this; + } + + @Override + public void skip(long positions) throws IOException { + for (int i = 0; i < positions; i++) { + int size = indexReader.get(); + storage.skip(size, 1); + } + } + + @Override + public boolean hasRemaining() throws IOException { + return indexReader.hasRemaining(); + } + + public long position() throws IOException { + return indexReader.position(); + } + + @Override + public VarintCodedSequence get() throws IOException { + int size = indexReader.get(); + + ByteBuffer dest = ByteBuffer.allocate(size); + storage.getBytes(dest); + dest.flip(); + + return new VarintCodedSequence(dest); + } + + public void getData(ByteBuffer workArea) throws IOException { + int size = indexReader.get(); + + int oldLimit = workArea.limit(); + workArea.limit(workArea.position() + size); + storage.getBytes(workArea); + workArea.limit(oldLimit); + } + + + public void close() throws IOException { + indexReader.close(); + storage.close(); + } + + } +} diff --git a/code/libraries/coded-sequence/readme.md b/code/libraries/coded-sequence/readme.md new file mode 100644 index 00000000..c08b4645 --- /dev/null +++ b/code/libraries/coded-sequence/readme.md @@ -0,0 +1,49 @@ +The coded-sequence library offers tools for encoding sequences +of integers with a variable-length encoding. + +The Elias Gamma code is supported: +https://en.wikipedia.org/wiki/Elias_gamma_coding + +The `GammaCodedSequence` class stores a sequence of ascending +non-negative integers in a byte buffer. 
The encoding also +stores the length of the sequence (as a gamma-coded value), +which is used in decoding. + +Sequences are encoded with the `GammaCodedSequence.generate()` method, +and require a temporary buffer to work in. +```java +// allocate a temporary buffer to work in, this is reused +// for all operations and will not hold the final result +ByteBuffer workArea = ByteBuffer.allocate(1024); + +// create a new GammaCodedSequence with the given values +var gcs = GammaCodedSequence.generate(workArea, 1, 3, 4, 7, 10); +``` + +The `GammaCodedSequence` class provides methods to query the +sequence, iterate over the values, and access the underlying +binary representation. + +```java +// query the sequence +int valueCount = gcs.valueCount(); +int bufferSize = gcs.bufferSize(); + +// iterate over the values +IntIterator iter = gcs.iterator(); +IntList values = gcs.values(); + +// access the underlying data (e.g. for writing) +byte[] bytes = gcs.bytes(); +ByteBuffer buffer = gcs.buffer(); +``` + +The `GammaCodedSequence` class also provides methods to decode +a sequence from a byte buffer or byte array. + +```java +// decode the data +var decodedGcs1 = new GammaCodedSequence(buffer); +var decodedGcs2 = new GammaCodedSequence(buffer, start, end); +var decodedGcs3 = new GammaCodedSequence(bytes); +``` \ No newline at end of file diff --git a/code/libraries/coded-sequence/src/jmh/java/nu/marginalia/bench/SequenceBenchmarks.java b/code/libraries/coded-sequence/src/jmh/java/nu/marginalia/bench/SequenceBenchmarks.java new file mode 100644 index 00000000..69ebbb3b --- /dev/null +++ b/code/libraries/coded-sequence/src/jmh/java/nu/marginalia/bench/SequenceBenchmarks.java @@ -0,0 +1,90 @@ +package nu.marginalia.bench; + +import it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.ints.IntList; +import nu.marginalia.sequence.GammaCodedSequence; +import nu.marginalia.sequence.VarintCodedSequence; +import org.openjdk.jmh.annotations.*; + +import java.nio.ByteBuffer; + +public class SequenceBenchmarks { + + @State(Scope.Benchmark) + public static class SequenceState { + VarintCodedSequence vcs; + GammaCodedSequence gcs; + IntList list; + ByteBuffer workArea; + int[] arrayValues; + int[] valueBuffer; + public SequenceState() + { + valueBuffer = new int[128]; + + workArea = ByteBuffer.allocate(65536); + arrayValues = new int[] { 1, 3, 5, 16, 1024, 2048, 4096, 4098, 4100 }; + list = new IntArrayList(arrayValues); + vcs = VarintCodedSequence.generate(16,21,24,28,66,71,76,83,87,98,101,106,113,115,119,122,143,148,159,164,167,177,182,211,223,242,245,250,273,275,280,289,292,300,307,322,330,338,345,371,397,402,411,420,427,430,433,437,440,448,451,481,490,513,522,555,571,573,585,597,606,613,634,638,640,644,656,660,666,683,689,692,696,709,712,718,727,731,735,738); + gcs = GammaCodedSequence.generate(workArea, 16,21,24,28,66,71,76,83,87,98,101,106,113,115,119,122,143,148,159,164,167,177,182,211,223,242,245,250,273,275,280,289,292,300,307,322,330,338,345,371,397,402,411,420,427,430,433,437,440,448,451,481,490,513,522,555,571,573,585,597,606,613,634,638,640,644,656,660,666,683,689,692,696,709,712,718,727,731,735,738); + } + } + + @Fork(value = 1, warmups = 1) + @Warmup(iterations = 1) + @Benchmark + @BenchmarkMode(Mode.Throughput) + public int vcsDecode(SequenceState state) { + var iter = state.vcs.iterator(); + int sum = 0; + while (iter.hasNext()) { + sum += iter.nextInt(); + } + return sum; + } +// +// @Fork(value = 5, warmups = 5) +// @Warmup(iterations = 5) +// @Benchmark +// @BenchmarkMode(Mode.Throughput) 
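+// (the benchmark methods return the running sum so the JIT cannot dead-code-eliminate the decode loop)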
+// public int listDecode2(SequenceState state) { +// var list = state.arrayValues; +// int sum = 0; +// for (int i = 0; i < list.length; i++) { +// sum += list[i]; +// } +// return sum; +// } + + + @Fork(value = 1, warmups = 1) + @Warmup(iterations = 1) + @Benchmark + @BenchmarkMode(Mode.Throughput) + public int gcsDecode(SequenceState state) { + var iter = state.gcs.iterator(); + int sum = 0; + while (iter.hasNext()) { + sum += iter.nextInt(); + } + return sum; + } + +// @Fork(value = 1, warmups = 1) +// @Warmup(iterations = 1) +// @Benchmark +// @BenchmarkMode(Mode.Throughput) +// public VarintCodedSequence vcsEncode(SequenceState state) { +// return VarintCodedSequence.generate(1, 3, 5, 16, 1024, 2048, 4096, 4098, 4100); +// } + +// @Fork(value = 1, warmups = 1) +// @Warmup(iterations = 1) +// @Benchmark +// @BenchmarkMode(Mode.Throughput) +// public GammaCodedSequence gcsEncode(SequenceState state) { +// return GammaCodedSequence.generate(state.workArea, 1, 3, 5, 16, 1024, 2048, 4096, 4098, 4100); +// } + + +} diff --git a/code/libraries/coded-sequence/test/nu/marginalia/sequence/BitReaderTest.java b/code/libraries/coded-sequence/test/nu/marginalia/sequence/BitReaderTest.java new file mode 100644 index 00000000..5488ffb8 --- /dev/null +++ b/code/libraries/coded-sequence/test/nu/marginalia/sequence/BitReaderTest.java @@ -0,0 +1,139 @@ +package nu.marginalia.sequence; + +import nu.marginalia.sequence.io.BitReader; +import nu.marginalia.sequence.io.BitWriter; +import org.junit.jupiter.api.Test; + +import java.nio.ByteBuffer; + +import static org.junit.jupiter.api.Assertions.*; + +class BitReaderTest { + + @Test + void getBit() { + var writer = new BitWriter(ByteBuffer.allocate(1024)); + writer.putBit(true); + writer.putBit(false); + writer.putBits(0, 32); + writer.putBit(true); + writer.putBit(false); + var buffer = writer.finish(); + + var reader = new BitReader(buffer); + assertTrue(reader.getBit()); + assertFalse(reader.getBit()); + for (int i = 0; i < 32; i++) { + assertFalse(reader.getBit()); + } + assertTrue(reader.getBit()); + assertFalse(reader.getBit()); + } + + @Test + void getInByte() { + var writer = new BitWriter(ByteBuffer.allocate(1024)); + + writer.putBit(true); + writer.putBit(false); + + var buffer = writer.finish(); + + var reader = new BitReader(buffer); + int val = reader.get(2); + assertEquals(0b10, val); + } + + @Test + void get() { + var writer = new BitWriter(ByteBuffer.allocate(1024)); + writer.putBit(true); + writer.putBit(false); + writer.putBits(0, 32); + writer.putBit(true); + writer.putBit(false); + var buffer = writer.finish(); + + var reader = new BitReader(buffer); + int val = reader.get(4); + assertEquals(0b1000, val); + + val = reader.get(30); + assertEquals(0b000, val); + + val = reader.get(2); + assertEquals(0b10, val); + } + + @Test + void getSevens() { + // Fuzz test that probes int32 misalignments + var writer = new BitWriter(ByteBuffer.allocate(1024)); + + for (int i = 0; i < 729; i++) { + writer.putBit(true); + writer.putBit(false); + writer.putBit(false); + writer.putBit(true); + writer.putBit(false); + writer.putBit(false); + writer.putBit(true); + } + + var buffer = writer.finish(); + + var reader = new BitReader(buffer); + + for (int i = 0; i < 729; i++) { + int val = reader.get(7); + assertEquals(0b1001001, val); + } + } + + @Test + void getSevens2() { + // Fuzz test that probes int32 misalignments + var writer = new BitWriter(ByteBuffer.allocate(1024)); + + for (int i = 0; i < 729; i++) { + writer.putBits(73, 7); + } + + var buffer = 
writer.finish(); + + var reader = new BitReader(buffer); + + for (int i = 0; i < 729; i++) { + int val = reader.get(7); + assertEquals(0b1001001, val); + } + } + + @Test + public void testTakeWhileZero() { + var writer = new BitWriter(ByteBuffer.allocate(1024)); + writer.putBits(0, 4); + writer.putBit(true); + var buffer = writer.finish(); + + var reader = new BitReader(buffer); + int val = reader.takeWhileZero(); + assertEquals(4, val); + assertTrue(reader.getBit()); + } + + @Test + public void testTakeWhileZeroOverInt64() { + var writer = new BitWriter(ByteBuffer.allocate(1024)); + writer.putBits(0, 32); + writer.putBits(0, 32); + writer.putBits(0, 2); + writer.putBit(true); + var buffer = writer.finish(); + + var reader = new BitReader(buffer); + int val = reader.takeWhileZero(); + assertEquals(66, val); + assertTrue(reader.getBit()); + } +} \ No newline at end of file diff --git a/code/libraries/coded-sequence/test/nu/marginalia/sequence/BitWriterTest.java b/code/libraries/coded-sequence/test/nu/marginalia/sequence/BitWriterTest.java new file mode 100644 index 00000000..b5404ceb --- /dev/null +++ b/code/libraries/coded-sequence/test/nu/marginalia/sequence/BitWriterTest.java @@ -0,0 +1,344 @@ +package nu.marginalia.sequence; + +import nu.marginalia.sequence.io.BitReader; +import nu.marginalia.sequence.io.BitWriter; +import org.junit.jupiter.api.Test; + +import java.nio.ByteBuffer; +import java.util.Random; + +import static org.junit.jupiter.api.Assertions.*; + +class BitWriterTest { + + @Test + public void testPutBitsFullByte() { + var buffer = ByteBuffer.allocate(1024); + var writer = new BitWriter(buffer); + + writer.putBit(false); + writer.putBit(true); + writer.putBit(true); + writer.putBit(true); + writer.putBit(true); + writer.putBit(true); + writer.putBit(true); + writer.putBit(false); + + var out = writer.finish(); + + byte actual = out.get(0); + byte expected = (byte) 0b0111_1110; + + assertEquals(expected, actual); + assertEquals(1, out.limit()); + } + + @Test + public void testPutBitsPartialByte() { + var buffer = ByteBuffer.allocate(1024); + var writer = new BitWriter(buffer); + + writer.putBit(true); + writer.putBit(false); + writer.putBit(true); + writer.putBit(true); + writer.putBit(true); + writer.putBit(true); + writer.putBit(true); + + var out = writer.finish(); + + byte actual = out.get(0); + byte expected = (byte) 0b1011_1110; + + assertEquals(expected, actual, STR."was \{Integer.toBinaryString(actual & 0xFF)}"); + assertEquals(1, out.limit()); + } + + + @Test + public void testPutBitsOneAndAHalfByte() { + var buffer = ByteBuffer.allocate(1024); + var writer = new BitWriter(buffer); + + writer.putBit(true); + writer.putBit(false); + writer.putBit(true); + writer.putBit(true); + + writer.putBit(true); + writer.putBit(true); + writer.putBit(true); + writer.putBit(false); + + writer.putBit(true); + writer.putBit(true); + + var out = writer.finish(); + + assertEquals(2, out.limit()); + + byte actual1 = out.get(0); + byte actual2 = out.get(1); + byte expected1 = (byte) 0b1011_1110; + byte expected2 = (byte) 0b1100_0000; + + assertEquals(expected1, actual1, STR."was \{Integer.toBinaryString(actual1 & 0xFF)}"); + assertEquals(expected2, actual2, STR."was \{Integer.toBinaryString(actual2 & 0xFF)}"); + + } + + + @Test + public void testPutBitsIntOverflow() { + var buffer = ByteBuffer.allocate(1024); + var writer = new BitWriter(buffer); + + for (int i = 0; i < 4; i++) { + writer.putBit(true); + writer.putBit(false); + writer.putBit(true); + writer.putBit(true); + + 
writer.putBit(true); + writer.putBit(true); + writer.putBit(true); + writer.putBit(false); + } + writer.putBit(true); + writer.putBit(true); + + + var out = writer.finish(); + + assertEquals(5, out.limit()); + + for (int i = 0; i < 4; i++) { + byte actual1 = out.get(i); + byte expected1 = (byte) 0b1011_1110; + + assertEquals(expected1, actual1, STR."was \{Integer.toBinaryString(actual1 & 0xFF)}"); + } + + byte actual2 = out.get(4); + byte expected2 = (byte) 0b1100_0000; + + assertEquals(expected2, actual2, STR."was \{Integer.toBinaryString(actual2 & 0xFF)}"); + + } + + @Test + public void testPut1() { + var buffer = ByteBuffer.allocate(1024); + var writer = new BitWriter(buffer); + + writer.putBits(1, 1); + var ret = writer.finish(); + assertEquals(1, ret.limit()); + assertEquals((byte)0b1000_0000, ret.get(0)); + } + + @Test + public void testPut4() { + var buffer = ByteBuffer.allocate(1024); + var writer = new BitWriter(buffer); + + writer.putBits(1, 4); + var ret = writer.finish(); + assertEquals(1, ret.limit()); + assertEquals((byte)0b0001_0000, ret.get(0)); + } + + @Test + public void testPut8() { + var buffer = ByteBuffer.allocate(1024); + var writer = new BitWriter(buffer); + + writer.putBits(3, 8); + var ret = writer.finish(); + assertEquals(1, ret.limit()); + assertEquals((byte)0b0000_0011, ret.get(0)); + } + + @Test + public void testPut8_2() { + var buffer = ByteBuffer.allocate(1024); + var writer = new BitWriter(buffer); + + writer.putBits(~0, 8); + var ret = writer.finish(); + assertEquals(1, ret.limit()); + assertEquals((byte)0b1111_1111, ret.get(0)); + } + + @Test + public void testPut8_3() { + var buffer = ByteBuffer.allocate(1024); + var writer = new BitWriter(buffer); + + writer.putBits(~0, 8); + writer.putBits(0, 8); + writer.putBits(~0, 8); + writer.putBits(1, 1); + + var ret = writer.finish(); + + assertEquals(4, ret.limit()); + assertEquals((byte)0b1111_1111, ret.get(0)); + assertEquals((byte)0, ret.get(1)); + assertEquals((byte)0b1111_1111, ret.get(2)); + assertEquals((byte)0b1000_0000, ret.get(3)); + } + + @Test + public void testIntOverflow() { + var buffer = ByteBuffer.allocate(1024); + var writer = new BitWriter(buffer); + + writer.putBits(~0, 24); + writer.putBits(0, 16); + writer.putBits(1, 1); + + var ret = writer.finish(); + + assertEquals(6, ret.limit()); + assertEquals((byte)0b1111_1111, ret.get(0)); + assertEquals((byte)0b1111_1111, ret.get(1)); + assertEquals((byte)0b1111_1111, ret.get(2)); + assertEquals((byte)0, ret.get(3)); + assertEquals((byte)0, ret.get(4)); + assertEquals((byte)0b1000_0000, ret.get(5)); + } + + @Test + public void testIntOverflowMisaligned() { + var buffer = ByteBuffer.allocate(1024); + var writer = new BitWriter(buffer); + + writer.putBits(0, 2); + writer.putBits(~0, 24); + writer.putBits(0, 16); + writer.putBits(1, 1); + + var ret = writer.finish(); + + assertEquals(6, ret.limit()); + assertEquals((byte)0b0011_1111, ret.get(0)); + assertEquals((byte)0b1111_1111, ret.get(1)); + assertEquals((byte)0b1111_1111, ret.get(2)); + assertEquals((byte)0b1100_0000, ret.get(3)); + assertEquals((byte)0, ret.get(4)); + assertEquals((byte)0b0010_0000, ret.get(5)); + } + + @Test + public void testFuzzCase1() { + var buffer = ByteBuffer.allocate(1024); + var writer = new BitWriter(buffer); + + writer.putBits(1, 6); + writer.putBits(702, 11); + + var ret = writer.finish(); + + var reader = new BitReader(ret); + int a = reader.get(6); + int b = reader.get(11); + assertEquals(a, 1); + assertEquals(b, 702); + } + + @Test + public void testFuzzCase2() { 
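+        // eight zero bits written across two putBits calls must still flush as a single all-zero byte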
diff --git a/code/libraries/coded-sequence/test/nu/marginalia/sequence/GammaCodedSequenceTest.java b/code/libraries/coded-sequence/test/nu/marginalia/sequence/GammaCodedSequenceTest.java
new file mode 100644
--- /dev/null
+++ b/code/libraries/coded-sequence/test/nu/marginalia/sequence/GammaCodedSequenceTest.java
+package nu.marginalia.sequence;
+
+import org.junit.jupiter.api.Test;
+
+import java.nio.ByteBuffer;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Random;
+import java.util.stream.IntStream;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class GammaCodedSequenceTest {
+    ByteBuffer work = ByteBuffer.allocate(65536);
+
+    @Test
+    public void testCodec() {
+        var ret = GammaCodedSequence.encode(work, new int[] { 1, 3, 5, 16, 32, 64 });
+
+        List<Integer> decoded = new ArrayList<>();
+        List<Integer> expected = List.of(1, 3, 5, 16, 32, 64);
+
+        var sequence = new GammaCodedSequence.EliasGammaSequenceIterator(ret);
+        while (sequence.hasNext()) {
+            decoded.add(sequence.nextInt());
+        }
+
+        assertEquals(expected, decoded);
+    }
+
+    @Test
+    public void testCodecEmpty() {
+        var ret = GammaCodedSequence.encode(work, new int[] { });
+
+        List<Integer> decoded = new ArrayList<>();
+        List<Integer> expected = List.of();
+
+        var sequence = new GammaCodedSequence.EliasGammaSequenceIterator(ret);
+        while (sequence.hasNext()) {
+            decoded.add(sequence.nextInt());
+        }
+
+        assertEquals(expected, decoded);
+    }
+
+    @Test
+    public void valueCount() {
+        var ret = GammaCodedSequence.encode(work, new int[] { 1, 3, 5, 16, 32, 64 });
+        var count = GammaCodedSequence.EliasGammaSequenceIterator.readCount(ret);
+        assertEquals(6, count);
+    }
+
+    @Test
+    public void testCodec2() {
+        var ret = GammaCodedSequence.encode(work, new int[] { 1, 256 });
+
+        List<Integer> decoded = new ArrayList<>();
+        List<Integer> expected = List.of(1, 256);
+
+        var sequence = new GammaCodedSequence.EliasGammaSequenceIterator(ret);
+        while (sequence.hasNext()) {
+            decoded.add(sequence.nextInt());
+        }
+
+        assertEquals(expected, decoded);
+    }
+
+    @Test
+    public void fuzzTestCodec() {
+        Random r = new Random();
+        for (int i = 0; i < 1000; i++) {
+            int[] sequence = new int[2];
+            sequence[0] = 1;
+            sequence[1] = 1 + r.nextInt(1, 512);
+
+            var ret = GammaCodedSequence.encode(work, sequence);
+
+            List<Integer> decoded = new ArrayList<>();
+            List<Integer> expected = IntStream.of(sequence).boxed().toList();
+
+            try {
+                var codedData = new GammaCodedSequence.EliasGammaSequenceIterator(ret);
+                while (codedData.hasNext()) {
+                    decoded.add(codedData.nextInt());
+                }
+            }
+            catch (Exception e) {
+                fail("Exception thrown for " + Arrays.toString(sequence));
+            }
+
+            assertEquals(expected, decoded, "Expected " + expected + " but got " + decoded + " for " + Arrays.toString(sequence));
+
+            System.out.println(Arrays.toString(sequence) + " ok");
+        }
+    }
+
+}
\ No newline at end of file
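EliasGammaSequenceIterator's name points at the underlying code: Elias gamma coding, which writes a positive integer as a unary length prefix followed by the value's binary digits. The codec implementation is not included in this diff; the sketch below shows plain gamma coding of a single value, and it is an assumption here that GammaCodedSequence applies something like it to the deltas between successive (ascending) positions.

    import java.util.ArrayList;
    import java.util.List;

    /** Sketch of classic Elias gamma coding of one positive integer;
     *  illustrative only, not the GammaCodedSequence implementation. */
    class EliasGammaSketch {
        /** Encode as N prefix zeroes followed by the (N+1)-bit binary
         *  representation of the value, most significant bit first. */
        static List<Boolean> encode(int value) {
            if (value < 1) throw new IllegalArgumentException("gamma codes positive integers only");
            int n = 31 - Integer.numberOfLeadingZeros(value); // floor(log2(value))
            List<Boolean> bits = new ArrayList<>();
            for (int i = 0; i < n; i++) bits.add(false);      // unary length prefix
            for (int i = n; i >= 0; i--) bits.add(((value >>> i) & 1) != 0);
            return bits;
        }
    }

For example, encode(5) yields 00 101: two prefix zeroes, then 101 in binary. The value 1 costs a single bit, which is why a gamma-coded format favors small deltas between consecutive positions.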
diff --git a/code/libraries/coded-sequence/test/nu/marginalia/sequence/SequenceOperationsTest.java b/code/libraries/coded-sequence/test/nu/marginalia/sequence/SequenceOperationsTest.java
new file mode 100644
index 00000000..0db059c5
--- /dev/null
+++ b/code/libraries/coded-sequence/test/nu/marginalia/sequence/SequenceOperationsTest.java
@@ -0,0 +1,97 @@
+package nu.marginalia.sequence;
+
+import it.unimi.dsi.fastutil.ints.IntList;
+import org.junit.jupiter.api.Test;
+
+import java.nio.ByteBuffer;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class SequenceOperationsTest {
+
+    @Test
+    void intersectSequencesSingle() {
+        ByteBuffer wa = ByteBuffer.allocate(1024);
+        GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1);
+
+        assertTrue(SequenceOperations.intersectSequences(seq1.iterator()));
+    }
+
+    @Test
+    void intersectSequencesTrivialMatch() {
+        ByteBuffer wa = ByteBuffer.allocate(1024);
+        GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1);
+        GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 1);
+
+        assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator()));
+    }
+
+    @Test
+    void intersectSequencesTrivialMismatch() {
+        ByteBuffer wa = ByteBuffer.allocate(1024);
+        GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1);
+        GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2);
+
+        assertFalse(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator()));
+    }
+
+    @Test
+    void intersectSequencesOffsetMatch() {
+        ByteBuffer wa = ByteBuffer.allocate(1024);
+        GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1);
+        GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 3);
+
+        assertTrue(SequenceOperations.intersectSequences(seq1.offsetIterator(0), seq2.offsetIterator(-2)));
+    }
+
+    @Test
+    void intersectSequencesDeepMatch() {
+        ByteBuffer wa = ByteBuffer.allocate(1024);
+        GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1, 3, 4, 7, 8, 9, 11);
+        GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2, 5, 8, 14);
+
+        assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator()));
+    }
+
+    @Test
+    void intersectSequencesDeepMatch3() {
+        ByteBuffer wa = ByteBuffer.allocate(1024);
+        GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1, 3, 4, 7, 8, 9, 11);
+        GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2, 5, 8, 14);
+        GammaCodedSequence seq3 = GammaCodedSequence.generate(wa, 1, 5, 8, 9);
+
+        assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator(), seq3.iterator()));
+    }
+
+    @Test
+    void intersectSequencesDeepMatch3findIntersections() {
+        ByteBuffer wa = ByteBuffer.allocate(1024);
+        GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1, 3, 4, 7, 8, 9, 10, 11);
+        GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2, 5, 8, 10, 14);
+        GammaCodedSequence seq3 = GammaCodedSequence.generate(wa, 1, 5, 8, 9, 10);
+
+        assertEquals(IntList.of(8, 10), SequenceOperations.findIntersections(seq1.values(), seq2.values(), seq3.values()));
+    }
+
+    @Test
+    void intersectSequencesDeepMismatch() {
+        ByteBuffer wa = ByteBuffer.allocate(1024);
+        GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1, 3, 4, 7, 8, 9, 11);
+        GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2, 5, 14);
+
+        assertFalse(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator()));
+    }
+
+    @Test
+    void testMinDistance() {
+        ByteBuffer wa = ByteBuffer.allocate(1024);
+        GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 11, 80, 160);
+        GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 20, 50, 100);
+        GammaCodedSequence seq3 = GammaCodedSequence.generate(wa, 30, 60, 90);
+
+        assertEquals(19, SequenceOperations.minDistance(new IntList[]{seq1.values(), seq2.values(), seq3.values()}));
+    }
+}
\ No newline at end of file
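These tests define intersectSequences purely by behavior: it returns true iff all of the (sorted, possibly offset) position sequences share at least one common value. SequenceOperations itself is not in this diff; below is a sketch of the standard k-way intersection over sorted arrays that matches the tested semantics, with plain int[] standing in for the IntIterator-based API.

    /** Sketch of k-way intersection over sorted int arrays, matching the
     *  semantics the tests above exercise (true iff a common value exists). */
    class IntersectSketch {
        static boolean intersects(int[]... seqs) {
            int[] idx = new int[seqs.length]; // per-sequence cursor
            while (true) {
                int max = Integer.MIN_VALUE;
                for (int i = 0; i < seqs.length; i++) {
                    if (idx[i] >= seqs[i].length) return false; // one sequence exhausted
                    max = Math.max(max, seqs[i][idx[i]]);
                }
                boolean allEqual = true;
                for (int i = 0; i < seqs.length; i++) {
                    if (seqs[i][idx[i]] != max) allEqual = false;
                }
                if (allEqual) return true;
                // advance every cursor that is behind the current maximum
                for (int i = 0; i < seqs.length; i++) {
                    while (idx[i] < seqs[i].length && seqs[i][idx[i]] < max) idx[i]++;
                }
            }
        }
    }

Each pass either finds all cursors agreeing on the current maximum or advances at least one lagging cursor, so the loop terminates after at most the combined length of the inputs. The testMinDistance expectation follows the same style of reasoning: picking 11, 20, and 30 from the three sequences gives a spread of 19, the smallest achievable.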
diff --git a/code/libraries/coded-sequence/test/nu/marginalia/sequence/VarintCodedSequenceTest.java b/code/libraries/coded-sequence/test/nu/marginalia/sequence/VarintCodedSequenceTest.java
new file mode 100644
index 00000000..67554b04
--- /dev/null
+++ b/code/libraries/coded-sequence/test/nu/marginalia/sequence/VarintCodedSequenceTest.java
@@ -0,0 +1,50 @@
+package nu.marginalia.sequence;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class VarintCodedSequenceTest {
+    @Test
+    public void testSimple() {
+        var sequence = VarintCodedSequence.generate(1, 3, 5, 16, 1024, 2048, 40000, 268435446);
+
+        assertEquals(8, sequence.valueCount());
+
+        var values = sequence.values();
+        System.out.println(values);
+        assertEquals(1, values.getInt(0));
+        assertEquals(3, values.getInt(1));
+        assertEquals(5, values.getInt(2));
+        assertEquals(16, values.getInt(3));
+        assertEquals(1024, values.getInt(4));
+        assertEquals(2048, values.getInt(5));
+        assertEquals(40000, values.getInt(6));
+        assertEquals(268435446, values.getInt(7));
+
+        var iter = sequence.iterator();
+        assertEquals(1, iter.nextInt());
+        assertEquals(3, iter.nextInt());
+        assertEquals(5, iter.nextInt());
+        assertEquals(16, iter.nextInt());
+        assertEquals(1024, iter.nextInt());
+        assertEquals(2048, iter.nextInt());
+        assertEquals(40000, iter.nextInt());
+        assertEquals(268435446, iter.nextInt());
+    }
+
+    @Test
+    public void testEmpty() {
+        var sequence = VarintCodedSequence.generate();
+
+        assertEquals(0, sequence.valueCount());
+
+        var values = sequence.values();
+        assertTrue(values.isEmpty());
+
+        var iter = sequence.iterator();
+        assertFalse(iter.hasNext());
+    }
+}
\ No newline at end of file
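The new VarintCodedSequence covers the same kind of ascending position lists with variable-width integers instead of gamma-coded bit strings. Its exact wire format is not shown in this diff; as a reference point, a standard LEB128-style varint codec looks like the sketch below (the class and method names are illustrative, not Marginalia API).

    import java.nio.ByteBuffer;

    /** Sketch of a standard LEB128-style varint codec; VarintCodedSequence's
     *  actual wire format may differ, e.g. in how deltas and counts are stored. */
    class VarintSketch {
        static void write(ByteBuffer out, int value) {
            // emit 7 bits per byte, least significant group first;
            // the high bit of each byte signals continuation
            while ((value & ~0x7F) != 0) {
                out.put((byte) ((value & 0x7F) | 0x80));
                value >>>= 7;
            }
            out.put((byte) value);
        }

        static int read(ByteBuffer in) {
            int value = 0;
            for (int shift = 0; ; shift += 7) {
                byte b = in.get();
                value |= (b & 0x7F) << shift;
                if ((b & 0x80) == 0) return value;
            }
        }
    }

Under this scheme values below 128 cost one byte and 40000 costs three; the byte-aligned layout trades some density against a gamma code for much cheaper decoding.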

diff --git a/code/libraries/easy-lsh/java/nu/marginalia/lsh/EasyLSH.java b/code/libraries/easy-lsh/java/nu/marginalia/lsh/EasyLSH.java
index f380e9c5..e2a6238f 100644
--- a/code/libraries/easy-lsh/java/nu/marginalia/lsh/EasyLSH.java
+++ b/code/libraries/easy-lsh/java/nu/marginalia/lsh/EasyLSH.java
@@ -3,7 +3,7 @@ package nu.marginalia.lsh;
 /** This is a very simple locality sensitive hash for collections of Java objects.
  * <br>
  * The resulting LSH is a 64 bit value, whose hamming distance is a measure
- * of the similarity of the two collections, where smaller similarities imply
+ * of the similarity of the two collections, where a smaller distance implies
  * similarity.

* It hinges on a lot of relatively sketchy assumptions about Object$hashCode(). diff --git a/code/libraries/language-processing/java/nu/marginalia/language/WordDictionary.java b/code/libraries/language-processing/java/nu/marginalia/language/WordDictionary.java deleted file mode 100644 index 622c3b8c..00000000 --- a/code/libraries/language-processing/java/nu/marginalia/language/WordDictionary.java +++ /dev/null @@ -1,46 +0,0 @@ -package nu.marginalia.language; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.HashSet; -import java.util.Objects; -import java.util.Set; - -public class WordDictionary { - private final Set words; - private static final Logger logger = LoggerFactory.getLogger(WordDictionary.class); - - private WordDictionary(Set words) { - this.words = words; - } - - public static WordDictionary fromClasspathResource(String resourceName) { - var set = new HashSet(200, 0.5f); - - try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream(resourceName), - "Could not load word frequency table"); - var br = new BufferedReader(new InputStreamReader(resource)) - ) { - while (true) { - String s = br.readLine(); - - if (s == null) break; - if (s.isBlank()) continue; - - set.add(s.trim()); - } - } catch (IOException e) { - logger.warn("Failed to load resource " + resourceName, e); - } - - return new WordDictionary(set); - } - - public boolean contains(String str) { - return words.contains(str); - } -} diff --git a/code/libraries/language-processing/java/nu/marginalia/language/WordPatterns.java b/code/libraries/language-processing/java/nu/marginalia/language/WordPatterns.java index dbc8c9c8..c0990f22 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/WordPatterns.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/WordPatterns.java @@ -1,21 +1,12 @@ package nu.marginalia.language; -import org.apache.commons.lang3.StringUtils; - /** Logic for deciding which words are eligible to be keywords. - *

- * This is in dire need of oversight. Here be towering dragons with names, - * a skull next to their HP bar, and their own Mick Gordon soundtrack just - * for the battle. - * */ public class WordPatterns { public static final int MIN_WORD_LENGTH = 1; public static final int MAX_WORD_LENGTH = 64; public static final String WORD_TOKEN_JOINER = "_"; - private static final WordDictionary stopWords = - WordDictionary.fromClasspathResource("dictionary/en-stopwords"); /** Run checks on the word and exclude terms with too many special characters */ @@ -57,27 +48,13 @@ public class WordPatterns { return true; } + // Stopword exclusion has been moved to the index. We just filter out + // junk words here now. public static boolean isStopWord(String s) { - if (s.length() < MIN_WORD_LENGTH) { - return true; - } - if (!isNotJunkWord(s)) { return true; } - String sLc; - if (StringUtils.isAllLowerCase(s)) { - sLc = s; - } - else { - sLc = s.toLowerCase(); - } - - if (stopWords.contains(sLc)) { - return true; - } - return false; } diff --git a/code/libraries/language-processing/java/nu/marginalia/language/filter/FasttextLanguagePredictionModel.java b/code/libraries/language-processing/java/nu/marginalia/language/filter/FasttextLanguagePredictionModel.java index 5eca3c76..3956680d 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/filter/FasttextLanguagePredictionModel.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/filter/FasttextLanguagePredictionModel.java @@ -4,6 +4,7 @@ import com.github.jfasttext.JFastText; import nu.marginalia.LanguageModels; import nu.marginalia.language.model.DocumentLanguageData; +/** A language prediction model that uses a FastText model to predict the language of a document */ public class FasttextLanguagePredictionModel implements LanguagePredictionModel { private final JFastText jft = new JFastText(); @@ -13,7 +14,7 @@ public class FasttextLanguagePredictionModel implements LanguagePredictionModel @Override public double predictEnglish(DocumentLanguageData dld) { - if ("__label__en".equals(jft.predict(dld.text))) { + if ("__label__en".equals(jft.predict(dld.text()))) { return 1.0; } return 0.; diff --git a/code/libraries/language-processing/java/nu/marginalia/language/filter/LanguageFilter.java b/code/libraries/language-processing/java/nu/marginalia/language/filter/LanguageFilter.java index bf390e45..12dd45f9 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/filter/LanguageFilter.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/filter/LanguageFilter.java @@ -1,5 +1,7 @@ package nu.marginalia.language.filter; +import com.google.inject.Inject; +import com.google.inject.Singleton; import lombok.SneakyThrows; import nu.marginalia.LanguageModels; import nu.marginalia.language.encoding.UnicodeRanges; @@ -8,8 +10,6 @@ import org.jsoup.nodes.Document; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.google.inject.Inject; -import com.google.inject.Singleton; import java.util.Optional; import java.util.Set; @@ -31,10 +31,10 @@ public class LanguageFilter { if(LANGUAGE_DETECTION_MODEL_VERSION < 0) return 1.0; if (LANGUAGE_DETECTION_MODEL_VERSION == 1) { - return languagePredictionModel2.predictEnglish(dld); + return languagePredictionModel1.predictEnglish(dld); } else if (LANGUAGE_DETECTION_MODEL_VERSION == 2) { - return languagePredictionModel1.predictEnglish(dld); + return languagePredictionModel2.predictEnglish(dld); } else { // default is to run both models if 
(languagePredictionModel1.predictEnglish(dld) < 0.1)
diff --git a/code/libraries/language-processing/java/nu/marginalia/language/filter/UngaBungaLanguagePredictionModel.java b/code/libraries/language-processing/java/nu/marginalia/language/filter/UngaBungaLanguagePredictionModel.java
index 8b3c4567..6b72088f 100644
--- a/code/libraries/language-processing/java/nu/marginalia/language/filter/UngaBungaLanguagePredictionModel.java
+++ b/code/libraries/language-processing/java/nu/marginalia/language/filter/UngaBungaLanguagePredictionModel.java
@@ -8,10 +8,14 @@ import java.util.HashSet;
 import java.util.Objects;
 import java.util.Set;
 
+/** A simple language prediction model that uses a dictionary of English words
+ * and requires that a certain fraction of the words in the document be present
+ * in that dictionary for the document to be considered English.
+ * */
 public class UngaBungaLanguagePredictionModel implements LanguagePredictionModel {
     private static final Set<String> englishWords = new HashSet<>();
 
-    public UngaBungaLanguagePredictionModel() throws Exception {
+    public UngaBungaLanguagePredictionModel() {
         try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("dictionary/en-1000"),
                 "Could not load word frequency table");
              var br = new BufferedReader(new InputStreamReader(resource))
@@ -33,7 +37,7 @@ public class UngaBungaLanguagePredictionModel implements LanguagePredictionModel
         Set<String> seenWords = new HashSet<>();
         int englishCount = 0;
 
-        for (var sent : dld.sentences) {
+        for (var sent : dld) {
             for (var word : sent.wordsLowerCase) {
                 if (seenWords.add(word) && englishWords.contains(word)) {
                     englishCount++;
diff --git a/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentLanguageData.java b/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentLanguageData.java
index 2ad53f7a..6ef10c25 100644
--- a/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentLanguageData.java
+++ b/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentLanguageData.java
@@ -1,57 +1,65 @@
 package nu.marginalia.language.model;
 
-import gnu.trove.map.hash.TObjectIntHashMap;
-import lombok.AllArgsConstructor;
 import nu.marginalia.language.sentence.SentenceExtractor;
+import nu.marginalia.language.sentence.tag.HtmlTag;
 import nu.marginalia.lsh.EasyLSH;
+import org.jetbrains.annotations.NotNull;
 
-import java.util.Arrays;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
 import java.util.stream.Stream;
 
-/**
+/** Holds the sentences and text of a document, decorated with
+ * HTML tags, POS tags, and other information.
+ * * @see SentenceExtractor */ -@AllArgsConstructor -public class DocumentLanguageData { - public final DocumentSentence[] sentences; - public final DocumentSentence[] titleSentences; - public final TObjectIntHashMap wordCount; - public final String text; +public record DocumentLanguageData(List sentences, String text) implements Iterable { - /** for test convenience */ - public static DocumentLanguageData empty() { - return new DocumentLanguageData( - new DocumentSentence[0], - new DocumentSentence[0], - new TObjectIntHashMap<>(), - "" - ); + public DocumentLanguageData(List sentences, + String text) + { + this.sentences = Collections.unmodifiableList(sentences); + this.text = text; + } + + public List findSentencesForTag(HtmlTag tag) { + return stream().filter(s -> s.htmlTags.contains(tag)).toList(); + } + + public int numSentences() { + return sentences.size(); } public int totalNumWords() { int ret = 0; - for (int i = 0; i < sentences.length; i++) { - ret += sentences[i].length(); + + for (DocumentSentence sent : sentences) { + ret += sent.length(); } + return ret; } - public Stream streamLowerCase() { - return Arrays.stream(sentences).map(sent -> sent.wordsLowerCase).flatMap(Arrays::stream); - } - - public Stream stream() { - return Arrays.stream(sentences).map(sent -> sent.words).flatMap(Arrays::stream); - } - public long localitySensitiveHashCode() { var hash = new EasyLSH(); for (var sent : sentences) { - for (var word : sent) { - hash.addUnordered(word.word()); + for (var word : sent.wordsLowerCase) { + hash.addUnordered(word); } } return hash.get(); } + + @NotNull + @Override + public Iterator iterator() { + return sentences.iterator(); + } + + public Stream stream() { + return sentences.stream(); + } } diff --git a/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentSentence.java b/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentSentence.java index b9b4abce..d6b42911 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentSentence.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/model/DocumentSentence.java @@ -2,52 +2,65 @@ package nu.marginalia.language.model; import nu.marginalia.language.WordPatterns; +import nu.marginalia.language.sentence.tag.HtmlTag; import org.jetbrains.annotations.NotNull; import java.lang.ref.SoftReference; import java.util.BitSet; +import java.util.EnumSet; import java.util.Iterator; import java.util.StringJoiner; -public class DocumentSentence implements Iterable{ - public final String originalSentence; - public final String[] words; - public final int[] separators; - public final String[] wordsLowerCase; - public final String[] posTags; - public final String[] stemmedWords; - public final String[] ngrams; - public final String[] ngramStemmed; +/** Represents a sentence in a document, with POS tags, HTML tags, and other information + * about the words in the sentence. 
+ * */ +public class DocumentSentence implements Iterable { + /** A span of words in a sentence */ + public final String[] wordsLowerCase; + public final String[] stemmedWords; + public final String[] posTags; + + /** A set of HTML tags that surround the sentence */ + public final EnumSet htmlTags; + + /** A bitset indicating whether the word is a stop word */ private final BitSet isStopWord; + /** A bitset indicating whether the word is capitalized */ + private final BitSet isCapitalized; + + /** A bitset indicating whether the word is all caps */ + private final BitSet isAllCaps; + + // Encode whether the words are separated by a comma or a space, + // where false = COMMA, true = SPACE + private final BitSet separators; + public SoftReference keywords; - public DocumentSentence(String originalSentence, - String[] words, - int[] separators, + public DocumentSentence(BitSet separators, String[] wordsLowerCase, String[] posTags, String[] stemmedWords, - String[] ngrams, - String[] ngramsStemmed + EnumSet htmlTags, + BitSet isCapitalized, + BitSet isAllCaps ) { - this.originalSentence = originalSentence; - this.words = words; this.separators = separators; this.wordsLowerCase = wordsLowerCase; this.posTags = posTags; this.stemmedWords = stemmedWords; + this.htmlTags = htmlTags; + this.isCapitalized = isCapitalized; + this.isAllCaps = isAllCaps; - isStopWord = new BitSet(words.length); + isStopWord = new BitSet(wordsLowerCase.length); - this.ngrams = ngrams; - this.ngramStemmed = ngramsStemmed; - - for (int i = 0; i < words.length; i++) { - if (WordPatterns.isStopWord(words[i])) + for (int i = 0; i < wordsLowerCase.length; i++) { + if (WordPatterns.isStopWord(wordsLowerCase[i])) isStopWord.set(i); } } @@ -55,14 +68,23 @@ public class DocumentSentence implements Iterable{ public boolean isStopWord(int idx) { return isStopWord.get(idx); } - public void setIsStopWord(int idx, boolean val) { - if (val) - isStopWord.set(idx); - else - isStopWord.clear(); - } + public int length() { - return words.length; + return wordsLowerCase.length; + } + + public boolean isCapitalized(int i) { + return isCapitalized.get(i); + } + public boolean isAllCaps(int i) { + return isAllCaps.get(i); + } + + public boolean isSeparatorSpace(int i) { + return separators.get(i); + } + public boolean isSeparatorComma(int i) { + return !separators.get(i); } public String constructWordFromSpan(WordSpan span) { @@ -140,9 +162,9 @@ public class DocumentSentence implements Iterable{ @Override public String toString() { StringBuilder sb = new StringBuilder(); - for (int i = 0; i < words.length; i++) { - sb.append(words[i]).append('[').append(posTags[i]).append(']'); - if (separators[i] == WordSeparator.COMMA) { + for (int i = 0; i < wordsLowerCase.length; i++) { + sb.append(wordsLowerCase[i]).append('[').append(posTags[i]).append(']'); + if (isSeparatorComma(i)) { sb.append(','); } else { @@ -176,11 +198,9 @@ public class DocumentSentence implements Iterable{ this.pos = pos; } - public String word() { return words[pos]; } public String wordLowerCase() { return wordsLowerCase[pos]; } public String posTag() { return posTags[pos]; } public String stemmed() { return stemmedWords[pos]; } - public int separator() { return separators[pos]; } public boolean isStopWord() { return DocumentSentence.this.isStopWord(pos); } public WordRep rep() { diff --git a/code/libraries/language-processing/java/nu/marginalia/language/model/WordSeparator.java b/code/libraries/language-processing/java/nu/marginalia/language/model/WordSeparator.java deleted 
file mode 100644 index 3476073f..00000000 --- a/code/libraries/language-processing/java/nu/marginalia/language/model/WordSeparator.java +++ /dev/null @@ -1,6 +0,0 @@ -package nu.marginalia.language.model; - -public final class WordSeparator { - public static final int COMMA = 0; - public static final int SPACE = 1; -} diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java index 8dd818a3..0a9ef2e3 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractor.java @@ -1,23 +1,23 @@ package nu.marginalia.language.sentence; import com.github.datquocnguyen.RDRPOSTagger; -import gnu.trove.map.hash.TObjectIntHashMap; +import com.google.inject.Inject; import lombok.SneakyThrows; import nu.marginalia.LanguageModels; -import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentSentence; +import nu.marginalia.language.sentence.tag.HtmlStringTagger; +import nu.marginalia.language.sentence.tag.HtmlTag; +import nu.marginalia.language.sentence.tag.HtmlTaggedString; +import nu.marginalia.segmentation.NgramLexicon; import opennlp.tools.sentdetect.SentenceDetectorME; import opennlp.tools.sentdetect.SentenceModel; import opennlp.tools.stemmer.PorterStemmer; import org.apache.commons.lang3.StringUtils; -import org.jetbrains.annotations.NotNull; import org.jsoup.nodes.Document; -import org.jsoup.nodes.Element; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.google.inject.Inject; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; @@ -38,14 +38,13 @@ public class SentenceExtractor { private final PorterStemmer porterStemmer = new PorterStemmer(); private static final Logger logger = LoggerFactory.getLogger(SentenceExtractor.class); - private static final SentenceExtractorHtmlTagCleaner tagCleaner = new SentenceExtractorHtmlTagCleaner(); private static final SentencePreCleaner sentencePrecleaner = new SentencePreCleaner(); /* Truncate sentences longer than this. This is mostly a defense measure against malformed data * that might otherwise use an undue amount of processing power. 250 words is about 10X longer than * this comment. 
*/ - private static final int MAX_SENTENCE_LENGTH = 250; - private static final int MAX_TEXT_LENGTH = 65536; + static final int MAX_SENTENCE_LENGTH = 250; + static final int MAX_SENTENCE_COUNT = 1000; @SneakyThrows @Inject public SentenceExtractor(LanguageModels models) @@ -76,218 +75,221 @@ public class SentenceExtractor { } public DocumentLanguageData extractSentences(Document doc) { - var clone = doc.clone(); - tagCleaner.clean(clone); + final List textSentences = new ArrayList<>(); + + final List taggedStrings = HtmlStringTagger.tagDocumentStrings(doc); - final String text = asText(clone); - final DocumentSentence[] textSentences = extractSentencesFromString(text); + final int totalTextLength = taggedStrings.stream().mapToInt(HtmlTaggedString::length).sum(); + final StringBuilder documentText = new StringBuilder(totalTextLength + taggedStrings.size()); - String title = getTitle(clone, textSentences); + for (var taggedString : taggedStrings) { + String text = taggedString.string(); - TObjectIntHashMap counts = calculateWordCounts(textSentences); - var titleSentences = extractSentencesFromString(title.toLowerCase()); - return new DocumentLanguageData(textSentences, titleSentences, counts, text); + textSentences.addAll( + extractSentencesFromString(text, taggedString.tags()) + ); + + if (documentText.isEmpty()) { + documentText.append(text); + } + else { + documentText.append(' ').append(text); + } + } + + return new DocumentLanguageData(textSentences, documentText.toString()); } public DocumentLanguageData extractSentences(String text, String title) { - final DocumentSentence[] textSentences = extractSentencesFromString(text); + var textSentences = extractSentencesFromString(text, EnumSet.noneOf(HtmlTag.class)); + var titleSentences = extractSentencesFromString(title.toLowerCase(), EnumSet.of(HtmlTag.TITLE)); - TObjectIntHashMap counts = calculateWordCounts(textSentences); - var titleSentences = extractSentencesFromString(title.toLowerCase()); - return new DocumentLanguageData(textSentences, titleSentences, counts, text); + List combined = new ArrayList<>(textSentences.size() + titleSentences.size()); + combined.addAll(titleSentences); + combined.addAll(textSentences); + + return new DocumentLanguageData( + combined, + text); } - private String getTitle(Document doc, DocumentSentence[] textSentences) { - String title = doc.getElementsByTag("title").text() + " . 
" + - Optional.ofNullable(doc.getElementsByTag("h1").first()).map(Element::text).orElse(""); + public DocumentSentence extractSentence(String text, EnumSet htmlTags) { + var wordsAndSeps = SentenceSegmentSplitter.splitSegment(text, MAX_SENTENCE_LENGTH); - if (title.trim().length() < 3) { - title = doc.getElementsByTag("h2").text(); - } + String[] words = wordsAndSeps.words(); + BitSet seps = wordsAndSeps.separators(); + String[] lc = new String[words.length]; + String[] stemmed = new String[words.length]; - if (title.trim().length() < 3) { - for (DocumentSentence textSentence : textSentences) { - if (textSentence.length() > 0) { - title = textSentence.originalSentence.toLowerCase(); - break; - } + BitSet isCapitalized = new BitSet(words.length); + BitSet isAllCaps = new BitSet(words.length); + + for (int i = 0; i < words.length; i++) { + lc[i] = stripPossessive(words[i].toLowerCase()); + + if (words[i].length() > 0 && Character.isUpperCase(words[i].charAt(0))) { + isCapitalized.set(i); } - } - - return title; - } - - - @NotNull - private TObjectIntHashMap calculateWordCounts(DocumentSentence[] textSentences) { - TObjectIntHashMap counts = new TObjectIntHashMap<>(textSentences.length*10, 0.5f, 0); - - for (var sent : textSentences) { - for (var word : sent.stemmedWords) { - counts.adjustOrPutValue(word, 1, 1); - } - } - return counts; - } - - public DocumentSentence extractSentence(String text) { - var wordsAndSeps = SentenceSegmentSplitter.splitSegment(text); - - var words = wordsAndSeps.words; - var seps = wordsAndSeps.separators; - var lc = SentenceExtractorStringUtils.toLowerCaseStripPossessive(wordsAndSeps.words); - - List ngrams = ngramLexicon.findSegmentsStrings(2, 12, words); - - String[] ngramsWords = new String[ngrams.size()]; - String[] ngramsStemmedWords = new String[ngrams.size()]; - for (int i = 0; i < ngrams.size(); i++) { - String[] ngram = ngrams.get(i); - - StringJoiner ngramJoiner = new StringJoiner("_"); - StringJoiner stemmedJoiner = new StringJoiner("_"); - for (String s : ngram) { - ngramJoiner.add(s); - stemmedJoiner.add(porterStemmer.stem(s)); + if (StringUtils.isAllUpperCase(words[i])) { + isAllCaps.set(i); } - ngramsWords[i] = ngramJoiner.toString(); - ngramsStemmedWords[i] = stemmedJoiner.toString(); - } - - - return new DocumentSentence( - SentenceExtractorStringUtils.sanitizeString(text), - words, - seps, - lc, - rdrposTagger.tagsForEnSentence(words), - stemSentence(lc), - ngramsWords, - ngramsStemmedWords - ); - } - - public DocumentSentence[] extractSentencesFromString(String text) { - String[] sentences; - - String textNormalizedSpaces = SentenceExtractorStringUtils.normalizeSpaces(text); - - try { - sentences = sentenceDetector.sentDetect(textNormalizedSpaces); - } - catch (Exception ex) { - // shitty fallback logic - sentences = StringUtils.split(textNormalizedSpaces, '.'); - } - - sentences = sentencePrecleaner.clean(sentences); - - final String[][] tokens = new String[sentences.length][]; - final int[][] separators = new int[sentences.length][]; - final String[][] posTags = new String[sentences.length][]; - final String[][] tokensLc = new String[sentences.length][]; - final String[][] stemmedWords = new String[sentences.length][]; - - for (int i = 0; i < tokens.length; i++) { - - var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sentences[i]); - tokens[i] = wordsAndSeps.words; - separators[i] = wordsAndSeps.separators; - - if (tokens[i].length > MAX_SENTENCE_LENGTH) { - tokens[i] = Arrays.copyOf(tokens[i], MAX_SENTENCE_LENGTH); - separators[i] = 
Arrays.copyOf(separators[i], MAX_SENTENCE_LENGTH); - } - - for (int j = 0; j < tokens[i].length; j++) { - while (tokens[i][j].endsWith(".")) { - tokens[i][j] = StringUtils.removeEnd(tokens[i][j], "."); - } - } - } - - for (int i = 0; i < tokens.length; i++) { - posTags[i] = rdrposTagger.tagsForEnSentence(tokens[i]); - } - - for (int i = 0; i < tokens.length; i++) { - tokensLc[i] = SentenceExtractorStringUtils.toLowerCaseStripPossessive(tokens[i]); - } - - for (int i = 0; i < tokens.length; i++) { - stemmedWords[i] = stemSentence(tokensLc[i]); - } - - DocumentSentence[] ret = new DocumentSentence[sentences.length]; - for (int i = 0; i < ret.length; i++) { - String fullString; - - if (i == 0) { - fullString = SentenceExtractorStringUtils.sanitizeString(sentences[i]); - } - else { - fullString = ""; - } - - List ngrams = ngramLexicon.findSegmentsStrings(2, 12, tokens[i]); - - String[] ngramsWords = new String[ngrams.size()]; - String[] ngramsStemmedWords = new String[ngrams.size()]; - - for (int j = 0; j < ngrams.size(); j++) { - String[] ngram = ngrams.get(j); - - StringJoiner ngramJoiner = new StringJoiner("_"); - StringJoiner stemmedJoiner = new StringJoiner("_"); - for (String s : ngram) { - ngramJoiner.add(s); - stemmedJoiner.add(porterStemmer.stem(s)); - } - - ngramsWords[j] = ngramJoiner.toString(); - ngramsStemmedWords[j] = stemmedJoiner.toString(); - } - - - ret[i] = new DocumentSentence(fullString, - tokens[i], - separators[i], - tokensLc[i], - posTags[i], - stemmedWords[i], - ngramsWords, - ngramsStemmedWords - ); - } - return ret; - } - - private String[] stemSentence(String[] strings) { - String[] stemmed = new String[strings.length]; - for (int i = 0; i < stemmed.length; i++) { - var sent = SentenceExtractorStringUtils.stripPossessive(strings[i]); try { - stemmed[i] = porterStemmer.stem(sent); + stemmed[i] = porterStemmer.stem(lc[i]); } catch (Exception ex) { stemmed[i] = "NN"; // ??? 
} } - return stemmed; + + return new DocumentSentence( + seps, + lc, + rdrposTagger.tagsForEnSentence(words), + stemmed, + htmlTags, + isCapitalized, + isAllCaps + ); } - public String asText(Document dc) { - String text = dc.getElementsByTag("body").text(); + public List extractSentencesFromString(String text, EnumSet htmlTags) { + String[] sentences; - if (text.length() > MAX_TEXT_LENGTH) { - return text.substring(0, MAX_TEXT_LENGTH); + // Normalize spaces + + text = normalizeSpaces(text); + + // Split into sentences + + try { + sentences = sentenceDetector.sentDetect(text); + } + catch (Exception ex) { + // shitty fallback logic + sentences = StringUtils.split(text, '.'); + } + + sentences = sentencePrecleaner.clean(sentences); + + // Truncate the number of sentences if it exceeds the maximum, to avoid + // excessive processing time on malformed data + + if (sentences.length > MAX_SENTENCE_COUNT) { + sentences = Arrays.copyOf(sentences, MAX_SENTENCE_COUNT); + } + + final boolean isNaturalLanguage = htmlTags.stream().noneMatch(tag -> tag.nonLanguage); + + List ret = new ArrayList<>(sentences.length); + + if (isNaturalLanguage) { + // Natural language text; do POS tagging and stemming + + for (String sent : sentences) { + var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH); + var tokens = wordsAndSeps.words(); + var separators = wordsAndSeps.separators(); + var posTags = rdrposTagger.tagsForEnSentence(tokens); + var tokensLc = new String[tokens.length]; + var stemmed = new String[tokens.length]; + + BitSet isCapitalized = new BitSet(tokens.length); + BitSet isAllCaps = new BitSet(tokens.length); + + for (int i = 0; i < tokens.length; i++) { + if (tokens[i].length() > 0 && Character.isUpperCase(tokens[i].charAt(0))) { + isCapitalized.set(i); + } + if (StringUtils.isAllUpperCase(tokens[i])) { + isAllCaps.set(i); + } + + var originalVal = tokens[i]; + var newVal = stripPossessive(originalVal.toLowerCase()); + + if (Objects.equals(originalVal, newVal)) { + tokensLc[i] = originalVal; + } else { + tokensLc[i] = newVal; + } + + try { + stemmed[i] = porterStemmer.stem(tokens[i]); + } + catch (Exception ex) { + stemmed[i] = "NN"; // ??? + } + } + ret.add(new DocumentSentence(separators, tokensLc, posTags, stemmed, htmlTags, isCapitalized, isAllCaps)); + } } else { - return text.substring(0, (int) (text.length() * 0.95)); + // non-language text, e.g. 
program code; don't bother with POS tagging or stemming + // as this is not likely to be useful + + for (String sent : sentences) { + var wordsAndSeps = SentenceSegmentSplitter.splitSegment(sent, MAX_SENTENCE_LENGTH); + var tokens = wordsAndSeps.words(); + var separators = wordsAndSeps.separators(); + var posTags = new String[tokens.length]; + Arrays.fill(posTags, "X"); // Placeholder POS tag + var tokensLc = new String[tokens.length]; + var stemmed = new String[tokens.length]; + + BitSet isCapitalized = new BitSet(tokens.length); + BitSet isAllCaps = new BitSet(tokens.length); + + for (int i = 0; i < tokensLc.length; i++) { + var originalVal = tokens[i]; + + if (tokens[i].length() > 0 && Character.isUpperCase(tokens[i].charAt(0))) { + isCapitalized.set(i); + } + if (StringUtils.isAllUpperCase(tokens[i])) { + isAllCaps.set(i); + } + + if (StringUtils.isAllLowerCase(originalVal)) { + tokensLc[i] = originalVal; + } else { + tokensLc[i] = originalVal.toLowerCase(); + } + stemmed[i] = tokensLc[i]; // we don't stem non-language words + } + + ret.add(new DocumentSentence(separators, tokensLc, posTags, stemmed, htmlTags, isAllCaps, isCapitalized)); + } + } + + + return ret; } + public static String normalizeSpaces(String s) { + if (s.indexOf('\t') >= 0) { + s = s.replace('\t', ' '); + } + if (s.indexOf('\n') >= 0) { + s = s.replace('\n', ' '); + } + return s; + } + + public static String stripPossessive(String s) { + int end = s.length(); + + if (s.endsWith("'")) { + return s.substring(0, end-1); + } + + if (s.endsWith("'s") || s.endsWith("'S")) { + return s.substring(0, end-2); + } + + return s; + } } diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractorHtmlTagCleaner.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractorHtmlTagCleaner.java deleted file mode 100644 index 63cd12e7..00000000 --- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractorHtmlTagCleaner.java +++ /dev/null @@ -1,40 +0,0 @@ -package nu.marginalia.language.sentence; - -import org.jsoup.nodes.Document; -import org.jsoup.nodes.TextNode; - -import java.util.regex.Pattern; - -public class SentenceExtractorHtmlTagCleaner { - public final int MAX_CODE_TAG_LENGTH = 32; - public final Pattern codeTagJunkPattern = Pattern.compile("(\\.|<|>|<|>|\\([^)]*\\)[;]?$)"); - - public void clean(Document doc) { - cleanCodeTags(doc); - - doc.select("nav,form,input,code,body>title").remove(); - - // Create "sentences" out of elements that sometimes lack a period at the end to help - // NLP work better - doc.select("li,h1,h2,h3,h4,h5,h6,td,th,p,div,title").forEach(e -> e.appendText(". ")); - doc.select("br,hr").forEach(e -> e.prependText(". 
")); - } - - private void cleanCodeTags(Document doc) { - for (var codeTag : doc.getElementsByTag("code")) { - var text = codeTag.text(); - - if (text.length() <= MAX_CODE_TAG_LENGTH) { - codeTag.replaceWith(new TextNode(trimCodeTagContents(text))); - } - else { - codeTag.remove(); - } - - } - } - - private String trimCodeTagContents(String text) { - return codeTagJunkPattern.matcher(text).replaceAll(" "); - } -} diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractorStringUtils.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractorStringUtils.java deleted file mode 100644 index 41f27c24..00000000 --- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceExtractorStringUtils.java +++ /dev/null @@ -1,93 +0,0 @@ -package nu.marginalia.language.sentence; - -import java.util.Arrays; -import java.util.Objects; - -public class SentenceExtractorStringUtils { - - public static String sanitizeString(String s) { - char[] newChars = new char[s.length()]; - int pi = 0; - boolean changed = false; - for (int i = 0; i < newChars.length; i++) { - char c = s.charAt(i); - if (!isBadChar(c)) { - newChars[pi++] = c; - } - else { - changed = true; - newChars[pi++] = ' '; - } - } - - if (changed) { - s = new String(newChars, 0, pi); - } - - if (s.startsWith(".")) { - s = s.substring(1); - } - - if (s.isBlank()) { - return ""; - } - - return s; - - } - - private static boolean isBadChar(char c) { - if (c >= 'a' && c <= 'z') return false; - if (c >= 'A' && c <= 'Z') return false; - if (c >= '0' && c <= '9') return false; - if ("_#@.".indexOf(c) >= 0) return false; - if (c >= '\u00C0' && c <= '\u00D6') return false; - if (c >= '\u00D8' && c <= '\u00F6') return false; - if (c >= '\u00F8' && c <= '\u00FF') return false; - - return true; - } - - public static String normalizeSpaces(String s) { - if (s.indexOf('\t') >= 0) { - s = s.replace('\t', ' '); - } - if (s.indexOf('\n') >= 0) { - s = s.replace('\n', ' '); - } - return s; - } - - - public static String toLowerCaseStripPossessive(String word) { - String val = stripPossessive(word).toLowerCase(); - - if (Objects.equals(val, word)) { - return word; - } - - return val; - } - - public static String[] toLowerCaseStripPossessive(String[] words) { - String[] lc = new String[words.length]; - Arrays.setAll(lc, i ->SentenceExtractorStringUtils.toLowerCaseStripPossessive(words[i])); - return lc; - } - - public static String stripPossessive(String s) { - int end = s.length(); - - if (s.endsWith("'")) { - return s.substring(0, end-1); - } - - if (s.endsWith("'s") || s.endsWith("'S")) { - return s.substring(0, end-2); - } - - return s; - } - - -} diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentencePreCleaner.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentencePreCleaner.java index c8d7ec39..4fbcd061 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentencePreCleaner.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentencePreCleaner.java @@ -7,12 +7,9 @@ import java.util.regex.Pattern; public class SentencePreCleaner { private static final Pattern splitPattern = Pattern.compile("( -|- |\\|)"); - private final int maxSentenceCount = 250; - private final int maxTotalLength = 20 * maxSentenceCount; public String[] clean(String[] sentences) { - int totalLength = 0; int sentenceCount = 0; List sentenceList = new 
ArrayList<>(); @@ -20,10 +17,9 @@ public class SentencePreCleaner { if (s.isBlank()) continue; - totalLength+=s.length(); sentenceCount++; - if (totalLength > maxTotalLength && sentenceCount++ > maxSentenceCount) { + if (sentenceCount++ > SentenceExtractor.MAX_SENTENCE_COUNT) { break; } diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java index 7a0b49be..531f5189 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/SentenceSegmentSplitter.java @@ -2,25 +2,18 @@ package nu.marginalia.language.sentence; import com.google.common.base.CharMatcher; import gnu.trove.list.array.TIntArrayList; -import lombok.AllArgsConstructor; -import lombok.Getter; import nu.marginalia.language.encoding.AsciiFlattener; -import nu.marginalia.language.model.WordSeparator; import java.util.ArrayList; +import java.util.BitSet; import java.util.List; import java.util.regex.Pattern; -import static nu.marginalia.language.WordPatterns.*; +import static nu.marginalia.language.WordPatterns.MAX_WORD_LENGTH; public class SentenceSegmentSplitter { - @AllArgsConstructor - @Getter - public static class SeparatedSentence { - String[] words; - int[] separators; - } + public record SeparatedSentence(String[] words, BitSet separators) { } private static final CharMatcher noiseCharacterMatcher = CharMatcher.anyOf("/*-"); @@ -43,7 +36,7 @@ public class SentenceSegmentSplitter { * @param segment The sentence to split * @return A list of words and separators */ - public static SeparatedSentence splitSegment(String segment) { + public static SeparatedSentence splitSegment(String segment, int maxLength) { String flatSegment = AsciiFlattener.flattenUnicode(segment); var matcher = wordBreakPattern.matcher(flatSegment); @@ -77,7 +70,7 @@ public class SentenceSegmentSplitter { } List ret = new ArrayList<>(words.size()); - TIntArrayList seps = new TIntArrayList(words.size()); + BitSet seps = new BitSet(separators.size()); String[] parts = words.toArray(String[]::new); for (int i = 0; i < parts.length; i++) { @@ -89,7 +82,9 @@ public class SentenceSegmentSplitter { continue; ret.add(parts[i]); - seps.add(separators.getQuick(i)); + if (separators.getQuick(i) > 0) { + seps.set(i); + } } for (int i = 0; i < ret.size(); i++) { @@ -101,13 +96,26 @@ public class SentenceSegmentSplitter { if (part.endsWith("'") && part.length() > 1) { ret.set(i, part.substring(0, part.length()-1)); } + while (part.endsWith(".")) { + part = part.substring(0, part.length()-1); + ret.set(i, part); + } + } + + if (ret.size() > maxLength) { + ret.subList(maxLength, ret.size()).clear(); + seps = seps.get(0, maxLength); } return new SeparatedSentence( ret.toArray(String[]::new), - seps.toArray() + seps ); } + public static final class WordSeparator { + public static final int COMMA = 0; + public static final int SPACE = 1; + } } diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlStringTagger.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlStringTagger.java new file mode 100644 index 00000000..d6cd823d --- /dev/null +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlStringTagger.java @@ -0,0 +1,120 @@ +package nu.marginalia.language.sentence.tag; + +import 
org.jsoup.nodes.Document; +import org.jsoup.nodes.Element; +import org.jsoup.nodes.Node; +import org.jsoup.nodes.TextNode; +import org.jsoup.select.NodeVisitor; + +import java.util.*; + +/** A class that tags strings in an HTML document with the HTML + * tags that are active at that point in the document. + */ +public class HtmlStringTagger implements NodeVisitor { + private List tagStack = new ArrayList<>(8); + private Set stackTags = new HashSet<>(8); + private StringBuilder currentString = new StringBuilder(256); + private List output = new ArrayList<>(); + + public static List tagDocumentStrings(Document document) { + var tagger = new HtmlStringTagger(); + document.traverse(tagger); + return tagger.getOutput(); + } + + List getOutput() { + List compactedOutput = new ArrayList<>(output.size()); + + for (var ts : output) { + if (compactedOutput.isEmpty()) { + compactedOutput.add(ts); + } + else { + var last = compactedOutput.getLast(); + if (last.tags().equals(ts.tags())) { + last.append(ts.string()); + } + else { + compactedOutput.add(ts); + } + } + } + + return output; + } + + @Override + public void head(Node node, int i) { + if (node instanceof Element el) { + String tagName = el.tagName(); + switch (tagName) { + case "script" -> pushTag(HtmlTag.SCRIPT, el); + case "style" -> pushTag(HtmlTag.STYLE, el); + case "input", "select", "form", "button" -> pushTag(HtmlTag.FORM, el); + case "code", "pre" -> pushTag(HtmlTag.CODE, el); + case "title" -> pushTag(HtmlTag.TITLE, el); + case "a" -> pushTag(HtmlTag.ANCHOR, el); + case "nav", "header", "footer" -> pushTag(HtmlTag.NAV, el); + case "h1", "h2", "h3", "h4", "h5", "h6" -> pushTag(HtmlTag.HEADING, el); + } + } + else if (node instanceof TextNode tn) { + if (shouldProcess()) { + String tnText = tn.text(); + if (!tnText.isBlank()) { + currentString = currentString.append(' ').append(tnText.trim()); + } + } + } + } + + @Override + public void tail(Node node, int i) { + if (!(node instanceof Element el)) + return; + + if (stackTags.remove(el)) { + output.add(new HtmlTaggedString(currentString, EnumSet.copyOf(tagStack))); + tagStack.removeLast(); + currentString = new StringBuilder(); + } + else if ("#root".equals(el.tagName())) { + closeOngoingTag(); + } + } + + private void pushTag(HtmlTag tag, Element el) { + closeOngoingTag(); + + tagStack.add(tag); + stackTags.add(el); + } + + private void closeOngoingTag() { + if (currentString.isEmpty()) { + return; + } + + EnumSet tags; + if (tagStack.isEmpty()) { + tags = EnumSet.noneOf(HtmlTag.class); + } + else { + tags = EnumSet.copyOf(tagStack); + } + + output.add(new HtmlTaggedString(currentString, tags)); + currentString = new StringBuilder(); + } + + public boolean shouldProcess() { + for (var tag : tagStack) { + if (tag.exclude) { + return false; + } + } + return true; + } + +} \ No newline at end of file diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTag.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTag.java new file mode 100644 index 00000000..42521de2 --- /dev/null +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTag.java @@ -0,0 +1,49 @@ +package nu.marginalia.language.sentence.tag; + +import java.util.Arrays; + +public enum HtmlTag { + ANCHOR((byte) 'a', false, false), + TITLE((byte) 't', false, false), + HEADING((byte) 'h', false, false), + CODE((byte) 'c', false, true), + NAV((byte) 'n', false, false), + + // pseudo-tags for internal use, + BODY((byte) 'b', false, 
false), + EXTERNAL_LINKTEXT((byte) 'x', false, false), + + // excluded tags must be put last! + FORM((byte) 0, true, false), + SCRIPT((byte) 0, true, false), + STYLE((byte) 0, true, false), + ; + + public final byte code; + public final boolean exclude; + public final boolean nonLanguage; + + HtmlTag(byte code, boolean exclude, boolean nonLanguage) { + this.code = code; + this.exclude = exclude; + this.nonLanguage = nonLanguage; + } + + // This is a bit of a hack to get the included tags in the order they are defined in the enum + public static final HtmlTag[] includedTags; + + static { + HtmlTag[] values = values(); + includedTags = new HtmlTag[(int) Arrays.stream(values).filter(tag -> !tag.exclude).count()]; + + for (int i = 0; i < values.length; i++) { + if (i != values[i].ordinal()) { + throw new IllegalStateException("Excluded tags must be put last"); + } + + if (!values()[i].exclude) { + includedTags[i] = values()[i]; + } + } + } +} diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTaggedString.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTaggedString.java new file mode 100644 index 00000000..80e8f4ee --- /dev/null +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTaggedString.java @@ -0,0 +1,33 @@ +package nu.marginalia.language.sentence.tag; + +import java.util.EnumSet; + +public class HtmlTaggedString { + private StringBuilder string; + private final EnumSet tags; + + public HtmlTaggedString(StringBuilder string, EnumSet tags) { + this.tags = tags; + this.string = string; + } + + public String string() { + return string.toString(); + } + + public EnumSet tags() { + return tags; + } + + public void append(String s) { + string.append(' ').append(s); + } + + public String toString() { + return "[" + tags.toString() + ":" + string.toString() + "]"; + } + + public int length() { + return string.length(); + } +} diff --git a/code/libraries/language-processing/resources/dictionary/en-stopwords b/code/libraries/language-processing/resources/dictionary/en-stopwords deleted file mode 100644 index d97db17c..00000000 --- a/code/libraries/language-processing/resources/dictionary/en-stopwords +++ /dev/null @@ -1,172 +0,0 @@ -i -a -e.g -i.e -the -of -and -in -to -was -is -for -on -as -with -by -he -that -at -from -his -it -an -were -we've -we're -which -are -this -also -be -had -or -has -first -their -after -its -new -but -who -her -not -she -she's -they -have -been -other -when -during -all -into -there -time -may -more -school -years -over -only -would -later -most -where -between -some -up -city -about -such -him -then -made -out -state -three -while -used -can -under -known -many -year -part -became -these -than -team -no -second -including -being -through -before -both -however -how -until -well -since -them -de -each -same -found -so -use -now -end -if -age -day -any -due -did -own -led -off -do -you -you're -young -without -take -described -site -royal -services -radio -together -social -force -northern -per -we -my -want -your -seem -else's -don't -me -couldn't -what -me -doesn't -can't -isn't -i've -it's -it -i'm -. -.. -... -.... -..... -...... -....... -........ -......... -.......... 
-will -us -much -our -what -what's -often -few -lot \ No newline at end of file diff --git a/code/libraries/language-processing/test/nu/marginalia/language/encoding/SentenceExtractorHtmlTagCleanerTest.java b/code/libraries/language-processing/test/nu/marginalia/language/encoding/SentenceExtractorHtmlTagCleanerTest.java deleted file mode 100644 index dc21d379..00000000 --- a/code/libraries/language-processing/test/nu/marginalia/language/encoding/SentenceExtractorHtmlTagCleanerTest.java +++ /dev/null @@ -1,28 +0,0 @@ -package nu.marginalia.language.encoding; - -import nu.marginalia.language.sentence.SentenceExtractorHtmlTagCleaner; -import org.jsoup.Jsoup; -import org.junit.jupiter.api.Test; - -import static org.junit.jupiter.api.Assertions.*; - -class SentenceExtractorHtmlTagCleanerTest { - - final SentenceExtractorHtmlTagCleaner tagCleaner = new SentenceExtractorHtmlTagCleaner(); - - public String cleanTag(String text) { - var doc = Jsoup.parse(text); - tagCleaner.clean(doc); - return doc.text(); - } - - @Test - public void testBriefCodeTag() { - assertEquals("hello", cleanTag("hello")); - assertEquals("System out println", cleanTag("System.out.println")); - assertEquals("hello", cleanTag("hello()")); - assertEquals("hello", cleanTag("<hello>")); - assertEquals("hello", cleanTag("hello(p,q)")); - assertEquals("hello", cleanTag("hello(p,q);")); - } -} \ No newline at end of file diff --git a/code/libraries/language-processing/test/nu/marginalia/language/sentence/SentenceExtractorTest.java b/code/libraries/language-processing/test/nu/marginalia/language/sentence/SentenceExtractorTest.java index e4679db7..38ccbe12 100644 --- a/code/libraries/language-processing/test/nu/marginalia/language/sentence/SentenceExtractorTest.java +++ b/code/libraries/language-processing/test/nu/marginalia/language/sentence/SentenceExtractorTest.java @@ -1,14 +1,17 @@ package nu.marginalia.language.sentence; import nu.marginalia.WmsaHome; +import nu.marginalia.language.sentence.tag.HtmlTag; import org.jsoup.Jsoup; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; import java.io.IOException; +import java.util.EnumSet; import java.util.Objects; -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertEquals; class SentenceExtractorTest { private static SentenceExtractor sentenceExtractor; @@ -20,26 +23,25 @@ class SentenceExtractorTest { @Test void testParen() { - var dld = sentenceExtractor.extractSentence("I am (very) tall"); + var dld = sentenceExtractor.extractSentence("I am (very) tall", EnumSet.noneOf(HtmlTag.class)); System.out.println(dld); } @Test void testPolishArtist() { - var dld = sentenceExtractor.extractSentence("Uklański"); + var dld = sentenceExtractor.extractSentence("Uklański", EnumSet.noneOf(HtmlTag.class)); - assertEquals(1, dld.words.length); - assertEquals("Uklanski", dld.words[0]); + assertEquals(1, dld.wordsLowerCase.length); assertEquals("uklanski", dld.wordsLowerCase[0]); } @Test void testJava() { - var dld = sentenceExtractor.extractSentence("Foreign Function & Memory API"); + var dld = sentenceExtractor.extractSentence("Foreign Function & Memory API", EnumSet.noneOf(HtmlTag.class)); - assertEquals(4, dld.words.length); - assertArrayEquals(new String[] {"Foreign", "Function", "Memory", "API"}, dld.words); + assertEquals(4, dld.wordsLowerCase.length); + assertArrayEquals(new String[] {"foreign", "function", "memory", "api"}, dld.wordsLowerCase); } @Test 
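These updated tests also document the new SentenceExtractor entry points: extractSentence now takes the set of HTML tags the text appeared under, and only the lower-case word forms remain exposed. A small usage sketch of that API shape follows; the WmsaHome-based model loading mirrors what this test class appears to use in its setup, and the string literals are illustrative.

    import java.util.EnumSet;
    import nu.marginalia.WmsaHome;
    import nu.marginalia.language.sentence.SentenceExtractor;
    import nu.marginalia.language.sentence.tag.HtmlTag;

    class ExtractorUsageSketch {
        public static void main(String[] args) {
            var se = new SentenceExtractor(WmsaHome.getLanguageModels());

            // Plain text with no HTML context
            var plain = se.extractSentence("Foreign Function & Memory API",
                    EnumSet.noneOf(HtmlTag.class));

            // Title text: the TITLE tag travels with the resulting
            // DocumentSentence (sentence.htmlTags), where downstream
            // keyword extraction can inspect it
            var title = se.extractSentence("An Example Title", EnumSet.of(HtmlTag.TITLE));

            System.out.println(plain);
            System.out.println(title);
        }
    }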
@@ -50,7 +52,7 @@ class SentenceExtractorTest {
         {
             var doc = Jsoup.parse(new String(resource.readAllBytes()));
             var dld = sentenceExtractor.extractSentences(doc);
-            for (var sent : dld.sentences) {
+            for (var sent : dld) {
                 System.out.println(sent);
             }
 
@@ -67,7 +69,7 @@ class SentenceExtractorTest {
         {
             var doc = Jsoup.parse(new String(resource.readAllBytes()));
             var dld = sentenceExtractor.extractSentences(doc);
-            for (var sent : dld.sentences) {
+            for (var sent : dld) {
                 System.out.println(sent);
            }
 
@@ -77,10 +79,9 @@ class SentenceExtractorTest {
     }
 
     @Test
     void testApostrophe() {
-        var dld = sentenceExtractor.extractSentence("duke nuke 'em's big ol' big gun");
-        assertEquals(7, dld.words.length);
+        var dld = sentenceExtractor.extractSentence("duke nuke 'em's big ol' big gun", EnumSet.noneOf(HtmlTag.class));
+        assertEquals(7, dld.wordsLowerCase.length);
 
-        assertArrayEquals(new String[] { "duke", "nuke", "em's", "big", "ol", "big", "gun"}, dld.words);
         assertArrayEquals(new String[] { "duke", "nuke", "em", "big", "ol", "big", "gun"}, dld.wordsLowerCase);
     }
 }
\ No newline at end of file
diff --git a/code/libraries/language-processing/test/nu/marginalia/language/sentence/tag/HtmlStringTaggerTest.java b/code/libraries/language-processing/test/nu/marginalia/language/sentence/tag/HtmlStringTaggerTest.java
new file mode 100644
index 00000000..d550ee1e
--- /dev/null
+++ b/code/libraries/language-processing/test/nu/marginalia/language/sentence/tag/HtmlStringTaggerTest.java
@@ -0,0 +1,29 @@
+package nu.marginalia.language.sentence.tag;
+
+import org.jsoup.Jsoup;
+import org.junit.jupiter.api.Test;
+
+class HtmlStringTaggerTest {
+    @Test
+    public void test() {
+        String html = """
+                <html>
+                <head>
+                <title>T Example</title>
+                </head>
+                <body>
+                <h1>H1 Example</h1>
+                <p>This is an example.</p>
+                <p>Here is more text.</p>
+                <p>And more text with <a href="#">a link</a> and more text.</p>
+                <code>#include &lt;stdlib.h&gt;</code>
+                <p>Good bye</p>
+                </body>
+                </html>
+                """;
+        var visitor = new HtmlStringTagger();
+        Jsoup.parse(html).traverse(visitor);
+
+        visitor.getOutput().forEach(ts -> System.out.println(ts.string() + " " + ts.tags()));
+    }
+}
\ No newline at end of file
diff --git a/code/libraries/message-queue/build.gradle b/code/libraries/message-queue/build.gradle
index 2cfe41c1..c6ce03c9 100644
--- a/code/libraries/message-queue/build.gradle
+++ b/code/libraries/message-queue/build.gradle
@@ -34,6 +34,7 @@ dependencies {
     testImplementation project(':code:common:db')
 
     testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4')
+    testImplementation libs.commons.codec
     testImplementation 'org.testcontainers:mariadb:1.17.4'
     testImplementation 'org.testcontainers:junit-jupiter:1.17.4'
     testImplementation project(':code:libraries:test-helpers')
diff --git a/code/libraries/next-prime/readme.md b/code/libraries/next-prime/readme.md
deleted file mode 100644
index a6b2a134..00000000
--- a/code/libraries/next-prime/readme.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# Next Prime Util
-
-This is a brute force prime sieve. If finding many (or large) primes quickly
-is important to you, don't use code like this.
\ No newline at end of file
diff --git a/code/libraries/next-prime/test/nu/marginalia/util/NextPrimeUtilTest.java b/code/libraries/next-prime/test/nu/marginalia/util/NextPrimeUtilTest.java
deleted file mode 100644
index 381490cf..00000000
--- a/code/libraries/next-prime/test/nu/marginalia/util/NextPrimeUtilTest.java
+++ /dev/null
@@ -1,29 +0,0 @@
-package nu.marginalia.util;
-
-import org.junit.jupiter.api.Assertions;
-import org.junit.jupiter.api.Test;
-
-class NextPrimeUtilTest {
-
-    @Test
-    void isPrime() {
-        Assertions.assertTrue(NextPrimeUtil.isPrime(1));
-        Assertions.assertTrue(NextPrimeUtil.isPrime(2));
-        Assertions.assertTrue(NextPrimeUtil.isPrime(3));
-        Assertions.assertFalse(NextPrimeUtil.isPrime(4));
-        Assertions.assertTrue(NextPrimeUtil.isPrime(5));
-        Assertions.assertFalse(NextPrimeUtil.isPrime(6));
-        Assertions.assertTrue(NextPrimeUtil.isPrime(7));
-        Assertions.assertFalse(NextPrimeUtil.isPrime(8));
-        Assertions.assertFalse(NextPrimeUtil.isPrime(9));
-        Assertions.assertFalse(NextPrimeUtil.isPrime(10));
-        Assertions.assertTrue(NextPrimeUtil.isPrime(11));
-    }
-
-    @Test
-    void nextPrime() {
-        System.out.println(NextPrimeUtil.nextPrime(1L<<31, -1));
-        System.out.println(NextPrimeUtil.nextPrime(1L<<31, 1));
-
-    }
-}
\ No newline at end of file
diff --git a/code/index/test/nu/marginalia/index/util/TestUtil.java b/code/libraries/test-helpers/java/nu/marginalia/test/TestUtil.java
similarity index 56%
rename from code/index/test/nu/marginalia/index/util/TestUtil.java
rename to code/libraries/test-helpers/java/nu/marginalia/test/TestUtil.java
index 651dd316..808dfcf7 100644
--- a/code/index/test/nu/marginalia/index/util/TestUtil.java
+++ b/code/libraries/test-helpers/java/nu/marginalia/test/TestUtil.java
@@ -1,4 +1,4 @@
-package nu.marginalia.index.util;
+package nu.marginalia.test;
 
 import java.io.File;
@@ -9,18 +9,28 @@ import java.util.Arrays;
 public class TestUtil {
     public static void clearTempDir(Path path) {
+        if (!Files.exists(path))
+            return;
+
         if (Files.isDirectory(path)) {
-            for (File f : path.toFile().listFiles()) {
-                File[] files = f.listFiles();
-                if (files != null) {
-                    Arrays.stream(files).map(File::toPath).forEach(TestUtil::clearTempDir);
+            var contents = path.toFile().listFiles();
+
+            for (File f : contents) {
+                if (f.isDirectory()) {
+                    File[] files = f.listFiles();
+                    if (files != null) {
+                        Arrays.stream(files).map(File::toPath).forEach(TestUtil::clearTempDir);
+                    }
+                }
+                else {
+                    System.out.println("Deleting " + f + " (" + fileSize(f.toPath()) + ")");
+                    f.delete();
                 }
-                System.out.println("Deleting " + f);
-                f.delete();
             }
         }
-
-        System.out.println("Deleting " + path + " (" + fileSize(path) + ")");
+        else {
+            System.out.println("Deleting " + path + " (" + fileSize(path) + ")");
+        }
         path.toFile().delete();
     }
diff --git a/code/process-models/crawl-spec/build.gradle b/code/process-models/crawl-spec/build.gradle
deleted file mode 100644
index 2737e54a..00000000
--- a/code/process-models/crawl-spec/build.gradle
+++ /dev/null
@@ -1,32 +0,0 @@
-plugins {
-    id 'java'
-
-
-    id 'jvm-test-suite'
-}
-
-java {
-    toolchain {
-        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
-    }
-}
-apply from: "$rootProject.projectDir/srcsets.gradle"
-
-dependencies {
-    implementation libs.bundles.slf4j
-
-    implementation project(':third-party:parquet-floor')
-    implementation project(':code:common:config')
-    implementation project(':code:common:db')
-    implementation project(':code:common:linkdb')
-
-    implementation libs.notnull
-    implementation libs.trove
-    implementation libs.bundles.parquet
-    implementation libs.bundles.mariadb
-
-    testImplementation libs.bundles.slf4j.test
-    testImplementation libs.bundles.junit
-    testImplementation libs.mockito
-}
-
diff --git a/code/process-models/crawl-spec/readme.md b/code/process-models/crawl-spec/readme.md
deleted file mode 100644
index cd59f23c..00000000
--- a/code/process-models/crawl-spec/readme.md
+++ /dev/null
@@ -1,16 +0,0 @@
-# Crawl Spec
-
-A crawl spec is a list of domains to be crawled. It is a parquet file with the following columns:
-
-- `domain`: The domain to be crawled
-- `crawlDepth`: The depth to which the domain should be crawled
-- `urls`: A list of known URLs to be crawled
-
-Crawl specs are used to define the scope of a crawl in the absence of known domains.
-
-The [CrawlSpecRecord](java/nu/marginalia/model/crawlspec/CrawlSpecRecord.java) class is
-used to represent a record in the crawl spec.
-
-The [CrawlSpecRecordParquetFileReader](java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileReader.java)
-and [CrawlSpecRecordParquetFileWriter](java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileWriter.java)
-classes are used to read and write the crawl spec parquet files.
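
Side note on the TestUtil.clearTempDir rework above: the helper now guards against missing paths and recurses into subdirectories before deleting files. For comparison, a minimal sketch of the same bottom-up cleanup using java.nio is shown below; it is illustrative only, not part of this patch, and the class name is made up for the example:

    import java.io.IOException;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.util.Comparator;

    class TempDirCleanupSketch {
        /** Delete a directory tree bottom-up. Files.walk yields parents before
         *  children, so the stream is reversed to delete children first. */
        static void clearTempDir(Path path) throws IOException {
            if (!Files.exists(path))
                return;

            try (var paths = Files.walk(path)) {
                paths.sorted(Comparator.reverseOrder())
                     .forEach(p -> p.toFile().delete());
            }
        }
    }
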
diff --git a/code/process-models/processed-data/java/nu/marginalia/io/processed/DocumentRecordParquetFileReader.java b/code/process-models/processed-data/java/nu/marginalia/io/processed/DocumentRecordParquetFileReader.java deleted file mode 100644 index dae53224..00000000 --- a/code/process-models/processed-data/java/nu/marginalia/io/processed/DocumentRecordParquetFileReader.java +++ /dev/null @@ -1,37 +0,0 @@ -package nu.marginalia.io.processed; - -import blue.strategic.parquet.HydratorSupplier; -import blue.strategic.parquet.ParquetReader; -import nu.marginalia.model.processed.DocumentRecord; -import nu.marginalia.model.processed.DocumentRecordKeywordsProjection; -import nu.marginalia.model.processed.DocumentRecordMetadataProjection; -import org.jetbrains.annotations.NotNull; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.stream.Stream; - -public class DocumentRecordParquetFileReader { - - @NotNull - public static Stream stream(Path path) throws IOException { - return ParquetReader.streamContent(path.toFile(), - HydratorSupplier.constantly(DocumentRecord.newHydrator())); - } - - @NotNull - public static Stream streamKeywordsProjection(Path path) throws IOException { - return ParquetReader.streamContent(path.toFile(), - HydratorSupplier.constantly(DocumentRecordKeywordsProjection.newHydrator()), - DocumentRecordKeywordsProjection.requiredColumns() - ); - } - - @NotNull - public static Stream streamMetadataProjection(Path path) throws IOException { - return ParquetReader.streamContent(path.toFile(), - HydratorSupplier.constantly(DocumentRecordMetadataProjection.newHydrator()), - DocumentRecordMetadataProjection.requiredColumns() - ); - } -} diff --git a/code/process-models/processed-data/java/nu/marginalia/io/processed/DocumentRecordParquetFileWriter.java b/code/process-models/processed-data/java/nu/marginalia/io/processed/DocumentRecordParquetFileWriter.java deleted file mode 100644 index 62eec879..00000000 --- a/code/process-models/processed-data/java/nu/marginalia/io/processed/DocumentRecordParquetFileWriter.java +++ /dev/null @@ -1,24 +0,0 @@ -package nu.marginalia.io.processed; - -import blue.strategic.parquet.ParquetWriter; -import nu.marginalia.model.processed.DocumentRecord; - -import java.io.IOException; -import java.nio.file.Path; - -public class DocumentRecordParquetFileWriter implements AutoCloseable { - private final ParquetWriter writer; - - public DocumentRecordParquetFileWriter(Path file) throws IOException { - writer = ParquetWriter.writeFile(DocumentRecord.schema, - file.toFile(), DocumentRecord.newDehydrator()); - } - - public void write(DocumentRecord domainData) throws IOException { - writer.write(domainData); - } - - public void close() throws IOException { - writer.close(); - } -} diff --git a/code/process-models/processed-data/java/nu/marginalia/io/processed/DomainLinkRecordParquetFileReader.java b/code/process-models/processed-data/java/nu/marginalia/io/processed/DomainLinkRecordParquetFileReader.java deleted file mode 100644 index efa109cc..00000000 --- a/code/process-models/processed-data/java/nu/marginalia/io/processed/DomainLinkRecordParquetFileReader.java +++ /dev/null @@ -1,30 +0,0 @@ -package nu.marginalia.io.processed; - -import blue.strategic.parquet.HydratorSupplier; -import blue.strategic.parquet.ParquetReader; -import nu.marginalia.model.processed.DomainLinkRecord; -import org.jetbrains.annotations.NotNull; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.List; -import java.util.Set; -import 
java.util.stream.Collectors; -import java.util.stream.Stream; - -public class DomainLinkRecordParquetFileReader { - @NotNull - public static Stream stream(Path path) throws IOException { - return ParquetReader.streamContent(path.toFile(), - HydratorSupplier.constantly(DomainLinkRecord.newHydrator())); - } - - @NotNull - public static Set getDestDomainNames(Path path) throws IOException { - return ParquetReader.streamContent(path.toFile(), - HydratorSupplier.constantly(DomainLinkRecord.newDestDomainHydrator()), - List.of("dest")) - .collect(Collectors.toSet()); - } - -} diff --git a/code/process-models/processed-data/java/nu/marginalia/io/processed/DomainLinkRecordParquetFileWriter.java b/code/process-models/processed-data/java/nu/marginalia/io/processed/DomainLinkRecordParquetFileWriter.java deleted file mode 100644 index 28cf3aa0..00000000 --- a/code/process-models/processed-data/java/nu/marginalia/io/processed/DomainLinkRecordParquetFileWriter.java +++ /dev/null @@ -1,24 +0,0 @@ -package nu.marginalia.io.processed; - -import blue.strategic.parquet.ParquetWriter; -import nu.marginalia.model.processed.DomainLinkRecord; - -import java.io.IOException; -import java.nio.file.Path; - -public class DomainLinkRecordParquetFileWriter implements AutoCloseable { - private final ParquetWriter writer; - - public DomainLinkRecordParquetFileWriter(Path file) throws IOException { - writer = ParquetWriter.writeFile(DomainLinkRecord.schema, - file.toFile(), DomainLinkRecord.newDehydrator()); - } - - public void write(DomainLinkRecord domainData) throws IOException { - writer.write(domainData); - } - - public void close() throws IOException { - writer.close(); - } -} diff --git a/code/process-models/processed-data/java/nu/marginalia/io/processed/DomainRecordParquetFileReader.java b/code/process-models/processed-data/java/nu/marginalia/io/processed/DomainRecordParquetFileReader.java deleted file mode 100644 index a0714557..00000000 --- a/code/process-models/processed-data/java/nu/marginalia/io/processed/DomainRecordParquetFileReader.java +++ /dev/null @@ -1,31 +0,0 @@ -package nu.marginalia.io.processed; - -import blue.strategic.parquet.HydratorSupplier; -import blue.strategic.parquet.ParquetReader; -import nu.marginalia.model.processed.DomainRecord; -import nu.marginalia.model.processed.DomainWithIp; -import org.jetbrains.annotations.NotNull; - -import java.io.IOException; -import java.nio.file.Path; -import java.util.List; -import java.util.stream.Stream; - -public class DomainRecordParquetFileReader { - - @NotNull - public static Stream stream(Path path) throws IOException { - return ParquetReader.streamContent(path.toFile(), - HydratorSupplier.constantly(DomainRecord.newHydrator())); - } - - @NotNull - public static List getBasicDomainInformation(Path path) throws IOException { - return ParquetReader.streamContent(path.toFile(), - HydratorSupplier.constantly(DomainRecord.newDomainNameHydrator()), - List.of("domain", "ip")) - .toList(); - } - - -} diff --git a/code/process-models/processed-data/java/nu/marginalia/io/processed/DomainRecordParquetFileWriter.java b/code/process-models/processed-data/java/nu/marginalia/io/processed/DomainRecordParquetFileWriter.java deleted file mode 100644 index 31c59582..00000000 --- a/code/process-models/processed-data/java/nu/marginalia/io/processed/DomainRecordParquetFileWriter.java +++ /dev/null @@ -1,24 +0,0 @@ -package nu.marginalia.io.processed; - -import blue.strategic.parquet.ParquetWriter; -import nu.marginalia.model.processed.DomainRecord; - -import 
java.io.IOException; -import java.nio.file.Path; - -public class DomainRecordParquetFileWriter implements AutoCloseable { - private final ParquetWriter writer; - - public DomainRecordParquetFileWriter(Path file) throws IOException { - writer = ParquetWriter.writeFile(DomainRecord.schema, - file.toFile(), DomainRecord.newDehydrator()); - } - - public void write(DomainRecord domainData) throws IOException { - writer.write(domainData); - } - - public void close() throws IOException { - writer.close(); - } -} diff --git a/code/process-models/processed-data/java/nu/marginalia/io/processed/ProcessedDataFileNames.java b/code/process-models/processed-data/java/nu/marginalia/io/processed/ProcessedDataFileNames.java deleted file mode 100644 index fafb393f..00000000 --- a/code/process-models/processed-data/java/nu/marginalia/io/processed/ProcessedDataFileNames.java +++ /dev/null @@ -1,73 +0,0 @@ -package nu.marginalia.io.processed; - -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.ArrayList; -import java.util.List; - -public class ProcessedDataFileNames { - public static Path documentFileName(Path base, int batchNumber) { - return base.resolve(String.format("document%04d.parquet", batchNumber)); - } - public static Path domainFileName(Path base, int batchNumber) { - return base.resolve(String.format("domain%04d.parquet", batchNumber)); - } - public static Path domainLinkFileName(Path base, int batchNumber) { - return base.resolve(String.format("domain-link%04d.parquet", batchNumber)); - } - - public static List listDocumentFiles(Path base, int untilBatch) { - List ret = new ArrayList<>(untilBatch); - - for (int i = 0; i < untilBatch; i++) { - Path maybe = documentFileName(base, i); - if (Files.exists(maybe)) { - ret.add(maybe); - } - } - - return ret; - } - - public static List listDomainFiles(Path base, int untilBatch) { - List ret = new ArrayList<>(untilBatch); - - for (int i = 0; i < untilBatch; i++) { - Path maybe = domainFileName(base, i); - if (Files.exists(maybe)) { - ret.add(maybe); - } - } - - return ret; - } - - public static List listDomainFiles(Path base) { - List ret = new ArrayList<>(); - - for (int i = 0;; i++) { - Path maybe = domainFileName(base, i); - if (Files.exists(maybe)) { - ret.add(maybe); - } - else { - break; - } - } - - return ret; - } - - public static List listDomainLinkFiles(Path base, int untilBatch) { - List ret = new ArrayList<>(untilBatch); - - for (int i = 0; i < untilBatch; i++) { - Path maybe = domainLinkFileName(base, i); - if (Files.exists(maybe)) { - ret.add(maybe); - } - } - - return ret; - } -} diff --git a/code/process-models/processed-data/java/nu/marginalia/model/processed/DocumentRecord.java b/code/process-models/processed-data/java/nu/marginalia/model/processed/DocumentRecord.java deleted file mode 100644 index c90df7ee..00000000 --- a/code/process-models/processed-data/java/nu/marginalia/model/processed/DocumentRecord.java +++ /dev/null @@ -1,171 +0,0 @@ -package nu.marginalia.model.processed; - -import blue.strategic.parquet.Dehydrator; -import blue.strategic.parquet.Hydrator; -import blue.strategic.parquet.ValueWriter; -import gnu.trove.list.TLongList; -import gnu.trove.list.array.TLongArrayList; -import lombok.*; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.Types; -import org.jetbrains.annotations.NotNull; -import org.jetbrains.annotations.Nullable; - -import java.util.ArrayList; -import java.util.List; - -import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; -import 
static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*; - -@Getter -@Setter -@NoArgsConstructor -@AllArgsConstructor -@EqualsAndHashCode -@ToString -public class DocumentRecord { - @NotNull - public String domain; - @NotNull - public String url; - - public int ordinal; - - @NotNull - public String state; - @Nullable - public String stateReason; - - @Nullable - public String title; - @Nullable - public String description; - public int htmlFeatures; - @Nullable - public String htmlStandard; - - public int length; - public long hash; - public float quality; - - public long documentMetadata; - - @Nullable - public Integer pubYear; - - @Nullable - public List words; - @Nullable - public TLongList metas; - - public static Hydrator newHydrator() { - return new DocumentDataHydrator(); - } - - public static Dehydrator newDehydrator() { - return DocumentRecord::dehydrate; - } - - public static MessageType schema = new MessageType( - DocumentRecord.class.getSimpleName(), - Types.required(BINARY).as(stringType()).named("domain"), - Types.required(BINARY).as(stringType()).named("url"), - Types.required(INT32).named("ordinal"), - Types.required(BINARY).as(stringType()).named("state"), - Types.optional(BINARY).as(stringType()).named("stateReason"), - Types.optional(BINARY).as(stringType()).named("title"), - Types.optional(BINARY).as(stringType()).named("description"), - Types.optional(INT32).named("htmlFeatures"), - Types.optional(BINARY).as(stringType()).named("htmlStandard"), - Types.optional(INT64).named("hash"), - Types.optional(INT64).named("documentMetadata"), - Types.optional(INT32).named("length"), - Types.optional(FLOAT).named("quality"), - Types.optional(INT32).named("pubYear"), - Types.repeated(INT64).named("wordMeta"), - Types.repeated(BINARY).as(stringType()).named("word") - ); - - public DocumentRecord add(String heading, Object value) { - switch (heading) { - case "domain" -> domain = (String) value; - case "url" -> url = (String) value; - case "ordinal" -> ordinal = (Integer) value; - case "htmlFeatures" -> htmlFeatures = (Integer) value; - case "length" -> length = (Integer) value; - case "pubYear" -> pubYear = (Integer) value; - case "hash" -> hash = (Long) value; - case "documentMetadata" -> documentMetadata = (Long) value; - case "quality" -> quality = (Float) value; - case "state" -> state = (String) value; - case "stateReason" -> stateReason = (String) value; - case "title" -> title = (String) value; - case "description" -> description = (String) value; - case "htmlStandard" -> htmlStandard = (String) value; - case "word" -> { - if (this.words == null) - this.words = new ArrayList<>(100); - this.words.add((String) value); - } - case "wordMeta" -> { - if (this.metas == null) { - this.metas = new TLongArrayList(100); - } - this.metas.add((long) value); - } - default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"'); - } - return this; - } - - public void dehydrate(ValueWriter valueWriter) { - valueWriter.write("domain", domain); - valueWriter.write("url", url); - valueWriter.write("ordinal", ordinal); - valueWriter.write("state", state); - - if (stateReason != null) - valueWriter.write("stateReason", stateReason); - if (title != null) - valueWriter.write("title", title); - if (description != null) - valueWriter.write("description", description); - valueWriter.write("htmlFeatures", htmlFeatures); - valueWriter.write("htmlStandard", htmlStandard); - valueWriter.write("documentMetadata", documentMetadata); - valueWriter.write("length", length); - 
valueWriter.write("hash", hash); - valueWriter.write("quality", quality); - if (pubYear != null) { - valueWriter.write("pubYear", pubYear); - } - - if (metas != null) { - valueWriter.writeList("wordMeta", metas); - } - - if (words != null) { - valueWriter.writeList("word", words); - } - } - -} - -class DocumentDataHydrator implements Hydrator { - - @Override - public DocumentRecord start() { - return new DocumentRecord(); - } - - @Override - public DocumentRecord add(DocumentRecord target, String heading, Object value) { - return target.add(heading, value); - } - - @Override - public DocumentRecord finish(DocumentRecord target) { - return target; - } - -} diff --git a/code/process-models/processed-data/java/nu/marginalia/model/processed/DocumentRecordKeywordsProjection.java b/code/process-models/processed-data/java/nu/marginalia/model/processed/DocumentRecordKeywordsProjection.java deleted file mode 100644 index 411fd13c..00000000 --- a/code/process-models/processed-data/java/nu/marginalia/model/processed/DocumentRecordKeywordsProjection.java +++ /dev/null @@ -1,84 +0,0 @@ -package nu.marginalia.model.processed; - -import blue.strategic.parquet.Hydrator; -import gnu.trove.list.TLongList; -import gnu.trove.list.array.TLongArrayList; -import lombok.*; -import org.jetbrains.annotations.NotNull; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; - -@Getter -@Setter -@NoArgsConstructor -@AllArgsConstructor -@EqualsAndHashCode -@ToString -public class DocumentRecordKeywordsProjection { - @NotNull - public String domain; - - public int ordinal; - - public int htmlFeatures; - public long documentMetadata; - - public List words; - public TLongList metas; - - public boolean hasKeywords() { - return words != null && metas != null; - } - - public static Hydrator newHydrator() { - return new DocumentRecordKeywordsProjectionHydrator(); - } - - public static Collection requiredColumns() { - return List.of("domain", "ordinal", "htmlFeatures", "word", "wordMeta", "documentMetadata"); - } - - public DocumentRecordKeywordsProjection add(String heading, Object value) { - switch (heading) { - case "domain" -> domain = (String) value; - case "ordinal" -> ordinal = (Integer) value; - case "htmlFeatures" -> htmlFeatures = (Integer) value; - case "documentMetadata" -> documentMetadata = (Long) value; - case "word" -> { - if (this.words == null) - this.words = new ArrayList<>(100); - this.words.add((String) value); - } - case "wordMeta" -> { - if (this.metas == null) { - this.metas = new TLongArrayList(100); - } - this.metas.add((long) value); - } - default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"'); - } - return this; - } - -} - -class DocumentRecordKeywordsProjectionHydrator implements Hydrator { - - @Override - public DocumentRecordKeywordsProjection start() { - return new DocumentRecordKeywordsProjection(); - } - - @Override - public DocumentRecordKeywordsProjection add(DocumentRecordKeywordsProjection target, String heading, Object value) { - return target.add(heading, value); - } - - @Override - public DocumentRecordKeywordsProjection finish(DocumentRecordKeywordsProjection target) { - return target; - } - -} diff --git a/code/process-models/processed-data/java/nu/marginalia/model/processed/DocumentRecordMetadataProjection.java b/code/process-models/processed-data/java/nu/marginalia/model/processed/DocumentRecordMetadataProjection.java deleted file mode 100644 index ccad52e3..00000000 --- 
a/code/process-models/processed-data/java/nu/marginalia/model/processed/DocumentRecordMetadataProjection.java +++ /dev/null @@ -1,100 +0,0 @@ -package nu.marginalia.model.processed; - -import blue.strategic.parquet.Hydrator; -import lombok.*; -import org.jetbrains.annotations.NotNull; -import org.jetbrains.annotations.Nullable; - -import java.util.Collection; -import java.util.List; - -@Getter -@Setter -@NoArgsConstructor -@AllArgsConstructor -@EqualsAndHashCode -@ToString -public class DocumentRecordMetadataProjection { - @NotNull - public String domain; - @NotNull - public String url; - - public int ordinal; - - @NotNull - public String state; - @Nullable - public String stateReason; - - @Nullable - public String title; - @Nullable - public String description; - public int htmlFeatures; - @Nullable - public String htmlStandard; - - public int length; - public long hash; - public float quality; - - public long documentMetadata; - - @Nullable - public Integer pubYear; - - public static Collection requiredColumns() { - return List.of("domain", "url", "ordinal", "htmlFeatures", "length", "pubYear", - "hash", "documentMetadata", "quality", "state", "stateReason", - "title", "description", "htmlStandard"); - } - - public DocumentRecordMetadataProjection add(String heading, Object value) { - switch (heading) { - case "domain" -> domain = (String) value; - case "url" -> url = (String) value; - case "ordinal" -> ordinal = (Integer) value; - case "htmlFeatures" -> htmlFeatures = (Integer) value; - case "length" -> length = (Integer) value; - case "pubYear" -> pubYear = (Integer) value; - case "hash" -> hash = (Long) value; - case "documentMetadata" -> documentMetadata = (Long) value; - case "quality" -> quality = (Float) value; - case "state" -> state = (String) value; - case "stateReason" -> stateReason = (String) value; - case "title" -> title = (String) value; - case "description" -> description = (String) value; - case "htmlStandard" -> htmlStandard = (String) value; - - default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"'); - } - return this; - } - - public static Hydrator newHydrator() { - return new DocumentRecordMetadataHydrator(); - } - - - -} - -class DocumentRecordMetadataHydrator implements Hydrator { - - @Override - public DocumentRecordMetadataProjection start() { - return new DocumentRecordMetadataProjection(); - } - - @Override - public DocumentRecordMetadataProjection add(DocumentRecordMetadataProjection target, String heading, Object value) { - return target.add(heading, value); - } - - @Override - public DocumentRecordMetadataProjection finish(DocumentRecordMetadataProjection target) { - return target; - } - -} diff --git a/code/process-models/processed-data/java/nu/marginalia/model/processed/DomainLinkRecord.java b/code/process-models/processed-data/java/nu/marginalia/model/processed/DomainLinkRecord.java deleted file mode 100644 index 298d6192..00000000 --- a/code/process-models/processed-data/java/nu/marginalia/model/processed/DomainLinkRecord.java +++ /dev/null @@ -1,97 +0,0 @@ -package nu.marginalia.model.processed; - -import blue.strategic.parquet.Dehydrator; -import blue.strategic.parquet.Hydrator; -import blue.strategic.parquet.ValueWriter; -import lombok.*; -import org.apache.parquet.schema.MessageType; -import org.apache.parquet.schema.Types; -import org.jetbrains.annotations.NotNull; - -import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; -import static 
org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; - -@Getter -@Setter -@NoArgsConstructor -@AllArgsConstructor -@EqualsAndHashCode -public class DomainLinkRecord { - @NotNull - public String source; - - @NotNull - public String dest; - - public void dehydrate(ValueWriter valueWriter) { - valueWriter.write("source", source); - valueWriter.write("dest", dest); - } - - public static Dehydrator newDehydrator() { - return DomainLinkRecord::dehydrate; - } - - public static Hydrator newHydrator() { - return new DomainLinkDataHydrator(); - } - public static Hydrator newDestDomainHydrator() { - return new DestDomainNameHydrator(); - } - - public static MessageType schema = new MessageType( - DomainLinkRecord.class.getSimpleName(), - Types.required(BINARY).as(stringType()).named("source"), - Types.required(BINARY).as(stringType()).named("dest") - ); - - public DomainLinkRecord add(String heading, Object value) { - switch (heading) { - case "source" -> source = (String) value; - case "dest" -> dest = (String) value; - default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"'); - } - return this; - } - -} - -class DomainLinkDataHydrator implements Hydrator { - - @Override - public DomainLinkRecord start() { - return new DomainLinkRecord(); - } - - @Override - public DomainLinkRecord add(DomainLinkRecord target, String heading, Object value) { - return target.add(heading, value); - } - - @Override - public DomainLinkRecord finish(DomainLinkRecord target) { - return target; - } - -} - -class DestDomainNameHydrator implements Hydrator { - - @Override - public String start() { - return ""; - } - - @Override - public String add(String target, String heading, Object value) { - if ("dest".equals(heading)) { - return (String) value; - } - return target; - } - - @Override - public String finish(String target) { - return target; - } -} \ No newline at end of file diff --git a/code/process-models/processed-data/java/nu/marginalia/model/processed/DomainRecord.java b/code/process-models/processed-data/java/nu/marginalia/model/processed/DomainRecord.java deleted file mode 100644 index b696829f..00000000 --- a/code/process-models/processed-data/java/nu/marginalia/model/processed/DomainRecord.java +++ /dev/null @@ -1,148 +0,0 @@ -package nu.marginalia.model.processed; - -import blue.strategic.parquet.Dehydrator; -import blue.strategic.parquet.Hydrator; -import blue.strategic.parquet.ValueWriter; -import lombok.*; -import org.apache.parquet.schema.*; -import org.jetbrains.annotations.NotNull; -import org.jetbrains.annotations.Nullable; - -import java.util.ArrayList; -import java.util.List; - -import static org.apache.parquet.schema.LogicalTypeAnnotation.*; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY; -import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32; - -@Getter -@Setter -@NoArgsConstructor -@AllArgsConstructor -@EqualsAndHashCode -@ToString -public class DomainRecord { - @NotNull - public String domain; - - public int knownUrls; - public int goodUrls; - public int visitedUrls; - - @Nullable - public String state; - @Nullable - public String redirectDomain; - @Nullable - public String ip; - - public List rssFeeds; - - - public static Hydrator newHydrator() { - return new DomainHydrator(); - } - - public static Dehydrator newDehydrator() { - return DomainRecord::dehydrate; - } - - public static Hydrator newDomainNameHydrator() { - return new DomainWithIpHydrator(); - } - - - public static MessageType schema = 
new MessageType( - DomainRecord.class.getSimpleName(), - Types.required(BINARY).as(stringType()).named("domain"), - Types.optional(INT32).named("knownUrls"), - Types.optional(INT32).named("visitedUrls"), - Types.optional(INT32).named("goodUrls"), - Types.required(BINARY).as(stringType()).named("state"), - Types.optional(BINARY).as(stringType()).named("redirectDomain"), - Types.optional(BINARY).as(stringType()).named("ip"), - Types.repeated(BINARY).as(stringType()).named("rss") - ); - - DomainRecord add(String heading, Object value) { - switch (heading) { - case "domain" -> domain = (String) value; - case "knownUrls" -> knownUrls = (Integer) value; - case "visitedUrls" -> visitedUrls = (Integer) value; - case "goodUrls" -> goodUrls = (Integer) value; - case "state" -> state = (String) value; - case "redirectDomain" -> redirectDomain = (String) value; - case "ip" -> ip = (String) value; - case "rss" -> { - if (rssFeeds == null) { - rssFeeds = new ArrayList<>(); - } - rssFeeds.add((String) value); - } - default -> throw new UnsupportedOperationException("Unknown heading '" + heading + '"'); - } - return this; - } - - private void dehydrate(ValueWriter valueWriter) { - valueWriter.write("domain", domain); - valueWriter.write("knownUrls", knownUrls); - valueWriter.write("goodUrls", goodUrls); - valueWriter.write("visitedUrls", visitedUrls); - if (state != null) { - valueWriter.write("state", state); - } - if (redirectDomain != null) { - valueWriter.write("redirectDomain", redirectDomain); - } - if (ip != null) { - valueWriter.write("ip", ip); - } - if (rssFeeds != null) { - valueWriter.writeList("rss", rssFeeds); - } - } - -} - - -class DomainHydrator implements Hydrator { - @Override - public DomainRecord start() { - return new DomainRecord(); - } - - @Override - public DomainRecord add(DomainRecord target, String heading, Object value) { - return target.add(heading, value); - } - - @Override - public DomainRecord finish(DomainRecord target) { - return target; - } -} - -class DomainWithIpHydrator implements Hydrator { - - @Override - public DomainWithIp start() { - return new DomainWithIp(); - } - - @Override - public DomainWithIp add(DomainWithIp target, String heading, Object value) { - if ("domain".equals(heading)) { - target.domain = (String) value; - } - else if ("ip".equals(heading)) { - target.ip = (String) value; - } - return target; - } - - @Override - public DomainWithIp finish(DomainWithIp target) { - return target; - } -} \ No newline at end of file diff --git a/code/process-models/processed-data/java/nu/marginalia/model/processed/DomainWithIp.java b/code/process-models/processed-data/java/nu/marginalia/model/processed/DomainWithIp.java deleted file mode 100644 index 3782b1b2..00000000 --- a/code/process-models/processed-data/java/nu/marginalia/model/processed/DomainWithIp.java +++ /dev/null @@ -1,15 +0,0 @@ -package nu.marginalia.model.processed; - -import lombok.AllArgsConstructor; -import lombok.EqualsAndHashCode; -import lombok.NoArgsConstructor; -import lombok.ToString; - -@AllArgsConstructor -@NoArgsConstructor -@EqualsAndHashCode -@ToString -public class DomainWithIp { - public String domain; - public String ip; -} diff --git a/code/process-models/processed-data/readme.md b/code/process-models/processed-data/readme.md deleted file mode 100644 index e7f5cebb..00000000 --- a/code/process-models/processed-data/readme.md +++ /dev/null @@ -1,18 +0,0 @@ -The processed-data package contains models and logic for -reading and writing parquet files with the output from the 
-[converting-process](../../processes/converting-process). - -Main models: - -* [DocumentRecord](java/nu/marginalia/model/processed/DocumentRecord.java) -* * [DocumentRecordKeywordsProjection](java/nu/marginalia/model/processed/DocumentRecordKeywordsProjection.java) -* * [DocumentRecordMetadataProjection](java/nu/marginalia/model/processed/DocumentRecordMetadataProjection.java) -* [DomainLinkRecord](java/nu/marginalia/model/processed/DomainLinkRecord.java) -* [DomainRecord](java/nu/marginalia/model/processed/DomainRecord.java) - -Since parquet is a column based format, some of the readable models are projections -that only read parts of the input file. - -## See Also - -[third-party/parquet-floor](../../../third-party/parquet-floor) \ No newline at end of file diff --git a/code/process-models/processed-data/test/nu/marginalia/io/processed/DocumentRecordParquetFileReaderTest.java b/code/process-models/processed-data/test/nu/marginalia/io/processed/DocumentRecordParquetFileReaderTest.java deleted file mode 100644 index a358325a..00000000 --- a/code/process-models/processed-data/test/nu/marginalia/io/processed/DocumentRecordParquetFileReaderTest.java +++ /dev/null @@ -1,92 +0,0 @@ -package nu.marginalia.io.processed; - -import gnu.trove.list.array.TLongArrayList; -import nu.marginalia.model.processed.DocumentRecord; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.List; -import java.util.stream.IntStream; -import java.util.stream.LongStream; - -import static org.junit.jupiter.api.Assertions.*; - -class DocumentRecordParquetFileReaderTest { - Path parquetFile; - - @BeforeEach - public void setUp() throws IOException { - parquetFile = Files.createTempFile(getClass().getSimpleName(), ".parquet"); - } - - @AfterEach - public void tearDown() throws IOException { - Files.deleteIfExists(parquetFile); - } - - @Test - public void test() throws IOException { - var doc = new DocumentRecord( - "www.marginalia.nu", - "https://www.marginalia.nu/", - 0, - "OK", - null, - "Itsa me, Marginalia!", - "Hello World", - 3, - "HTML5", - 123, - 0xF00BA3L, - 0.25f, - 4L, - null, - List.of("Hello", "world"), - new TLongArrayList(new long[] { 2, 3}) - ); - - try (var writer = new DocumentRecordParquetFileWriter(parquetFile)) { - writer.write(doc); - } - - var read = DocumentRecordParquetFileReader.stream(parquetFile).toList(); - assertEquals(List.of(doc), read); - } - - @Test - public void testHugePayload() throws IOException { - List words = IntStream.range(0, 100000).mapToObj(Integer::toString).toList(); - TLongArrayList metas = new TLongArrayList(LongStream.range(0, 100000).toArray()); - - var doc = new DocumentRecord( - "www.marginalia.nu", - "https://www.marginalia.nu/", - 0, - "OK", - null, - "Itsa me, Marginalia!", - "Hello World", - 3, - "HTML5", - 123, - 0xF00BA3L, - 0.25f, - 5L, - null, - words, - metas - ); - - try (var writer = new DocumentRecordParquetFileWriter(parquetFile)) { - writer.write(doc); - } - - var read = DocumentRecordParquetFileReader.stream(parquetFile).toList(); - assertEquals(List.of(doc), read); - } - -} \ No newline at end of file diff --git a/code/process-models/processed-data/test/nu/marginalia/io/processed/DomainLinkRecordParquetFileReaderTest.java b/code/process-models/processed-data/test/nu/marginalia/io/processed/DomainLinkRecordParquetFileReaderTest.java deleted file mode 100644 index 274e80d0..00000000 --- 
a/code/process-models/processed-data/test/nu/marginalia/io/processed/DomainLinkRecordParquetFileReaderTest.java +++ /dev/null @@ -1,49 +0,0 @@ -package nu.marginalia.io.processed; - -import nu.marginalia.model.processed.DomainLinkRecord; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.List; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -class DomainLinkRecordParquetFileReaderTest { - Path parquetFile; - - @BeforeEach - public void setUp() throws IOException { - parquetFile = Files.createTempFile(getClass().getSimpleName(), ".parquet"); - } - - @AfterEach - public void tearDown() throws IOException { - Files.deleteIfExists(parquetFile); - } - - @Test - public void testReadFull() throws IOException { - var first = new DomainLinkRecord( - "www.marginalia.nu", - "memex.marginalia.nu"); - var second = new DomainLinkRecord( - "memex.marginalia.nu", - "search.marginalia.nu" - ); - - try (var writer = new DomainLinkRecordParquetFileWriter(parquetFile)) { - writer.write(first); - writer.write(second); - } - - var items = DomainLinkRecordParquetFileReader - .stream(parquetFile) - .toList(); - assertEquals(List.of(first, second), items); - } - -} \ No newline at end of file diff --git a/code/process-models/processed-data/test/nu/marginalia/io/processed/DomainRecordParquetFileReaderTest.java b/code/process-models/processed-data/test/nu/marginalia/io/processed/DomainRecordParquetFileReaderTest.java deleted file mode 100644 index b1867100..00000000 --- a/code/process-models/processed-data/test/nu/marginalia/io/processed/DomainRecordParquetFileReaderTest.java +++ /dev/null @@ -1,69 +0,0 @@ -package nu.marginalia.io.processed; - -import nu.marginalia.model.processed.DomainRecord; -import nu.marginalia.model.processed.DomainWithIp; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.List; - -import static org.junit.jupiter.api.Assertions.*; - -class DomainRecordParquetFileReaderTest { - Path parquetFile; - - @BeforeEach - public void setUp() throws IOException { - parquetFile = Files.createTempFile(getClass().getSimpleName(), ".parquet"); - } - - @AfterEach - public void tearDown() throws IOException { - Files.deleteIfExists(parquetFile); - } - - @Test - public void testReadFull() throws IOException { - var first = new DomainRecord( - "www.marginalia.nu", - 10, - 3, - 5, - "'sall good man", - null, - "127.0.0.1", - List.of("a", "b") - ); - var second = new DomainRecord( - "memex.marginalia.nu", - 0, - 0, - 0, - "REDIRECT", - "www.marginalia.nu", - "127.0.0.1", - null - ); - - try (var writer = new DomainRecordParquetFileWriter(parquetFile)) { - writer.write(first); - writer.write(second); - } - - var domainInfo = DomainRecordParquetFileReader.getBasicDomainInformation(parquetFile); - assertEquals(List.of( - new DomainWithIp("www.marginalia.nu", "127.0.0.1"), - new DomainWithIp("memex.marginalia.nu", "127.0.0.1")), - domainInfo); - - var items = DomainRecordParquetFileReader - .stream(parquetFile) - .toList(); - assertEquals(List.of(first, second), items); - } - -} \ No newline at end of file diff --git a/code/process-models/work-log/build.gradle b/code/process-models/work-log/build.gradle deleted file mode 100644 index 76fe01f9..00000000 --- 
a/code/process-models/work-log/build.gradle +++ /dev/null @@ -1,24 +0,0 @@ -plugins { - id 'java' - - - id 'jvm-test-suite' -} - -java { - toolchain { - languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion)) - } -} -apply from: "$rootProject.projectDir/srcsets.gradle" - -dependencies { - implementation libs.bundles.slf4j - - implementation libs.notnull - - testImplementation libs.bundles.slf4j.test - testImplementation libs.bundles.junit - testImplementation libs.mockito -} - diff --git a/code/processes/converting-process/build.gradle b/code/processes/converting-process/build.gradle index 65ca316a..48c7a878 100644 --- a/code/processes/converting-process/build.gradle +++ b/code/processes/converting-process/build.gradle @@ -25,9 +25,10 @@ dependencies { implementation project(':third-party:porterstemmer') implementation project(':third-party:count-min-sketch') + implementation project(':third-party:parquet-floor') implementation project(':code:index:api') - implementation project(':code:process-mqapi') + implementation project(':code:processes:process-mq-api') implementation project(':code:common:model') implementation project(':code:common:db') @@ -38,29 +39,24 @@ dependencies { implementation project(':code:libraries:guarded-regex') implementation project(':code:libraries:easy-lsh') + implementation project(':code:libraries:coded-sequence') implementation project(':code:libraries:geo-ip') implementation project(':code:libraries:language-processing') - implementation project(':code:process-models:processed-data') - implementation project(':code:process-models:work-log') - implementation project(':code:process-models:crawling-model') + implementation project(':code:processes:converting-process:model') + implementation project(':code:processes:crawling-process:model') - implementation project(':code:features-convert:adblock') - implementation project(':code:features-convert:anchor-keywords') - implementation project(':code:features-convert:topic-detection') - implementation project(':code:features-convert:pubdate') - implementation project(':code:features-convert:keyword-extraction') - implementation project(':code:features-convert:summary-extraction') - implementation project(':code:features-convert:stackexchange-xml') - implementation project(':code:features-convert:reddit-json') + implementation project(':code:processes:converting-process:ft-anchor-keywords') + implementation project(':code:processes:converting-process:ft-keyword-extraction') - implementation project(':code:features-crawl:crawl-blocklist') - implementation project(':code:features-crawl:link-parser') - implementation project(':code:features-crawl:content-type') + implementation project(':code:processes:crawling-process:ft-crawl-blocklist') + implementation project(':code:processes:crawling-process:ft-link-parser') + implementation project(':code:processes:crawling-process:ft-content-type') testImplementation project(':code:libraries:term-frequency-dict') - testImplementation project(':code:process-models:crawl-spec') + testImplementation project(':code:processes:crawling-process:model') + implementation libs.slop implementation libs.bundles.slf4j implementation libs.notnull @@ -80,6 +76,7 @@ dependencies { implementation libs.bundles.mariadb implementation libs.bundles.nlp + implementation libs.roaringbitmap implementation libs.trove implementation libs.fastutil diff --git a/code/features-convert/anchor-keywords/build.gradle b/code/processes/converting-process/ft-anchor-keywords/build.gradle similarity 
index 92% rename from code/features-convert/anchor-keywords/build.gradle rename to code/processes/converting-process/ft-anchor-keywords/build.gradle index 1c25bd2e..7572cce0 100644 --- a/code/features-convert/anchor-keywords/build.gradle +++ b/code/processes/converting-process/ft-anchor-keywords/build.gradle @@ -17,7 +17,7 @@ dependencies { implementation project(':code:common:model') implementation project(':code:common:db') implementation project(':code:common:process') - implementation project(':code:features-convert:keyword-extraction') + implementation project(':code:processes:converting-process:ft-keyword-extraction') implementation project(':code:libraries:language-processing') implementation project(':code:libraries:term-frequency-dict') diff --git a/code/features-convert/anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java similarity index 51% rename from code/features-convert/anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java rename to code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java index 95e37836..2e0b6bd7 100644 --- a/code/features-convert/anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java +++ b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/AnchorTextKeywords.java @@ -2,8 +2,11 @@ package nu.marginalia.atags; import com.google.inject.Inject; import nu.marginalia.atags.model.DomainLinks; -import nu.marginalia.keyword.KeywordExtractor; +import nu.marginalia.atags.model.Link; +import nu.marginalia.keyword.LinkTexts; +import nu.marginalia.language.model.DocumentSentence; import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.language.sentence.tag.HtmlTag; import nu.marginalia.model.EdgeUrl; import java.io.BufferedReader; @@ -12,14 +15,12 @@ import java.io.InputStreamReader; import java.util.*; public class AnchorTextKeywords { - private final KeywordExtractor keywordExtractor; private final SentenceExtractor sentenceExtractor; private final Set stopList; + @Inject - public AnchorTextKeywords(KeywordExtractor keywordExtractor, - SentenceExtractor sentenceExtractor) + public AnchorTextKeywords(SentenceExtractor sentenceExtractor) { - this.keywordExtractor = keywordExtractor; this.sentenceExtractor = sentenceExtractor; stopList = readStoplist(); @@ -29,7 +30,7 @@ public class AnchorTextKeywords { Set ret = new HashSet<>(); try (var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("atags-stop-list"), - "Could not load word frequency table"); + "Could not load anchor tags stop list"); var br = new BufferedReader(new InputStreamReader(resource)) ) { while (true) { @@ -46,29 +47,40 @@ public class AnchorTextKeywords { return ret; } - public Map getAnchorTextKeywords(DomainLinks links, EdgeUrl url) { - var keywordsRaw = links.forUrl(url); + public LinkTexts getAnchorTextKeywords(DomainLinks links, EdgeUrl url) { + List keywordsRaw = links.forUrl(url); + + List ret = new ArrayList<>(keywordsRaw.size()); // Extract and count keywords from anchor text - Map wordsWithCount = new HashMap<>(); - for (var keyword : keywordsRaw) { + for (Link keyword : keywordsRaw) { if (stopList.contains(keyword.text().toLowerCase())) continue; - var sentence = sentenceExtractor.extractSentence(keyword.text()); - for (var wordSpan : keywordExtractor.getKeywordsFromSentence(sentence)) { - 
wordsWithCount.merge(sentence.constructWordFromSpan(wordSpan), 1, Integer::sum);
-            }
+            var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
+            ret.add(sentence);
         }
 
-        // Filter out keywords that appear infrequently
-        final Map<String, Integer> keywords = new HashMap<>(wordsWithCount.size());
-        for (var wordEntry : wordsWithCount.entrySet()) {
-            if (wordEntry.getValue() > 2) {
-                keywords.put(wordEntry.getKey(), wordEntry.getValue());
-            }
+        return new LinkTexts(ret);
+    }
+
+    public LinkTexts getAnchorTextKeywords(DomainLinks links, List<EdgeUrl> urls) {
+        List<Link> keywordsRaw = new ArrayList<>();
+        for (var url : urls) {
+            keywordsRaw.addAll(links.forUrl(url));
         }
 
-        return keywords;
+        List<DocumentSentence> ret = new ArrayList<>(keywordsRaw.size());
+
+        // Extract and count keywords from anchor text
+        for (Link keyword : keywordsRaw) {
+            if (stopList.contains(keyword.text().toLowerCase()))
+                continue;
+
+            var sentence = sentenceExtractor.extractSentence(keyword.text(), EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));
+            ret.add(sentence);
+        }
+
+        return new LinkTexts(ret);
     }
 }
diff --git a/code/features-convert/anchor-keywords/java/nu/marginalia/atags/model/DomainLinks.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/DomainLinks.java
similarity index 100%
rename from code/features-convert/anchor-keywords/java/nu/marginalia/atags/model/DomainLinks.java
rename to code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/DomainLinks.java
diff --git a/code/features-convert/anchor-keywords/java/nu/marginalia/atags/model/Link.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/Link.java
similarity index 100%
rename from code/features-convert/anchor-keywords/java/nu/marginalia/atags/model/Link.java
rename to code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/Link.java
diff --git a/code/features-convert/anchor-keywords/java/nu/marginalia/atags/model/LinkWithText.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/LinkWithText.java
similarity index 100%
rename from code/features-convert/anchor-keywords/java/nu/marginalia/atags/model/LinkWithText.java
rename to code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/model/LinkWithText.java
diff --git a/code/features-convert/anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java
similarity index 100%
rename from code/features-convert/anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java
rename to code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsImpl.java
diff --git a/code/features-convert/anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsSource.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsSource.java
similarity index 100%
rename from code/features-convert/anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsSource.java
rename to code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsSource.java
diff --git a/code/features-convert/anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsSourceFactory.java b/code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsSourceFactory.java
similarity index 100%
rename from
code/features-convert/anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsSourceFactory.java rename to code/processes/converting-process/ft-anchor-keywords/java/nu/marginalia/atags/source/AnchorTagsSourceFactory.java diff --git a/code/features-convert/anchor-keywords/resources/atags-stop-list b/code/processes/converting-process/ft-anchor-keywords/resources/atags-stop-list similarity index 100% rename from code/features-convert/anchor-keywords/resources/atags-stop-list rename to code/processes/converting-process/ft-anchor-keywords/resources/atags-stop-list diff --git a/code/features-convert/anchor-keywords/test/nu/marginalia/atags/DomainAnchorTagsImplTest.java b/code/processes/converting-process/ft-anchor-keywords/test/nu/marginalia/atags/DomainAnchorTagsImplTest.java similarity index 93% rename from code/features-convert/anchor-keywords/test/nu/marginalia/atags/DomainAnchorTagsImplTest.java rename to code/processes/converting-process/ft-anchor-keywords/test/nu/marginalia/atags/DomainAnchorTagsImplTest.java index 17443c51..143759ca 100644 --- a/code/features-convert/anchor-keywords/test/nu/marginalia/atags/DomainAnchorTagsImplTest.java +++ b/code/processes/converting-process/ft-anchor-keywords/test/nu/marginalia/atags/DomainAnchorTagsImplTest.java @@ -1,11 +1,9 @@ package nu.marginalia.atags; import nu.marginalia.atags.source.AnchorTagsImpl; -import nu.marginalia.keyword.KeywordExtractor; import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.segmentation.NgramLexicon; import nu.marginalia.util.TestLanguageModels; import org.junit.jupiter.api.Test; @@ -39,7 +37,6 @@ class DomainAnchorTagsImplTest { System.out.println(tags.forUrl(new EdgeUrl("http://www.chiark.greenend.org.uk/~sgtatham/putt"))); var atagsKeywords = new AnchorTextKeywords( - new KeywordExtractor(), new SentenceExtractor( TestLanguageModels.getLanguageModels() ) diff --git a/code/features-convert/anchor-keywords/test/nu/marginalia/util/TestLanguageModels.java b/code/processes/converting-process/ft-anchor-keywords/test/nu/marginalia/util/TestLanguageModels.java similarity index 100% rename from code/features-convert/anchor-keywords/test/nu/marginalia/util/TestLanguageModels.java rename to code/processes/converting-process/ft-anchor-keywords/test/nu/marginalia/util/TestLanguageModels.java diff --git a/code/features-convert/keyword-extraction/build.gradle b/code/processes/converting-process/ft-keyword-extraction/build.gradle similarity index 87% rename from code/features-convert/keyword-extraction/build.gradle rename to code/processes/converting-process/ft-keyword-extraction/build.gradle index 67da01f4..384d415b 100644 --- a/code/features-convert/keyword-extraction/build.gradle +++ b/code/processes/converting-process/ft-keyword-extraction/build.gradle @@ -14,9 +14,11 @@ apply from: "$rootProject.projectDir/srcsets.gradle" dependencies { implementation project(':third-party:porterstemmer') + implementation project(':third-party:parquet-floor') implementation project(':code:common:model') implementation project(':code:common:config') implementation project(':code:libraries:language-processing') + implementation project(':code:libraries:coded-sequence') implementation project(':code:libraries:term-frequency-dict') implementation libs.bundles.slf4j @@ -24,6 +26,7 @@ dependencies { implementation libs.notnull implementation libs.jsoup + implementation libs.roaringbitmap implementation libs.commons.lang3 implementation libs.guava 
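
To make the anchor-keywords rework above concrete: anchor texts are no longer reduced to a count-filtered keyword map; each link text is parsed into a DocumentSentence tagged EXTERNAL_LINKTEXT and bundled into a LinkTexts object that keyword extraction consumes. A minimal usage sketch, assuming the classes as they appear in this patch (SentenceExtractor, HtmlTag, LinkTexts, DocumentSentence) and WmsaHome-provided language models; the class name and sample string are made up for the example:

    import nu.marginalia.WmsaHome;
    import nu.marginalia.keyword.LinkTexts;
    import nu.marginalia.language.model.DocumentSentence;
    import nu.marginalia.language.sentence.SentenceExtractor;
    import nu.marginalia.language.sentence.tag.HtmlTag;

    import java.util.EnumSet;
    import java.util.List;

    class LinkTextSketch {
        public static void main(String[] args) {
            var sentenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels());

            // Each anchor text becomes one DocumentSentence, tagged as external link
            // text so downstream extraction can attribute its positions separately.
            DocumentSentence sentence = sentenceExtractor.extractSentence(
                    "Marginalia search engine",
                    EnumSet.of(HtmlTag.EXTERNAL_LINKTEXT));

            LinkTexts linkTexts = new LinkTexts(List.of(sentence));
            System.out.println(linkTexts);
        }
    }

The same LinkTexts value is what DocumentKeywordExtractor.extractKeywords, added below, receives as its linkTexts argument.
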
diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java new file mode 100644 index 00000000..c60cf34c --- /dev/null +++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/DocumentKeywordExtractor.java @@ -0,0 +1,272 @@ +package nu.marginalia.keyword; + +import com.google.inject.Inject; +import nu.marginalia.WmsaHome; +import nu.marginalia.keyword.extractors.*; +import nu.marginalia.keyword.model.DocumentKeywordsBuilder; +import nu.marginalia.language.model.DocumentLanguageData; +import nu.marginalia.language.model.DocumentSentence; +import nu.marginalia.language.model.WordRep; +import nu.marginalia.language.sentence.tag.HtmlTag; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.term_frequency_dict.TermFrequencyDict; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.Comparator; +import java.util.List; +import java.util.stream.Stream; + +public class DocumentKeywordExtractor { + + private final KeywordExtractor keywordExtractor; + private final TermFrequencyDict dict; + + + @Inject + public DocumentKeywordExtractor(TermFrequencyDict dict) { + this.dict = dict; + this.keywordExtractor = new KeywordExtractor(); + } + + // for tests + public DocumentKeywordExtractor() { + this.dict = new TermFrequencyDict(WmsaHome.getLanguageModels()); + this.keywordExtractor = new KeywordExtractor(); + } + + + public DocumentKeywordsBuilder extractKeywords(DocumentLanguageData dld, LinkTexts linkTexts, EdgeUrl url) { + + var tfIdfCounts = new WordsTfIdfCounts(dict, keywordExtractor, dld); + + var titleKeywords = new TitleKeywords(keywordExtractor, dld); + var nameLikeKeywords = new NameLikeKeywords(keywordExtractor, dld, 2); + var subjectLikeKeywords = new SubjectLikeKeywords(keywordExtractor, tfIdfCounts, dld); + var artifactKeywords = new ArtifactKeywords(dld); + var urlKeywords = new UrlKeywords(url); + + var keywordMetadata = KeywordMetadata.builder() + .titleKeywords(titleKeywords) + .nameLikeKeywords(nameLikeKeywords) + .subjectLikeKeywords(subjectLikeKeywords) + .urlKeywords(urlKeywords) + .build(); + + DocumentKeywordsBuilder wordsBuilder = new DocumentKeywordsBuilder(); + + createSimpleWords(wordsBuilder, keywordMetadata, dld, linkTexts); + + createNGramTermsFromSet(wordsBuilder, keywordMetadata, titleKeywords); + createNGramTermsFromSet(wordsBuilder, keywordMetadata, subjectLikeKeywords); + createNGramTermsFromSet(wordsBuilder, keywordMetadata, nameLikeKeywords); + + var importantWords = getImportantWords(tfIdfCounts, nameLikeKeywords, subjectLikeKeywords, wordsBuilder); + + wordsBuilder.addImportantWords(importantWords); + wordsBuilder.addAllSyntheticTerms(artifactKeywords.getWords()); + + return wordsBuilder; + } + + private static Collection getImportantWords(WordsTfIdfCounts tfIdfCounts, NameLikeKeywords nameLikeKeywords, SubjectLikeKeywords subjectLikeKeywords, DocumentKeywordsBuilder wordsBuilder) { + return Stream.of(nameLikeKeywords, subjectLikeKeywords) + .flatMap(k -> k.getReps().stream()) + .filter(w -> { + if (w.word.length() < 3) + return false; + if (w.word.contains("_")) + return false; + return true; + }) + .sorted(tfIdfCounts.reversed()) + .limit(16) + .filter(w -> tfIdfCounts.termFrequencyDictValue(w) > 100) + .sorted(Comparator.comparing(tfIdfCounts::termFrequencyDictValue)) + .limit(6) + .map(w -> w.word) + .toList(); + } + + 
private void createNGramTermsFromSet(DocumentKeywordsBuilder wordsBuilder, + KeywordMetadata metadata, + WordReps words) { + for (var rep : words.getReps()) { + var word = rep.word; + + if (!word.isBlank()) { + byte meta = metadata.getMetadataForWord(rep.stemmed); + wordsBuilder.addMeta(word, meta); + } + } + } + + private void createSimpleWords(DocumentKeywordsBuilder wordsBuilder, + KeywordMetadata metadata, + DocumentLanguageData dld, + LinkTexts linkTexts) + { + // we use 1-based indexing since the data + // will be gamma encoded, and it can't represent 0 + int pos = 0; + + List<SpanRecorder> spanRecorders = new ArrayList<>(); + for (var htmlTag : HtmlTag.includedTags) { + if (!htmlTag.exclude) { + spanRecorders.add(new SpanRecorder(htmlTag)); + } + } + + for (DocumentSentence sent : dld) { + for (var word : sent) { + pos++; + + for (var recorder : spanRecorders) { + recorder.update(sent, pos); + } + + if (word.isStopWord()) { + continue; + } + + String w = word.wordLowerCase(); + if (matchesWordPattern(w)) { + /* Add information about term positions */ + wordsBuilder.addPos(w, pos); + + /* Add metadata for word */ + wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed())); + } + } + + for (var names : keywordExtractor.getProperNames(sent)) { + var rep = new WordRep(sent, names); + + byte meta = metadata.getMetadataForWord(rep.stemmed); + + wordsBuilder.addMeta(rep.word, meta); + } + } + + pos++; // we need to add one more position to account for the last word in the document + + for (var recorder : spanRecorders) { + wordsBuilder.addSpans(recorder.finish(pos)); + + // reset the recorder, so we can use it again without adding the same positions twice + recorder.reset(); + } + + // Next add synthetic positions to the document for anchor texts + + pos += 2; // add some padding to the end of the document before we start adding a-tag words + + for (var linkText : linkTexts) { + + for (var word : linkText) { + pos++; + + if (word.isStopWord()) { + continue; + } + + String w = word.wordLowerCase(); + if (matchesWordPattern(w)) { + /* Add information about term positions */ + wordsBuilder.addPos(w, pos); + + /* Add metadata for word */ + wordsBuilder.addMeta(w, metadata.getMetadataForWord(word.stemmed())); + } + } + + // add some padding between separate link texts so we don't match across their boundaries + pos+=2; + } + + for (var recorder : spanRecorders) { + wordsBuilder.addSpans(recorder.finish(pos)); + } + } + + boolean matchesWordPattern(String s) { + // this function is an unrolled version of the regexp [\da-zA-Z]{1,15}([.\-_/:+*][\da-zA-Z]{1,10}){0,4} + + String wordPartSeparator = ".-_/:+*"; + + int i = 0; + + for (int run = 0; run < 15 && i < s.length(); run++, i++) { + char c = s.charAt(i); + if (c >= 'a' && c <= 'z') continue; + if (c >= 'A' && c <= 'Z') continue; + if (c >= '0' && c <= '9') continue; + break; + } + + if (i == 0) + return false; + + for (int j = 0; j < 5; j++) { + if (i == s.length()) return true; + + if (wordPartSeparator.indexOf(s.charAt(i)) < 0) { + return false; + } + + i++; + + for (int run = 0; run < 10 && i < s.length(); run++, i++) { + char c = s.charAt(i); + if (c >= 'a' && c <= 'z') continue; + if (c >= 'A' && c <= 'Z') continue; + if (c >= '0' && c <= '9') continue; + break; + } + } + + return false; + } + + /** Helper class to record spans of words */ + private static class SpanRecorder { + private List<DocumentKeywordsBuilder.DocumentWordSpan> spans = new ArrayList<>(); + private final HtmlTag htmlTag; + private int start = 0; + + public SpanRecorder(HtmlTag htmlTag) { + this.htmlTag = htmlTag; + } + + public void update(DocumentSentence sentence, int pos) { + assert pos > 0; + + if ( + sentence.htmlTags.contains(htmlTag) + || (sentence.htmlTags.isEmpty() && htmlTag == HtmlTag.BODY) // special case for body tag, we match against no tag on the sentence + ) + { + if (start <= 0) start = pos; + } + else { + if (start > 0) { + spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, pos)); + start = 0; + } + } + } + + public List<DocumentKeywordsBuilder.DocumentWordSpan> finish(int length) { + if (start > 0) { + spans.add(new DocumentKeywordsBuilder.DocumentWordSpan(htmlTag, start, length)); + start = 0; + } + return spans; + } + + public void reset() { + spans.clear(); + start = 0; + } + } +}
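As a quick orientation aid (not part of the patch): matchesWordPattern above accepts up to fifteen leading alphanumerics followed by at most four separator-joined runs of up to ten alphanumerics each. The sketch below mirrors the assertions in DocumentKeywordExtractorTest further down in this change set; the demo class itself is hypothetical.

    package nu.marginalia.keyword; // same package, so the package-private method is visible

    class MatchesWordPatternDemo {
        public static void main(String[] args) {
            // The no-arg constructor loads language models via WmsaHome, as the tests do
            var extractor = new DocumentKeywordExtractor();

            System.out.println(extractor.matchesWordPattern("test"));             // true: a single alphanumeric run
            System.out.println(extractor.matchesWordPattern("std::vector"));      // true: adjacent separators are tolerated
            System.out.println(extractor.matchesWordPattern("192.168.1.100/24")); // true: four separator groups is the maximum
            System.out.println(extractor.matchesWordPattern("1234567890abcdef")); // false: leading run exceeds 15 characters
        }
    }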
diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/KeywordExtractor.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/KeywordExtractor.java similarity index 84% rename from code/features-convert/keyword-extraction/java/nu/marginalia/keyword/KeywordExtractor.java rename to code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/KeywordExtractor.java index e1990618..babd44d7 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/KeywordExtractor.java +++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/KeywordExtractor.java @@ -3,7 +3,6 @@ package nu.marginalia.keyword; import nu.marginalia.language.WordPatterns; import nu.marginalia.language.model.DocumentSentence; import nu.marginalia.language.model.WordSpan; -import nu.marginalia.language.model.WordSeparator; import java.lang.ref.SoftReference; import java.util.ArrayList; @@ -20,15 +19,15 @@ public class KeywordExtractor { } for (int i = 1; i < sentence.length(); i++) { - if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; } + if (sentence.isSeparatorComma(i-1)) { continue; } if (isProperNoun(i, sentence) && isProperNoun(i-1, sentence)) spans.add(new WordSpan(i-1, i+1)); } for (int i = 2; i < sentence.length(); i++) { - if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; } - if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; } + if (sentence.isSeparatorComma(i-2)) { continue; } + if (sentence.isSeparatorComma(i-1)) { i++; continue; } if (isProperNoun(i, sentence) && (isJoiner(sentence, i-1) || isProperNoun(i-1, sentence)) @@ -37,9 +36,9 @@ public class KeywordExtractor { for (int i = 3; i < sentence.length(); i++) { - if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; } - if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; } - if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; } + if (sentence.isSeparatorComma(i-3)) { continue; } + if (sentence.isSeparatorComma(i-2)) { i++; continue; } + if (sentence.isSeparatorComma(i-1)) { i+=2; continue; } if (isProperNoun(i, sentence) && isProperNoun(i-3, sentence)) { if (isProperNoun(i - 1, sentence) && isProperNoun(i - 2, sentence)) @@ -66,7 +65,7 @@ public class KeywordExtractor { } for (int i = 1; i < sentence.length(); i++) { - if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; } + if (sentence.isSeparatorComma(i-1)) { continue; } if (isNoun(i, sentence) && (isNoun(i-1, sentence)) || "JJ".equals(sentence.posTags[i-1])) { @@ -75,8 +74,8 @@ public class KeywordExtractor { for (int i = 2; i < sentence.length(); i++) { - if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; } - if (sentence.separators[i-1] == WordSeparator.COMMA) { i++;
continue; } + if (sentence.isSeparatorComma(i-2)) { continue; } + if (sentence.isSeparatorComma(i-1)) { i++; continue; } if ((isNoun(i, sentence)) && (isJoiner(sentence, i-1) || isNoun(i-1, sentence)) @@ -85,9 +84,9 @@ public class KeywordExtractor { } for (int i = 3; i < sentence.length(); i++) { - if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; } - if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; } - if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; } + if (sentence.isSeparatorComma(i-3)) { continue; } + if (sentence.isSeparatorComma(i-2)) { i++; continue; } + if (sentence.isSeparatorComma(i-1)) { i+=2; continue; } if (isNoun(i, sentence) && (isNoun(i-3, sentence) || "JJ".equals(sentence.posTags[i-3]))) { if (isNoun(i - 1, sentence) && isNoun(i - 2, sentence)) @@ -119,7 +118,7 @@ public class KeywordExtractor { } for (int i = 1; i < sentence.length(); i++) { - if (sentence.separators[i-1] == WordSeparator.COMMA) { continue; } + if (sentence.isSeparatorComma(i-1)) { continue; } if (isName(i, sentence)) { if (isName(i - 1, sentence) || isTopAdj(i-1, sentence)) @@ -131,8 +130,8 @@ public class KeywordExtractor { } for (int i = 2; i < sentence.length(); i++) { - if (sentence.separators[i-1] == WordSeparator.COMMA) { i++; continue; } - if (sentence.separators[i-2] == WordSeparator.COMMA) { continue; } + if (sentence.isSeparatorComma(i-1)) { i++; continue; } + if (sentence.isSeparatorComma(i-2)) { continue; } if (isName(i, sentence)) { if ((isName(i-1, sentence) || isTopAdj(i-1, sentence)) @@ -149,9 +148,9 @@ public class KeywordExtractor { } for (int i = 3; i < sentence.length(); i++) { - if (sentence.separators[i-1] == WordSeparator.COMMA) { i+=2; continue; } - if (sentence.separators[i-2] == WordSeparator.COMMA) { i++; continue; } - if (sentence.separators[i-3] == WordSeparator.COMMA) { continue; } + if (sentence.isSeparatorComma(i-1)) { i+=2; continue; } + if (sentence.isSeparatorComma(i-2)) { i++; continue; } + if (sentence.isSeparatorComma(i-3)) { continue; } if (isName(i, sentence) && (isName(i-1, sentence) || isTopAdj(i-1, sentence)) && @@ -217,7 +216,7 @@ public class KeywordExtractor { private boolean isViableSpanForWord(DocumentSentence sentence, WordSpan w) { for (int i = w.start; i < w.end-1; i++) { - if (sentence.separators[i] == WordSeparator.COMMA) { + if (sentence.isSeparatorComma(i)) { return false; } } diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java new file mode 100644 index 00000000..b27e0676 --- /dev/null +++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/KeywordMetadata.java @@ -0,0 +1,57 @@ +package nu.marginalia.keyword; + +import lombok.Builder; +import nu.marginalia.keyword.extractors.NameLikeKeywords; +import nu.marginalia.keyword.extractors.SubjectLikeKeywords; +import nu.marginalia.keyword.extractors.TitleKeywords; +import nu.marginalia.keyword.extractors.UrlKeywords; +import nu.marginalia.model.idx.WordFlags; + +class KeywordMetadata { + + private final TitleKeywords titleKeywords; + private final NameLikeKeywords nameLikeKeywords; + private final SubjectLikeKeywords subjectLikeKeywords; + private final UrlKeywords urlKeywords; + + @Builder + public KeywordMetadata( + TitleKeywords titleKeywords, + NameLikeKeywords nameLikeKeywords, + SubjectLikeKeywords subjectLikeKeywords, + UrlKeywords 
urlKeywords) + { + this.titleKeywords = titleKeywords; + this.nameLikeKeywords = nameLikeKeywords; + this.subjectLikeKeywords = subjectLikeKeywords; + this.urlKeywords = urlKeywords; + } + + public byte getMetadataForWord(String stemmed) { + + byte flags = 0; + + if (subjectLikeKeywords.contains(stemmed)) { + flags |= WordFlags.Subjects.asBit(); + } + + if (nameLikeKeywords.contains(stemmed)) { + flags |= WordFlags.NamesWords.asBit(); + } + + if (titleKeywords.contains(stemmed)) { + flags |= WordFlags.Title.asBit(); + } + + if (urlKeywords.containsUrl(stemmed)) { + flags |= WordFlags.UrlPath.asBit(); + } + + if (urlKeywords.containsDomain(stemmed)) { + flags |= WordFlags.UrlDomain.asBit(); + } + + return flags; + } + +} diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/LinkTexts.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/LinkTexts.java new file mode 100644 index 00000000..c1ade6b4 --- /dev/null +++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/LinkTexts.java @@ -0,0 +1,19 @@ +package nu.marginalia.keyword; + +import nu.marginalia.language.model.DocumentSentence; +import org.jetbrains.annotations.NotNull; + +import java.util.Iterator; +import java.util.List; + +public record LinkTexts(List<DocumentSentence> linkTexts) implements Iterable<DocumentSentence> { + public LinkTexts() { + this(List.of()); + } + + @NotNull + @Override + public Iterator<DocumentSentence> iterator() { + return linkTexts.iterator(); + } +} diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/WordReps.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/WordReps.java similarity index 100% rename from code/features-convert/keyword-extraction/java/nu/marginalia/keyword/WordReps.java rename to code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/WordReps.java diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/ArtifactKeywords.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/extractors/ArtifactKeywords.java similarity index 88% rename from code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/ArtifactKeywords.java rename to code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/extractors/ArtifactKeywords.java index fd66bed2..d8341731 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/ArtifactKeywords.java +++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/extractors/ArtifactKeywords.java @@ -2,7 +2,9 @@ package nu.marginalia.keyword.extractors; import nu.marginalia.language.model.DocumentLanguageData; -import java.util.*; +import java.util.Collection; +import java.util.HashSet; +import java.util.Set; import java.util.regex.Pattern; public class ArtifactKeywords { @@ -16,9 +18,8 @@ public class ArtifactKeywords { public ArtifactKeywords(DocumentLanguageData documentLanguageData) { - for (var sent : documentLanguageData.sentences) { - for (var word : sent) { - final String lc = word.wordLowerCase(); + for (var sent : documentLanguageData) { + for (String lc : sent.wordsLowerCase) { final int atIdx = lc.indexOf('@'); if (lc.length() < 6 || atIdx < 0 || !mailLikePattern.matcher(lc).matches()) {
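A note on how the new LinkTexts type above is meant to be used (a rough sketch under assumptions, not part of the patch): the converter now feeds anchor-text sentences directly into keyword extraction instead of decorating documents afterwards, cf. the DocumentProcessor changes later in this diff. The class name and HTML string below are placeholders.

    package nu.marginalia.keyword; // hypothetical usage sketch

    import nu.marginalia.WmsaHome;
    import nu.marginalia.language.sentence.SentenceExtractor;
    import nu.marginalia.model.EdgeUrl;
    import org.jsoup.Jsoup;

    class LinkTextsUsageSketch {
        public static void main(String[] args) throws Exception {
            var se = new SentenceExtractor(WmsaHome.getLanguageModels());
            var extractor = new DocumentKeywordExtractor();

            var dld = se.extractSentences(Jsoup.parse("<html><title>demo</title></html>"));

            // An empty LinkTexts, as the unit tests use, simply contributes no anchor-text positions
            var keywords = extractor.extractKeywords(dld, new LinkTexts(), new EdgeUrl("https://www.example.com/"));
            System.out.println(keywords.size());
        }
    }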
diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/NameLikeKeywords.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/extractors/NameLikeKeywords.java similarity index 81% rename from code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/NameLikeKeywords.java rename to code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/extractors/NameLikeKeywords.java index c033bdc1..3e5c67fe 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/NameLikeKeywords.java +++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/extractors/NameLikeKeywords.java @@ -1,13 +1,12 @@ package nu.marginalia.keyword.extractors; -import com.google.common.base.CharMatcher; import it.unimi.dsi.fastutil.objects.Object2IntMap; import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; +import nu.marginalia.keyword.KeywordExtractor; import nu.marginalia.keyword.WordReps; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.DocumentSentence; import nu.marginalia.language.model.WordRep; -import nu.marginalia.keyword.KeywordExtractor; import java.util.*; import java.util.stream.Collectors; @@ -18,16 +17,13 @@ public class NameLikeKeywords implements WordReps { private final Set<String> stemmed; public NameLikeKeywords(KeywordExtractor keywordExtractor, DocumentLanguageData dld, int minCount) { - Object2IntOpenHashMap<String> counts = new Object2IntOpenHashMap<>(1000); - HashMap<String, HashSet<WordRep>> instances = new HashMap<>(1000); + var counts = new Object2IntOpenHashMap<String>(100); + var instances = new HashMap<String, HashSet<WordRep>>(100); - final var isUpperCase = CharMatcher.forPredicate(Character::isUpperCase); - - for (int i = 0; i < dld.sentences.length; i++) { - DocumentSentence sent = dld.sentences[i]; + for (DocumentSentence sent : dld) { var keywords = keywordExtractor.getProperNames(sent); for (var span : keywords) { - if (span.size() <= 1 && isUpperCase.matchesAllOf(sent.words[span.start])) + if (span.size() <= 1 && sent.isAllCaps(span.start)) continue; var stemmed = sent.constructStemmedWordFromSpan(span); diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/SubjectLikeKeywords.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/extractors/SubjectLikeKeywords.java similarity index 93% rename from code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/SubjectLikeKeywords.java rename to code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/extractors/SubjectLikeKeywords.java index d4a6e428..1d88b5c1 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/SubjectLikeKeywords.java +++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/extractors/SubjectLikeKeywords.java @@ -6,7 +6,6 @@ import nu.marginalia.keyword.WordReps; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.WordRep; import nu.marginalia.language.model.WordSpan; -import nu.marginalia.language.model.WordSeparator; import org.apache.commons.lang3.StringUtils; import java.util.*; @@ -30,14 +29,13 @@ public class SubjectLikeKeywords implements WordReps { Map<String, Set<WordRep>> instances = new HashMap<>(); - for (var sentence : dld.sentences) { + for (var sentence : dld) { for (WordSpan kw : keywordExtractor.getNouns(sentence)) { if (kw.end + 2 >= sentence.length()) { continue; } - if (sentence.separators[kw.end] == WordSeparator.COMMA - || sentence.separators[kw.end
+ 1] == WordSeparator.COMMA) + if (sentence.isSeparatorComma(kw.end) || sentence.isSeparatorComma(kw.end + 1)) continue; String nextTag = sentence.posTags[kw.end]; diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/TitleKeywords.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/extractors/TitleKeywords.java similarity index 86% rename from code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/TitleKeywords.java rename to code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/extractors/TitleKeywords.java index e1c7eceb..846225c2 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/TitleKeywords.java +++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/extractors/TitleKeywords.java @@ -1,11 +1,11 @@ package nu.marginalia.keyword.extractors; -import nu.marginalia.keyword.WordReps; import nu.marginalia.keyword.KeywordExtractor; +import nu.marginalia.keyword.WordReps; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.WordRep; +import nu.marginalia.language.sentence.tag.HtmlTag; -import java.util.Arrays; import java.util.Collection; import java.util.Set; import java.util.stream.Collectors; @@ -16,7 +16,8 @@ public class TitleKeywords implements WordReps { private final Set stemmed; public TitleKeywords(KeywordExtractor keywordExtractor, DocumentLanguageData documentLanguageData) { - titleKeywords = Arrays.stream(documentLanguageData.titleSentences).flatMap(sent -> + titleKeywords = documentLanguageData.findSentencesForTag(HtmlTag.TITLE).stream() + .flatMap(sent -> keywordExtractor.getWordsFromSentence(sent).stream().sorted().distinct().map(w -> new WordRep(sent, w))) .limit(100) .collect(Collectors.toSet()); diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/UrlKeywords.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/extractors/UrlKeywords.java similarity index 100% rename from code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/UrlKeywords.java rename to code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/extractors/UrlKeywords.java diff --git a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/WordsTfIdfCounts.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/extractors/WordsTfIdfCounts.java similarity index 95% rename from code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/WordsTfIdfCounts.java rename to code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/extractors/WordsTfIdfCounts.java index 8904e16e..62ae5f6a 100644 --- a/code/features-convert/keyword-extraction/java/nu/marginalia/keyword/extractors/WordsTfIdfCounts.java +++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/extractors/WordsTfIdfCounts.java @@ -1,14 +1,17 @@ package nu.marginalia.keyword.extractors; import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap; +import nu.marginalia.keyword.KeywordExtractor; import nu.marginalia.keyword.WordReps; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.model.WordRep; -import nu.marginalia.keyword.KeywordExtractor; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import 
org.apache.commons.lang3.StringUtils; -import java.util.*; +import java.util.Collection; +import java.util.Comparator; +import java.util.HashSet; +import java.util.Set; import static java.lang.Math.max; @@ -46,7 +49,7 @@ public class WordsTfIdfCounts implements WordReps, Comparator<WordRep> { // Collect words with a high TF-IDF so that they can be marked with a bit flag tfIdfHigh = new HashSet<>(100); - for (var sent : dld.sentences) { + for (var sent : dld) { var keywords = keywordExtractor.getKeywordsFromSentence(sent); for (var span : keywords) { if (highTfIdfInstances.contains(sent.constructStemmedWordFromSpan(span))) { @@ -61,7 +64,7 @@ public class WordsTfIdfCounts implements WordReps, Comparator<WordRep> { Object2IntOpenHashMap<String> counts = new Object2IntOpenHashMap<>(10_000, 0.7f); counts.defaultReturnValue(0); - for (var sent : dld.sentences) { + for (var sent : dld) { var keywords = keywordExtractor.getKeywordsFromSentence(sent); for (var span : keywords) { counts.addTo(sent.constructStemmedWordFromSpan(span), 1); diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywords.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywords.java new file mode 100644 index 00000000..7beede50 --- /dev/null +++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywords.java @@ -0,0 +1,38 @@ +package nu.marginalia.keyword.model; + +import nu.marginalia.model.idx.CodedWordSpan; +import nu.marginalia.sequence.VarintCodedSequence; + +import java.util.List; + +public final class DocumentKeywords { + + public final List<String> keywords; + public final byte[] metadata; + public final List<VarintCodedSequence> positions; + public final List<CodedWordSpan> spans; + + public DocumentKeywords(List<String> keywords, + byte[] metadata, + List<VarintCodedSequence> positions, + List<CodedWordSpan> spans) + { + this.keywords = keywords; + this.metadata = metadata; + this.positions = positions; + this.spans = spans; + + assert keywords.size() == metadata.length; + } + + public boolean isEmpty() { + return keywords.isEmpty(); + } + + public int size() { + return keywords.size(); + } + +} + +
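DocumentKeywords is deliberately a struct-of-arrays: entry i of keywords, metadata and positions all describe the same term, which the constructor's assert partially checks. A minimal consumer sketch, patterned on the madonna test further down; the helper class itself is hypothetical:

    package nu.marginalia.keyword.model; // illustration only

    class DocumentKeywordsDump {
        static void dump(DocumentKeywords built) {
            for (int i = 0; i < built.size(); i++) {
                String word = built.keywords.get(i);      // the term
                byte flags = built.metadata[i];           // WordFlags bit set, cf. WordFlags.decode in the tests
                var positions = built.positions.get(i);   // VarintCodedSequence of term positions
                System.out.println(word + " " + flags + " " + positions);
            }
        }
    }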
diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java new file mode 100644 index 00000000..bae5ac7c --- /dev/null +++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java @@ -0,0 +1,164 @@ +package nu.marginalia.keyword.model; + +import gnu.trove.list.array.TByteArrayList; +import it.unimi.dsi.fastutil.ints.IntArrayList; +import it.unimi.dsi.fastutil.ints.IntList; +import it.unimi.dsi.fastutil.objects.Object2ByteOpenHashMap; +import lombok.Getter; +import nu.marginalia.language.sentence.tag.HtmlTag; +import nu.marginalia.model.idx.CodedWordSpan; +import nu.marginalia.model.idx.WordFlags; +import nu.marginalia.sequence.VarintCodedSequence; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.nio.ByteBuffer; +import java.util.*; + +@Getter +public class DocumentKeywordsBuilder { + public final Object2ByteOpenHashMap<String> wordToMeta; + public final HashMap<String, IntList> wordToPos; + public final Map<Character, List<DocumentWordSpan>> wordSpans = new HashMap<>(); + + /** These are keywords that had signals of high relevance */ + public final Set<String> importantWords = new HashSet<>(); + + // |------64 letters is this long-------------------------------| + // granted, some of these words are word n-grams, but 64 ought to + // be plenty. The lexicon writer has another limit that's higher. + private final int MAX_WORD_LENGTH = 64; + private final int MAX_POSITIONS_PER_WORD = 512; + + private static final Logger logger = LoggerFactory.getLogger(DocumentKeywordsBuilder.class); + + public DocumentKeywordsBuilder() { + this(1600); + } + + public DocumentKeywords build(ByteBuffer workArea) { + final List<String> wordArray = new ArrayList<>(wordToMeta.size()); + final TByteArrayList meta = new TByteArrayList(wordToMeta.size()); + final List<VarintCodedSequence> positions = new ArrayList<>(wordToMeta.size()); + + var iter = wordToMeta.object2ByteEntrySet().fastIterator(); + + while (iter.hasNext()) { + var entry = iter.next(); + + meta.add(entry.getByteValue()); + wordArray.add(entry.getKey()); + + IntList posList = wordToPos.getOrDefault(entry.getKey(), IntList.of()); + + if (posList.size() > MAX_POSITIONS_PER_WORD) { + posList.subList(MAX_POSITIONS_PER_WORD, posList.size()).clear(); + } + + positions.add(VarintCodedSequence.generate(posList)); + } + + // Encode spans + List<CodedWordSpan> spans = new ArrayList<>(wordSpans.size()); + + wordSpans.forEach((tag, spansForTag) -> { + spansForTag.sort(Comparator.comparingInt(DocumentWordSpan::start)); + + var positionsForTag = new IntArrayList(spansForTag.size()*2); + for (var span : spansForTag) { + positionsForTag.add(span.start()); + positionsForTag.add(span.end()); + } + + spans.add(new CodedWordSpan((byte) tag.charValue(), VarintCodedSequence.generate(positionsForTag))); + }); + + return new DocumentKeywords(wordArray, meta.toArray(), positions, spans); + } + + public DocumentKeywordsBuilder(int capacity) { + wordToMeta = new Object2ByteOpenHashMap<>(capacity); + wordToPos = new HashMap<>(capacity); + } + + public void addMeta(String word, byte meta) { + if (word.length() > MAX_WORD_LENGTH) + return; + + wordToMeta.put(word, meta); + } + + public void addPos(String word, int pos) { + if (word.length() > MAX_WORD_LENGTH) + return; + + wordToPos.computeIfAbsent(word, k -> new IntArrayList()).add(pos); + } + + public void addImportantWords(Collection<String> words) { + importantWords.addAll(words); + } + + public void setFlagOnMetadataForWords(WordFlags flag, Collection<String> flagWords) { + flagWords.forEach(word -> + wordToMeta.mergeByte(word, flag.asBit(), (a, b) -> (byte)(a|b)) + ); + } + + public void addAllSyntheticTerms(Collection<String> newWords) { + byte meta = WordFlags.Synthetic.asBit(); + + // Only add the synthetic flag if the words aren't already present + + newWords.forEach(word -> wordToMeta.putIfAbsent(word, meta)); + } + + public List<String> getWordsWithAnyFlag(long flags) { + List<String> ret = new ArrayList<>(); + + for (var iter = wordToMeta.object2ByteEntrySet().fastIterator(); iter.hasNext();) { + var entry = iter.next(); + if ((flags & entry.getByteValue()) != 0) { + ret.add(entry.getKey()); + } + } + + return ret; + } + + public void addSpans(List<DocumentWordSpan> newSpans) { + for (var span : newSpans) { + wordSpans.computeIfAbsent((char) span.tag().code, k -> new ArrayList<>()).add(span); + } + } + + public int size() { + return Math.max(wordToMeta.size(), wordToPos.size()); + } + + + @Override + public String toString() { + StringBuilder sb = new StringBuilder("[ "); + + wordToMeta.forEach((word, meta) -> { + sb.append(word) + .append("->") + .append(WordFlags.decode(meta)) + .append(',') + .append(wordToPos.getOrDefault(word, new IntArrayList())) + .append(' '); + }); + + wordSpans.forEach((tag, spans) -> { + sb.append(tag)
.append("->") + .append(spans) + .append(' '); + }); + return sb.append(']').toString(); + } + + public record DocumentWordSpan(HtmlTag tag, int start, int end) { + } +} diff --git a/code/features-convert/keyword-extraction/readme.md b/code/processes/converting-process/ft-keyword-extraction/readme.md similarity index 82% rename from code/features-convert/keyword-extraction/readme.md rename to code/processes/converting-process/ft-keyword-extraction/readme.md index a9c04962..f961d3d0 100644 --- a/code/features-convert/keyword-extraction/readme.md +++ b/code/processes/converting-process/ft-keyword-extraction/readme.md @@ -11,4 +11,4 @@ functions based on [POS tags](https://www.ling.upenn.edu/courses/Fall_2003/ling0 ## See Also -* [libraries/language-processing](../../libraries/language-processing) does a lot of the heavy lifting. \ No newline at end of file +* [libraries/language-processing](../../../libraries/language-processing) does a lot of the heavy lifting. \ No newline at end of file diff --git a/code/features-convert/keyword-extraction/test-resources/test-data/java.html b/code/processes/converting-process/ft-keyword-extraction/test-resources/test-data/java.html similarity index 100% rename from code/features-convert/keyword-extraction/test-resources/test-data/java.html rename to code/processes/converting-process/ft-keyword-extraction/test-resources/test-data/java.html diff --git a/code/features-convert/keyword-extraction/test-resources/test-data/keyboards.html b/code/processes/converting-process/ft-keyword-extraction/test-resources/test-data/keyboards.html similarity index 100% rename from code/features-convert/keyword-extraction/test-resources/test-data/keyboards.html rename to code/processes/converting-process/ft-keyword-extraction/test-resources/test-data/keyboards.html diff --git a/code/features-convert/keyword-extraction/test-resources/test-data/madonna.html b/code/processes/converting-process/ft-keyword-extraction/test-resources/test-data/madonna.html similarity index 100% rename from code/features-convert/keyword-extraction/test-resources/test-data/madonna.html rename to code/processes/converting-process/ft-keyword-extraction/test-resources/test-data/madonna.html diff --git a/code/features-convert/keyword-extraction/test-resources/test-data/spam.html b/code/processes/converting-process/ft-keyword-extraction/test-resources/test-data/spam.html similarity index 100% rename from code/features-convert/keyword-extraction/test-resources/test-data/spam.html rename to code/processes/converting-process/ft-keyword-extraction/test-resources/test-data/spam.html diff --git a/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java b/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java new file mode 100644 index 00000000..83996e41 --- /dev/null +++ b/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/DocumentKeywordExtractorTest.java @@ -0,0 +1,112 @@ +package nu.marginalia.keyword; + +import nu.marginalia.WmsaHome; +import nu.marginalia.converting.processor.logic.dom.DomPruningFilter; +import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.idx.WordFlags; +import nu.marginalia.sequence.CodedSequence; +import nu.marginalia.term_frequency_dict.TermFrequencyDict; +import org.jsoup.Jsoup; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import 
java.io.IOException; +import java.net.URISyntaxException; +import java.nio.ByteBuffer; +import java.nio.charset.Charset; +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; +import java.util.Set; + +class DocumentKeywordExtractorTest { + + static DocumentKeywordExtractor extractor = new DocumentKeywordExtractor(); + static SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); + + @Test + public void testWordPattern() { + Assertions.assertTrue(extractor.matchesWordPattern("test")); + Assertions.assertTrue(extractor.matchesWordPattern("1234567890abcde")); + Assertions.assertFalse(extractor.matchesWordPattern("1234567890abcdef")); + + Assertions.assertTrue(extractor.matchesWordPattern("test-test-test-test-test")); + Assertions.assertFalse(extractor.matchesWordPattern("test-test-test-test-test-test")); + Assertions.assertTrue(extractor.matchesWordPattern("192.168.1.100/24")); + Assertions.assertTrue(extractor.matchesWordPattern("std::vector")); + Assertions.assertTrue(extractor.matchesWordPattern("c++")); + Assertions.assertTrue(extractor.matchesWordPattern("m*a*s*h")); + Assertions.assertFalse(extractor.matchesWordPattern("Stulpnagelstrasse")); + } + + @Test + public void testKeyboards2() throws IOException, URISyntaxException { + var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/keyboards.html"), + "Could not load word frequency table"); + String html = new String(resource.readAllBytes(), Charset.defaultCharset()); + var doc = Jsoup.parse(html); + doc.filter(new DomPruningFilter(0.5)); + + var keywords = extractor.extractKeywords(se.extractSentences(doc), new LinkTexts(), new EdgeUrl("https://pmortensen.eu/world2/2021/12/24/rapoo-mechanical-keyboards-gotchas-and-setup/")); + + keywords.getWordToMeta().forEach((k, v) -> { + if (k.contains("_")) { + System.out.println(k + " " + WordFlags.decode(v)); + } + }); + } + + + @Test + public void testMadonna() throws IOException, URISyntaxException { + var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/madonna.html"), + "Could not load word frequency table"); + String html = new String(resource.readAllBytes(), Charset.defaultCharset()); + var doc = Jsoup.parse(html); + doc.filter(new DomPruningFilter(0.5)); + + var keywords = extractor.extractKeywords( + se.extractSentences(doc), + new LinkTexts(), new EdgeUrl("https://encyclopedia.marginalia.nu/article/Don't_Tell_Me_(Madonna_song)") + ); + + var keywordsBuilt = keywords.build(ByteBuffer.allocate(1024)); + + Map flags = new HashMap<>(); + Map positions = new HashMap<>(); + + for (int i = 0; i < keywordsBuilt.size(); i++) { + String keyword = keywordsBuilt.keywords.get(i); + byte metadata = keywordsBuilt.metadata[i] + ; + + if (Set.of("dirty", "blues").contains(keyword)) { + flags.put(keyword, metadata); + positions.put(keyword, keywordsBuilt.positions.get(i)); + + } + } + + Assertions.assertTrue(flags.containsKey("dirty")); + Assertions.assertTrue(flags.containsKey("blues")); + Assertions.assertNotEquals( + positions.get("dirty"), + positions.get("blues") + ); + } + + @Test + public void testSpam() throws IOException, URISyntaxException { + var resource = Objects.requireNonNull(ClassLoader.getSystemResourceAsStream("test-data/spam.html"), + "Could not load word frequency table"); + String html = new String(resource.readAllBytes(), Charset.defaultCharset()); + var doc = Jsoup.parse(html); + doc.filter(new DomPruningFilter(0.5)); + + DocumentKeywordExtractor extractor = new 
DocumentKeywordExtractor( + new TermFrequencyDict(WmsaHome.getLanguageModels())); + SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels()); + + } +} \ No newline at end of file diff --git a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java b/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java similarity index 89% rename from code/features-convert/keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java rename to code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java index bfc78a9c..4efa274d 100644 --- a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java +++ b/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/SentenceExtractorTest.java @@ -2,11 +2,11 @@ package nu.marginalia.keyword; import lombok.SneakyThrows; import nu.marginalia.LanguageModels; -import nu.marginalia.language.sentence.SentenceExtractor; -import nu.marginalia.segmentation.NgramLexicon; -import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.WmsaHome; +import nu.marginalia.language.sentence.SentenceExtractor; +import nu.marginalia.language.sentence.tag.HtmlTag; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.test.util.TestLanguageModels; import org.jsoup.Jsoup; import org.junit.jupiter.api.Tag; @@ -23,7 +23,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals; class SentenceExtractorTest { static final LanguageModels lm = TestLanguageModels.getLanguageModels(); - static NgramLexicon ngramLexicon = new NgramLexicon(lm); static SentenceExtractor se = new SentenceExtractor(lm); @SneakyThrows @@ -36,7 +35,7 @@ class SentenceExtractorTest { var dict = new TermFrequencyDict(lm); var url = new EdgeUrl("https://memex.marginalia.nu/"); - DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict, ngramLexicon); + DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(dict); for (;;) { long total = 0; @@ -44,7 +43,7 @@ class SentenceExtractorTest { var doc = Jsoup.parse(Files.readString(file.toPath())); long start = System.currentTimeMillis(); var dld = se.extractSentences(doc); - documentKeywordExtractor.extractKeywords(dld, url); + documentKeywordExtractor.extractKeywords(dld, new LinkTexts(), url); total += (System.currentTimeMillis() - start); } System.out.println(total); @@ -60,8 +59,8 @@ class SentenceExtractorTest { @Test public void testACDC() { - var ret = se.extractSentence("AC/DC is a rock band."); - assertEquals("AC/DC", ret.words[0]); + var ret = se.extractSentence("AC/DC is a rock band.", EnumSet.noneOf(HtmlTag.class)); + assertEquals("ac/dc", ret.wordsLowerCase[0]); } final Pattern p = Pattern.compile("([, ]+)"); diff --git a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/extractors/ArtifactKeywordsTest.java b/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/extractors/ArtifactKeywordsTest.java similarity index 100% rename from code/features-convert/keyword-extraction/test/nu/marginalia/keyword/extractors/ArtifactKeywordsTest.java rename to code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/extractors/ArtifactKeywordsTest.java diff --git 
a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/extractors/NameLikeKeywordsTest.java b/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/extractors/NameLikeKeywordsTest.java similarity index 100% rename from code/features-convert/keyword-extraction/test/nu/marginalia/keyword/extractors/NameLikeKeywordsTest.java rename to code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/extractors/NameLikeKeywordsTest.java diff --git a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/extractors/SubjectLikeKeywordsTest.java b/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/extractors/SubjectLikeKeywordsTest.java similarity index 100% rename from code/features-convert/keyword-extraction/test/nu/marginalia/keyword/extractors/SubjectLikeKeywordsTest.java rename to code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/extractors/SubjectLikeKeywordsTest.java diff --git a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/extractors/TitleKeywordsTest.java b/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/extractors/TitleKeywordsTest.java similarity index 98% rename from code/features-convert/keyword-extraction/test/nu/marginalia/keyword/extractors/TitleKeywordsTest.java rename to code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/extractors/TitleKeywordsTest.java index cac29c73..49a555de 100644 --- a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/extractors/TitleKeywordsTest.java +++ b/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/extractors/TitleKeywordsTest.java @@ -190,7 +190,9 @@ class TitleKeywordsTest { public void extractTitleWords() { var se = new SentenceExtractor(TestLanguageModels.getLanguageModels()); - var reps = new TitleKeywords(new KeywordExtractor(), se.extractSentences(Jsoup.parse(document))).getReps(); + var dld = se.extractSentences(Jsoup.parse(document)); + + var reps = new TitleKeywords(new KeywordExtractor(), dld).getReps(); var words = reps.stream().map(rep -> rep.word).collect(Collectors.toSet()); Set expected = Set.of( diff --git a/code/features-convert/keyword-extraction/test/nu/marginalia/keyword/extractors/UrlKeywordsTest.java b/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/extractors/UrlKeywordsTest.java similarity index 100% rename from code/features-convert/keyword-extraction/test/nu/marginalia/keyword/extractors/UrlKeywordsTest.java rename to code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/keyword/extractors/UrlKeywordsTest.java diff --git a/code/features-convert/keyword-extraction/test/nu/marginalia/test/util/TestLanguageModels.java b/code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/test/util/TestLanguageModels.java similarity index 100% rename from code/features-convert/keyword-extraction/test/nu/marginalia/test/util/TestLanguageModels.java rename to code/processes/converting-process/ft-keyword-extraction/test/nu/marginalia/test/util/TestLanguageModels.java diff --git a/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java b/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java index 16d9bc40..c45967bc 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java +++ 
b/code/processes/converting-process/java/nu/marginalia/converting/ConverterMain.java @@ -14,8 +14,8 @@ import nu.marginalia.converting.sideload.SideloadSourceFactory; import nu.marginalia.converting.writer.ConverterBatchWritableIf; import nu.marginalia.converting.writer.ConverterBatchWriter; import nu.marginalia.converting.writer.ConverterWriter; -import nu.marginalia.crawling.io.CrawledDomainReader; -import nu.marginalia.crawling.io.SerializableCrawlDataStream; +import nu.marginalia.io.crawldata.CrawledDomainReader; +import nu.marginalia.io.crawldata.SerializableCrawlDataStream; import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.inbox.MqInboxResponse; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/model/DisqualifiedException.java b/code/processes/converting-process/java/nu/marginalia/converting/model/DisqualifiedException.java index 11c329eb..34c6836d 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/model/DisqualifiedException.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/model/DisqualifiedException.java @@ -1,6 +1,6 @@ package nu.marginalia.converting.model; -import nu.marginalia.crawling.model.CrawlerDocumentStatus; +import nu.marginalia.model.crawldata.CrawlerDocumentStatus; public class DisqualifiedException extends Exception { public final DisqualificationReason reason; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/AcceptableAds.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/AcceptableAds.java index d097c60a..f75c35ad 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/AcceptableAds.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/AcceptableAds.java @@ -1,6 +1,6 @@ package nu.marginalia.converting.processor; -import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.model.crawldata.CrawledDocument; import org.jsoup.nodes.Document; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentDecorator.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentDecorator.java index 02e22f4f..2a4fbcb1 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentDecorator.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentDecorator.java @@ -1,7 +1,5 @@ package nu.marginalia.converting.processor; -import nu.marginalia.atags.AnchorTextKeywords; -import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.converting.model.ProcessedDocument; import java.util.HashSet; @@ -9,23 +7,20 @@ import java.util.Set; public class DocumentDecorator { private final Set extraSearchTerms = new HashSet<>(); - private final AnchorTextKeywords keywords; - public DocumentDecorator(AnchorTextKeywords keywords) { - this.keywords = keywords; + public DocumentDecorator() { } public void addTerm(String term) { extraSearchTerms.add(term); } - public void apply(ProcessedDocument doc, DomainLinks externalDomainLinks) { + public void apply(ProcessedDocument doc) { if (doc == null) return; if (doc.words == null) return; doc.words.addAllSyntheticTerms(extraSearchTerms); - doc.words.addAnchorTerms(keywords.getAnchorTextKeywords(externalDomainLinks, doc.url)); } } diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentProcessor.java 
b/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentProcessor.java index 96392920..36eae72a 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentProcessor.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/DocumentProcessor.java @@ -1,23 +1,28 @@ package nu.marginalia.converting.processor; import com.google.inject.Inject; +import nu.marginalia.atags.AnchorTextKeywords; import nu.marginalia.atags.model.DomainLinks; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawlerDocumentStatus; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.crawl.HtmlFeature; -import nu.marginalia.model.crawl.UrlIndexingState; import nu.marginalia.converting.model.DisqualifiedException; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.processor.plugin.AbstractDocumentProcessorPlugin; import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin; import nu.marginalia.converting.processor.plugin.PlainTextDocumentProcessorPlugin; +import nu.marginalia.keyword.LinkTexts; +import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.HtmlFeature; +import nu.marginalia.model.crawl.UrlIndexingState; +import nu.marginalia.model.crawldata.CrawledDocument; +import nu.marginalia.model.crawldata.CrawlerDocumentStatus; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.net.URISyntaxException; -import java.util.*; +import java.util.ArrayList; +import java.util.List; +import java.util.Objects; +import java.util.Set; public class DocumentProcessor { @@ -30,11 +35,14 @@ public class DocumentProcessor { private final List processorPlugins = new ArrayList<>(); + private final AnchorTextKeywords anchorTextKeywords; @Inject public DocumentProcessor(HtmlDocumentProcessorPlugin htmlDocumentProcessorPlugin, - PlainTextDocumentProcessorPlugin plainTextDocumentProcessorPlugin) + PlainTextDocumentProcessorPlugin plainTextDocumentProcessorPlugin, + AnchorTextKeywords anchorTextKeywords) { + this.anchorTextKeywords = anchorTextKeywords; processorPlugins.add(htmlDocumentProcessorPlugin); processorPlugins.add(plainTextDocumentProcessorPlugin); @@ -78,7 +86,12 @@ public class DocumentProcessor { return ret; } - private void processDocument(CrawledDocument crawledDocument, DocumentClass documentClass, DocumentDecorator documentDecorator, DomainLinks externalDomainLinks, ProcessedDocument ret) throws URISyntaxException, DisqualifiedException { + private void processDocument(CrawledDocument crawledDocument, + DocumentClass documentClass, + DocumentDecorator documentDecorator, + DomainLinks externalDomainLinks, + ProcessedDocument ret) throws URISyntaxException, DisqualifiedException + { var crawlerStatus = CrawlerDocumentStatus.valueOf(crawledDocument.crawlerStatus); if (crawlerStatus != CrawlerDocumentStatus.OK) { @@ -97,12 +110,15 @@ public class DocumentProcessor { final var plugin = findPlugin(crawledDocument); - AbstractDocumentProcessorPlugin.DetailsWithWords detailsWithWords = plugin.createDetails(crawledDocument, documentClass); + EdgeUrl url = new EdgeUrl(crawledDocument.url); + LinkTexts linkTexts = anchorTextKeywords.getAnchorTextKeywords(externalDomainLinks, url); + + AbstractDocumentProcessorPlugin.DetailsWithWords detailsWithWords = plugin.createDetails(crawledDocument, linkTexts, documentClass); ret.details = detailsWithWords.details(); ret.words = 
detailsWithWords.words(); - documentDecorator.apply(ret, externalDomainLinks); + documentDecorator.apply(ret); if (Boolean.TRUE.equals(crawledDocument.hasCookies) && ret.details != null diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java index 7ec0bf29..0328709c 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/DomainProcessor.java @@ -7,19 +7,21 @@ import nu.marginalia.atags.model.DomainLinks; import nu.marginalia.atags.source.AnchorTagsSource; import nu.marginalia.atags.source.AnchorTagsSourceFactory; import nu.marginalia.converting.model.ProcessedDocument; +import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator; import nu.marginalia.converting.processor.logic.links.LinkGraph; +import nu.marginalia.converting.processor.logic.links.TopKeywords; import nu.marginalia.converting.sideload.SideloadSource; import nu.marginalia.converting.writer.ConverterBatchWritableIf; import nu.marginalia.converting.writer.ConverterBatchWriter; -import nu.marginalia.crawling.io.SerializableCrawlDataStream; -import nu.marginalia.crawling.model.*; import nu.marginalia.geoip.GeoIpDictionary; import nu.marginalia.geoip.sources.AsnTable; -import nu.marginalia.model.crawl.DomainIndexingState; -import nu.marginalia.converting.model.ProcessedDomain; +import nu.marginalia.io.crawldata.SerializableCrawlDataStream; import nu.marginalia.model.EdgeDomain; -import nu.marginalia.converting.processor.logic.links.TopKeywords; -import nu.marginalia.converting.processor.logic.LshDocumentDeduplicator; +import nu.marginalia.model.crawl.DomainIndexingState; +import nu.marginalia.model.crawldata.CrawledDocument; +import nu.marginalia.model.crawldata.CrawledDomain; +import nu.marginalia.model.crawldata.CrawlerDomainStatus; import nu.marginalia.util.ProcessingIterator; import org.apache.commons.lang3.StringUtils; import org.jetbrains.annotations.Nullable; @@ -102,7 +104,7 @@ public class DomainProcessor { domain = new ProcessedDomain(); domain.sizeloadSizeAdvice = sizeHint == 0 ? 
10_000 : sizeHint; - documentDecorator = new DocumentDecorator(anchorTextKeywords); + documentDecorator = new DocumentDecorator(); processDomain(crawledDomain, domain, documentDecorator); @@ -177,7 +179,7 @@ } DomainLinks externalDomainLinks = anchorTagsSource.getAnchorTags(crawledDomain.getDomain()); - DocumentDecorator documentDecorator = new DocumentDecorator(anchorTextKeywords); + DocumentDecorator documentDecorator = new DocumentDecorator(); // Process Domain Record diff --git a/code/features-convert/adblock/java/nu/marginalia/adblock/AdblockSimulator.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/classifier/adblock/AdblockSimulator.java similarity index 98% rename from code/features-convert/adblock/java/nu/marginalia/adblock/AdblockSimulator.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/classifier/adblock/AdblockSimulator.java index 1908fda3..74eecdd0 100644 --- a/code/features-convert/adblock/java/nu/marginalia/adblock/AdblockSimulator.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/classifier/adblock/AdblockSimulator.java @@ -1,4 +1,4 @@ -package nu.marginalia.adblock; +package nu.marginalia.converting.processor.classifier.adblock; import com.google.inject.Inject; import com.google.inject.Singleton; diff --git a/code/features-convert/adblock/java/nu/marginalia/adblock/GoogleAnwersSpamDetector.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/classifier/adblock/GoogleAnwersSpamDetector.java similarity index 93% rename from code/features-convert/adblock/java/nu/marginalia/adblock/GoogleAnwersSpamDetector.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/classifier/adblock/GoogleAnwersSpamDetector.java index 4cec3700..8c554c15 100644 --- a/code/features-convert/adblock/java/nu/marginalia/adblock/GoogleAnwersSpamDetector.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/classifier/adblock/GoogleAnwersSpamDetector.java @@ -1,4 +1,4 @@ -package nu.marginalia.adblock; +package nu.marginalia.converting.processor.classifier.adblock; import org.jsoup.nodes.Document; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/classifier/topic/AdHocDetector.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/classifier/topic/AdHocDetector.java new file mode 100644 index 00000000..2e52c865 --- /dev/null +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/classifier/topic/AdHocDetector.java @@ -0,0 +1,53 @@ +package nu.marginalia.converting.processor.classifier.topic; + +import ca.rmen.porterstemmer.PorterStemmer; +import nu.marginalia.language.model.DocumentLanguageData; +import org.apache.commons.lang3.StringUtils; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static java.lang.Math.max; +import static java.lang.Math.sqrt; + +public class AdHocDetector { + private static final int AVG_LENGTH = 1000; + + private final Map<String, Double> termValues = new HashMap<>(); + + public AdHocDetector(List<String> terms) { + PorterStemmer ps = new PorterStemmer(); + + for (String term : terms) { + String[] parts = StringUtils.split(term, ' '); + termValues.put(ps.stemWord(parts[0]), Double.parseDouble(parts[1])); + } + } + + public double testP(DocumentLanguageData dld) { + + Map<String, Double> values = new HashMap<>(); + int count = 0; + for (var sentence : dld) { + + for (var stemmed : sentence.stemmedWords) { + count++; + + final Double value = termValues.get(stemmed); + + if (value != null) { + values.merge(stemmed, value, (a,b) -> 0.5*a + b); + } + } + + } + + if (count == 0) return 0.; + + double lengthPenalty = sqrt(AVG_LENGTH)/sqrt(max(AVG_LENGTH, count)); + + return values.values().stream().mapToDouble(Double::valueOf).sum() * lengthPenalty; + } + +}
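The length damping in testP above is worth spelling out: sqrt(AVG_LENGTH)/sqrt(max(AVG_LENGTH, count)) is 1.0 for documents at or below 1000 counted terms and falls off with the square root of the excess. A standalone arithmetic check (illustrative only, not part of the patch):

    class LengthPenaltyDemo {
        static double lengthPenalty(int termCount) {
            return Math.sqrt(1000.) / Math.sqrt(Math.max(1000, termCount));
        }

        public static void main(String[] args) {
            System.out.println(lengthPenalty(500));  // 1.0: short documents are not dampened
            System.out.println(lengthPenalty(4000)); // 0.5: four times the reference length halves the summed score
        }
    }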
diff --git a/code/features-convert/topic-detection/java/nu/marginalia/topic/RecipeDetector.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/classifier/topic/RecipeDetector.java similarity index 98% rename from code/features-convert/topic-detection/java/nu/marginalia/topic/RecipeDetector.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/classifier/topic/RecipeDetector.java index 2a71d27a..83a3a246 100644 --- a/code/features-convert/topic-detection/java/nu/marginalia/topic/RecipeDetector.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/classifier/topic/RecipeDetector.java @@ -1,4 +1,4 @@ -package nu.marginalia.topic; +package nu.marginalia.converting.processor.classifier.topic; import ca.rmen.porterstemmer.PorterStemmer; import com.google.inject.Inject; @@ -211,12 +211,11 @@ public class RecipeDetector { Map<String, Double> values = new HashMap<>(); int count = 0; - for (var sentence : dld.sentences) { + for (var sentence : dld) { - for (var word : sentence) { + for (var stemmed : sentence.stemmedWords) { count++; - final String stemmed = word.stemmed(); final Double value = termValues.get(stemmed); if (value != null) { diff --git a/code/features-convert/topic-detection/java/nu/marginalia/topic/TextileCraftDetector.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/classifier/topic/TextileCraftDetector.java similarity index 97% rename from code/features-convert/topic-detection/java/nu/marginalia/topic/TextileCraftDetector.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/classifier/topic/TextileCraftDetector.java index 64ccaf2e..4aa339d2 100644 --- a/code/features-convert/topic-detection/java/nu/marginalia/topic/TextileCraftDetector.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/classifier/topic/TextileCraftDetector.java @@ -1,4 +1,4 @@ -package nu.marginalia.topic; +package nu.marginalia.converting.processor.classifier.topic; import ca.rmen.porterstemmer.PorterStemmer; import com.google.inject.Inject; @@ -135,12 +135,11 @@ public class TextileCraftDetector { Map<String, Double> values = new HashMap<>(); int count = 0; - for (var sentence : dld.sentences) { + for (var sentence : dld) { - for (var word : sentence) { + for (var stemmed : sentence.stemmedWords) { count++; - final String stemmed = word.stemmed(); final Double value = termValues.get(stemmed); if (value != null) { diff --git a/code/features-convert/topic-detection/java/nu/marginalia/topic/WoodworkingDetector.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/classifier/topic/WoodworkingDetector.java similarity index 97% rename from code/features-convert/topic-detection/java/nu/marginalia/topic/WoodworkingDetector.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/classifier/topic/WoodworkingDetector.java index 32e362d2..60811d15 100644 --- a/code/features-convert/topic-detection/java/nu/marginalia/topic/WoodworkingDetector.java +++
b/code/processes/converting-process/java/nu/marginalia/converting/processor/classifier/topic/WoodworkingDetector.java @@ -1,4 +1,4 @@ -package nu.marginalia.topic; +package nu.marginalia.converting.processor.classifier.topic; import ca.rmen.porterstemmer.PorterStemmer; import com.google.inject.Inject; @@ -111,12 +111,11 @@ public class WoodworkingDetector { Map<String, Double> values = new HashMap<>(); int count = 0; - for (var sentence : dld.sentences) { + for (var sentence : dld) { - for (var word : sentence) { + for (var stemmed : sentence.stemmedWords) { count++; - final String stemmed = word.stemmed(); final Double value = termValues.get(stemmed); if (value != null) { diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentLengthLogic.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentLengthLogic.java index aae0b24f..856e3407 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentLengthLogic.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentLengthLogic.java @@ -10,7 +10,6 @@ import nu.marginalia.language.model.DocumentLanguageData; public class DocumentLengthLogic { private final int minDocumentLength; - @Inject public DocumentLengthLogic(@Named("min-document-length") Integer minDocumentLength) { this.minDocumentLength = minDocumentLength; @@ -18,7 +17,7 @@ public class DocumentLengthLogic { public int getEncodedAverageLength(DocumentLanguageData dld) { int totalWords = dld.totalNumWords(); - int numSentences = dld.sentences.length; + int numSentences = dld.numSentences(); if (totalWords == 0 || numSentences == 0) { return 0; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentValuator.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentValuator.java index df409741..1c959dee 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentValuator.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/DocumentValuator.java @@ -1,10 +1,10 @@ package nu.marginalia.converting.processor.logic; import crawlercommons.utils.Strings; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.converting.model.DisqualifiedException; import nu.marginalia.model.crawl.HtmlFeature; +import nu.marginalia.model.crawldata.CrawledDocument; +import nu.marginalia.model.html.HtmlStandard; import org.jetbrains.annotations.NotNull; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java index c38f63f9..3f08037f 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/FeatureExtractor.java @@ -2,14 +2,14 @@ package nu.marginalia.converting.processor.logic; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.adblock.AdblockSimulator; -import nu.marginalia.adblock.GoogleAnwersSpamDetector; +import nu.marginalia.converting.processor.classifier.adblock.AdblockSimulator; +import
nu.marginalia.converting.processor.classifier.adblock.GoogleAnwersSpamDetector; +import nu.marginalia.converting.processor.classifier.topic.RecipeDetector; +import nu.marginalia.converting.processor.classifier.topic.TextileCraftDetector; +import nu.marginalia.converting.processor.classifier.topic.WoodworkingDetector; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.HtmlFeature; -import nu.marginalia.topic.RecipeDetector; -import nu.marginalia.topic.TextileCraftDetector; -import nu.marginalia.topic.WoodworkingDetector; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/TitleExtractor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/TitleExtractor.java index 920da41c..b5570b86 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/TitleExtractor.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/TitleExtractor.java @@ -39,10 +39,6 @@ public class TitleExtractor { title = getFirstTagText(doc, "h5"); if (title != null) return title; - if (dld.sentences.length > 0) { - return dld.sentences[0].originalSentence; - } - return url; } diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/dom/DomPruningFilter.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/dom/DomPruningFilter.java index 39fd3ed2..68819ecf 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/dom/DomPruningFilter.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/dom/DomPruningFilter.java @@ -18,7 +18,7 @@ public class DomPruningFilter implements NodeFilter { private final double pruneThreshold; - private final Map<Node, NodeData> data = new HashMap<>(); + private final Map<Node, NodeData> data = new HashMap<>(256); private final NodeData dummy = new NodeData(Integer.MAX_VALUE, 1, 0); public DomPruningFilter(double pruneThreshold) { @@ -50,6 +50,12 @@ public class DomPruningFilter implements NodeFilter { } } + if (node instanceof Element el) { + if (shouldAlwaysPurge(el)) { + return FilterResult.REMOVE; + } + } + data.put(node, dataForNode); if (dataForNode.depth <= 1) @@ -62,11 +68,6 @@ && dataForNode.treeSize > 3) return FilterResult.REMOVE; - if (node instanceof Element el) { - if (shouldAlwaysPurge(el)) { - return FilterResult.REMOVE; - } - } return FilterResult.CONTINUE; } @@ -98,6 +99,8 @@ return true; if ("iframe".equalsIgnoreCase(tagName)) return true; + if ("noscript".equalsIgnoreCase(tagName)) + return true; if ("footer".equalsIgnoreCase(tagName)) return true; if ("header".equalsIgnoreCase(tagName)) diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/links/TopKeywords.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/links/TopKeywords.java index 89043750..4c646dd3 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/links/TopKeywords.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/logic/links/TopKeywords.java @@ -1,8 +1,8 @@ package nu.marginalia.converting.processor.logic.links; -import
nu.marginalia.model.idx.WordFlags; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.idx.WordFlags; import java.util.*; @@ -13,7 +13,7 @@ public class TopKeywords { if (doc.details == null || doc.details.linksInternal == null) return; - List<String> topKeywords = doc.words.getWordsWithAnyFlag(WordFlags.TfIdfHigh.asBit() | WordFlags.Subjects.asBit()); + List<String> topKeywords = doc.words.getWordsWithAnyFlag(WordFlags.Subjects.asBit()); topKeywordsByUrl.put(doc.url, new HashSet<>(topKeywords)); } diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java index 59b095e7..b03468ca 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/AbstractDocumentProcessorPlugin.java @@ -1,20 +1,23 @@ package nu.marginalia.converting.processor.plugin; -import nu.marginalia.converting.processor.DocumentClass; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.language.filter.LanguageFilter; -import nu.marginalia.language.model.DocumentLanguageData; -import nu.marginalia.model.html.HtmlStandard; -import nu.marginalia.keyword.model.DocumentKeywordsBuilder; -import nu.marginalia.model.crawl.PubDate; import nu.marginalia.converting.model.DisqualifiedException; import nu.marginalia.converting.model.ProcessedDocumentDetails; -import nu.marginalia.model.crawl.HtmlFeature; +import nu.marginalia.converting.processor.DocumentClass; +import nu.marginalia.keyword.LinkTexts; +import nu.marginalia.keyword.model.DocumentKeywordsBuilder; +import nu.marginalia.language.filter.LanguageFilter; +import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.HtmlFeature; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.model.crawldata.CrawledDocument; +import nu.marginalia.model.html.HtmlStandard; import javax.annotation.Nullable; import java.net.URISyntaxException; -import java.util.*; +import java.util.HashSet; +import java.util.List; +import java.util.Set; public abstract class AbstractDocumentProcessorPlugin { protected LanguageFilter languageFilter; @@ -22,7 +25,7 @@ public abstract class AbstractDocumentProcessorPlugin { this.languageFilter = languageFilter; } - public abstract DetailsWithWords createDetails(CrawledDocument crawledDocument, DocumentClass documentClass) throws DisqualifiedException, URISyntaxException; + public abstract DetailsWithWords createDetails(CrawledDocument crawledDocument, LinkTexts linkTexts, DocumentClass documentClass) throws DisqualifiedException, URISyntaxException; public abstract boolean isApplicable(CrawledDocument doc); protected void checkDocumentLanguage(DocumentLanguageData dld) throws DisqualifiedException { diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java index 5514fee9..ccb8a383 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java +++
b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/HtmlDocumentProcessorPlugin.java @@ -2,43 +2,46 @@ package nu.marginalia.converting.processor.plugin; import com.google.inject.Inject; import com.google.inject.name.Named; -import nu.marginalia.language.filter.LanguageFilter; +import nu.marginalia.converting.model.DisqualifiedException; import nu.marginalia.converting.model.GeneratorType; +import nu.marginalia.converting.model.ProcessedDocumentDetails; import nu.marginalia.converting.processor.DocumentClass; import nu.marginalia.converting.processor.MetaRobotsTag; +import nu.marginalia.converting.processor.logic.*; import nu.marginalia.converting.processor.logic.dom.MeasureLengthVisitor; import nu.marginalia.converting.processor.logic.links.FileLinks; import nu.marginalia.converting.processor.logic.links.LinkProcessor; -import nu.marginalia.converting.processor.plugin.specialization.*; +import nu.marginalia.converting.processor.plugin.specialization.HtmlProcessorSpecializations; +import nu.marginalia.converting.processor.pubdate.PubDateSniffer; +import nu.marginalia.gregex.GuardedRegex; +import nu.marginalia.gregex.GuardedRegexFactory; +import nu.marginalia.keyword.DocumentKeywordExtractor; +import nu.marginalia.keyword.LinkTexts; +import nu.marginalia.keyword.model.DocumentKeywordsBuilder; +import nu.marginalia.language.filter.LanguageFilter; import nu.marginalia.language.model.DocumentLanguageData; import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider; import nu.marginalia.link_parser.FeedExtractor; -import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.link_parser.LinkParser; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.keyword.DocumentKeywordExtractor; -import nu.marginalia.model.html.HtmlStandard; -import nu.marginalia.model.idx.DocumentFlags; -import nu.marginalia.keyword.model.DocumentKeywordsBuilder; -import nu.marginalia.model.idx.DocumentMetadata; -import nu.marginalia.converting.processor.logic.*; -import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.gregex.GuardedRegex; -import nu.marginalia.gregex.GuardedRegexFactory; -import nu.marginalia.converting.model.DisqualifiedException; -import nu.marginalia.converting.model.ProcessedDocumentDetails; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.pubdate.PubDateSniffer; +import nu.marginalia.model.crawl.HtmlFeature; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.model.crawldata.CrawledDocument; +import nu.marginalia.model.html.HtmlStandard; +import nu.marginalia.model.idx.DocumentFlags; +import nu.marginalia.model.idx.DocumentMetadata; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.net.URISyntaxException; -import java.util.*; +import java.util.EnumSet; +import java.util.HashSet; +import java.util.Set; -import static nu.marginalia.converting.model.DisqualifiedException.*; +import static nu.marginalia.converting.model.DisqualifiedException.DisqualificationReason; public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin { @@ -101,7 +104,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin } @Override - public DetailsWithWords createDetails(CrawledDocument crawledDocument, DocumentClass documentClass) + public DetailsWithWords createDetails(CrawledDocument crawledDocument, + LinkTexts linkTexts, + DocumentClass documentClass) 
throws DisqualifiedException, URISyntaxException { String documentBody = crawledDocument.documentBody; @@ -130,8 +135,8 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin throw new DisqualifiedException(DisqualificationReason.IRRELEVANT); } - DocumentLanguageData dld = - sentenceExtractorProvider.get().extractSentences(specialization.prune(doc)); + var prunedDoc = specialization.prune(doc); + DocumentLanguageData dld = sentenceExtractorProvider.get().extractSentences(prunedDoc); checkDocumentLanguage(dld); @@ -167,9 +172,9 @@ public class HtmlDocumentProcessorPlugin extends AbstractDocumentProcessorPlugin (int) -ret.quality, // ret.quality is negative documentFlags); - DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url); + DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, linkTexts, url); - ret.description = specialization.getSummary(doc, words.importantWords); + ret.description = specialization.getSummary(prunedDoc, words.importantWords); ret.generator = generatorParts.type(); var tagWords = new MetaTagsBuilder() diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java index 787cc8a0..2007a5ed 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/PlainTextDocumentProcessorPlugin.java @@ -2,22 +2,23 @@ package nu.marginalia.converting.processor.plugin; import com.google.inject.Inject; import com.google.inject.name.Named; -import nu.marginalia.language.filter.LanguageFilter; -import nu.marginalia.converting.processor.DocumentClass; -import nu.marginalia.converting.processor.logic.DocumentLengthLogic; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.keyword.DocumentKeywordExtractor; -import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider; -import nu.marginalia.model.html.HtmlStandard; -import nu.marginalia.model.idx.DocumentFlags; -import nu.marginalia.keyword.model.DocumentKeywordsBuilder; -import nu.marginalia.model.idx.DocumentMetadata; -import nu.marginalia.model.crawl.PubDate; import nu.marginalia.converting.model.DisqualifiedException; import nu.marginalia.converting.model.ProcessedDocumentDetails; +import nu.marginalia.converting.processor.DocumentClass; +import nu.marginalia.converting.processor.logic.DocumentLengthLogic; import nu.marginalia.converting.processor.logic.PlainTextLogic; -import nu.marginalia.model.EdgeUrl; import nu.marginalia.converting.util.LineUtils; +import nu.marginalia.keyword.DocumentKeywordExtractor; +import nu.marginalia.keyword.LinkTexts; +import nu.marginalia.keyword.model.DocumentKeywordsBuilder; +import nu.marginalia.language.filter.LanguageFilter; +import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.model.crawldata.CrawledDocument; +import nu.marginalia.model.html.HtmlStandard; +import nu.marginalia.model.idx.DocumentFlags; +import nu.marginalia.model.idx.DocumentMetadata; import org.apache.commons.lang3.StringUtils; import java.net.URISyntaxException; @@ -65,7 +66,9 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP } 
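For reference, the reworked plugin contract above threads anchor-text keywords (LinkTexts) into keyword extraction up front, instead of patching them onto the keyword set after processing. A minimal caller-side sketch, assuming only the signatures visible in this diff; the variable names are illustrative:

    // Anchor texts are resolved by the caller and handed to createDetails, which
    // passes them on to DocumentKeywordExtractor.extractKeywords(dld, linkTexts, url).
    LinkTexts linkTexts = anchorTextKeywords.getAnchorTextKeywords(domainLinks, url);
    var details = htmlProcessorPlugin.createDetails(crawledDocument, linkTexts, DocumentClass.NORMAL);
    DocumentKeywordsBuilder words = details.words(); // keywords extracted with link texts applied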
@Override - public DetailsWithWords createDetails(CrawledDocument crawledDocument, DocumentClass documentClass) + public DetailsWithWords createDetails(CrawledDocument crawledDocument, + LinkTexts linkTexts, + DocumentClass documentClass) throws DisqualifiedException, URISyntaxException { String documentBody = crawledDocument.documentBody; @@ -104,7 +107,7 @@ public class PlainTextDocumentProcessorPlugin extends AbstractDocumentProcessorP ret.metadata = new DocumentMetadata(documentLengthLogic.getEncodedAverageLength(dld), pubDate.yearByte(), (int) -ret.quality, documentFlags); - DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, url); + DocumentKeywordsBuilder words = keywordExtractor.extractKeywords(dld, linkTexts, url); var tagWords = new MetaTagsBuilder() .addPubDate(pubDate) diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/BlogSpecialization.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/BlogSpecialization.java index f40654bc..9a699a68 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/BlogSpecialization.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/BlogSpecialization.java @@ -3,10 +3,10 @@ package nu.marginalia.converting.processor.plugin.specialization; import ca.rmen.porterstemmer.PorterStemmer; import com.google.inject.Inject; import com.google.inject.Singleton; +import nu.marginalia.converting.processor.summary.SummaryExtractor; import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.idx.WordFlags; -import nu.marginalia.summary.SummaryExtractor; import org.apache.logging.log4j.util.Strings; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; @@ -122,6 +122,11 @@ public class BlogSpecialization extends DefaultSpecialization { String classes = el.attr("class"); String id = el.id(); + String tagName = el.tagName(); + + if (tagName.equalsIgnoreCase("noscript")) + return FilterResult.REMOVE; + for (String badClassElement : badClassElements) { if (classes.contains(badClassElement)) { return FilterResult.REMOVE; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/DefaultSpecialization.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/DefaultSpecialization.java index 5a441639..77f1df12 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/DefaultSpecialization.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/DefaultSpecialization.java @@ -3,7 +3,7 @@ package nu.marginalia.converting.processor.plugin.specialization; import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.converting.processor.logic.dom.DomPruningFilter; -import nu.marginalia.summary.SummaryExtractor; +import nu.marginalia.converting.processor.summary.SummaryExtractor; import org.jsoup.nodes.Document; import java.util.ArrayList; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecialization.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecialization.java index d930cbd0..38bd415f 100644 --- 
a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecialization.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecialization.java @@ -2,7 +2,7 @@ package nu.marginalia.converting.processor.plugin.specialization; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.summary.SummaryExtractor; +import nu.marginalia.converting.processor.summary.SummaryExtractor; import org.apache.commons.lang3.StringUtils; import org.jsoup.nodes.Document; import org.slf4j.Logger; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/LemmySpecialization.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/LemmySpecialization.java index f85847f4..01ec301c 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/LemmySpecialization.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/LemmySpecialization.java @@ -2,7 +2,7 @@ package nu.marginalia.converting.processor.plugin.specialization; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.summary.SummaryExtractor; +import nu.marginalia.converting.processor.summary.SummaryExtractor; import org.jsoup.nodes.Document; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/MariadbKbSpecialization.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/MariadbKbSpecialization.java index 3aa35973..26d58775 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/MariadbKbSpecialization.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/MariadbKbSpecialization.java @@ -4,7 +4,7 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.model.idx.WordFlags; -import nu.marginalia.summary.SummaryExtractor; +import nu.marginalia.converting.processor.summary.SummaryExtractor; import org.jsoup.nodes.Document; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/PhpBBSpecialization.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/PhpBBSpecialization.java index 947cc4c0..36584bae 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/PhpBBSpecialization.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/PhpBBSpecialization.java @@ -3,7 +3,7 @@ package nu.marginalia.converting.processor.plugin.specialization; import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.summary.SummaryExtractor; +import nu.marginalia.converting.processor.summary.SummaryExtractor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/WikiSpecialization.java 
b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/WikiSpecialization.java index c6107870..5c2fd2e7 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/WikiSpecialization.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/WikiSpecialization.java @@ -4,7 +4,7 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import nu.marginalia.keyword.model.DocumentKeywordsBuilder; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.summary.SummaryExtractor; +import nu.marginalia.converting.processor.summary.SummaryExtractor; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecialization.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecialization.java index 16a222b3..af891889 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecialization.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecialization.java @@ -2,7 +2,7 @@ package nu.marginalia.converting.processor.plugin.specialization; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.summary.SummaryExtractor; +import nu.marginalia.converting.processor.summary.SummaryExtractor; import org.jsoup.nodes.Document; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/code/features-convert/pubdate/java/nu/marginalia/pubdate/PubDateEffortLevel.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateEffortLevel.java similarity index 50% rename from code/features-convert/pubdate/java/nu/marginalia/pubdate/PubDateEffortLevel.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateEffortLevel.java index e2fd4e65..47e22ee0 100644 --- a/code/features-convert/pubdate/java/nu/marginalia/pubdate/PubDateEffortLevel.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateEffortLevel.java @@ -1,4 +1,4 @@ -package nu.marginalia.pubdate; +package nu.marginalia.converting.processor.pubdate; public enum PubDateEffortLevel { LOW, diff --git a/code/features-convert/pubdate/java/nu/marginalia/pubdate/PubDateFromHtmlStandard.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateFromHtmlStandard.java similarity index 95% rename from code/features-convert/pubdate/java/nu/marginalia/pubdate/PubDateFromHtmlStandard.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateFromHtmlStandard.java index dfbab8d3..78c27781 100644 --- a/code/features-convert/pubdate/java/nu/marginalia/pubdate/PubDateFromHtmlStandard.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateFromHtmlStandard.java @@ -1,4 +1,4 @@ -package nu.marginalia.pubdate; +package nu.marginalia.converting.processor.pubdate; import nu.marginalia.model.html.HtmlStandard; diff --git a/code/features-convert/pubdate/java/nu/marginalia/pubdate/PubDateHeuristic.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateHeuristic.java similarity index 87% rename from 
code/features-convert/pubdate/java/nu/marginalia/pubdate/PubDateHeuristic.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateHeuristic.java index 56355806..d348c75a 100644 --- a/code/features-convert/pubdate/java/nu/marginalia/pubdate/PubDateHeuristic.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateHeuristic.java @@ -1,4 +1,4 @@ -package nu.marginalia.pubdate; +package nu.marginalia.converting.processor.pubdate; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.PubDate; diff --git a/code/features-convert/pubdate/java/nu/marginalia/pubdate/PubDateParser.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateParser.java similarity index 99% rename from code/features-convert/pubdate/java/nu/marginalia/pubdate/PubDateParser.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateParser.java index 1fbade80..5b139e30 100644 --- a/code/features-convert/pubdate/java/nu/marginalia/pubdate/PubDateParser.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateParser.java @@ -1,7 +1,7 @@ -package nu.marginalia.pubdate; +package nu.marginalia.converting.processor.pubdate; -import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.model.html.HtmlStandard; import java.time.DateTimeException; import java.time.LocalDate; diff --git a/code/features-convert/pubdate/java/nu/marginalia/pubdate/PubDateSniffer.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateSniffer.java similarity index 93% rename from code/features-convert/pubdate/java/nu/marginalia/pubdate/PubDateSniffer.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateSniffer.java index 90b25915..4ec1c4f9 100644 --- a/code/features-convert/pubdate/java/nu/marginalia/pubdate/PubDateSniffer.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/PubDateSniffer.java @@ -1,9 +1,9 @@ -package nu.marginalia.pubdate; +package nu.marginalia.converting.processor.pubdate; -import nu.marginalia.model.html.HtmlStandard; -import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.converting.processor.pubdate.heuristic.*; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.pubdate.heuristic.*; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.model.html.HtmlStandard; import org.jsoup.nodes.Document; import java.util.ArrayList; diff --git a/code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java similarity index 94% rename from code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java index 28059f64..5ab86c17 100644 --- a/code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicDOMParsingPass1.java @@ -1,11 +1,11 @@ -package nu.marginalia.pubdate.heuristic; +package 
nu.marginalia.converting.processor.pubdate.heuristic; -import nu.marginalia.model.html.HtmlStandard; -import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.pubdate.PubDateHeuristic; -import nu.marginalia.pubdate.PubDateParser; +import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; +import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; +import nu.marginalia.converting.processor.pubdate.PubDateParser; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.pubdate.PubDateEffortLevel; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.model.html.HtmlStandard; import org.jetbrains.annotations.NotNull; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; diff --git a/code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java similarity index 91% rename from code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java index bb625180..eb42a3c4 100644 --- a/code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicDOMParsingPass2.java @@ -1,12 +1,12 @@ -package nu.marginalia.pubdate.heuristic; +package nu.marginalia.converting.processor.pubdate.heuristic; -import nu.marginalia.model.html.HtmlStandard; -import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.pubdate.PubDateEffortLevel; -import nu.marginalia.pubdate.PubDateFromHtmlStandard; -import nu.marginalia.pubdate.PubDateHeuristic; -import nu.marginalia.pubdate.PubDateParser; +import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; +import nu.marginalia.converting.processor.pubdate.PubDateFromHtmlStandard; +import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; +import nu.marginalia.converting.processor.pubdate.PubDateParser; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.model.html.HtmlStandard; import org.jetbrains.annotations.NotNull; import org.jsoup.nodes.Document; import org.jsoup.nodes.Node; diff --git a/code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java similarity index 69% rename from code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java index 30486f2f..cffbe178 100644 --- a/code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicGuessFromHtmlStandard.java @@ -1,11 +1,11 @@ -package nu.marginalia.pubdate.heuristic; +package nu.marginalia.converting.processor.pubdate.heuristic; -import nu.marginalia.model.html.HtmlStandard; -import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.pubdate.PubDateEffortLevel; 
-import nu.marginalia.pubdate.PubDateHeuristic; -import nu.marginalia.pubdate.PubDateParser; +import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; +import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; +import nu.marginalia.converting.processor.pubdate.PubDateParser; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.model.html.HtmlStandard; import org.jsoup.nodes.Document; import java.util.Optional; diff --git a/code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java similarity index 77% rename from code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java index 30513a47..1d4d6a90 100644 --- a/code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5AnyTimeTag.java @@ -1,11 +1,11 @@ -package nu.marginalia.pubdate.heuristic; +package nu.marginalia.converting.processor.pubdate.heuristic; -import nu.marginalia.model.html.HtmlStandard; -import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.pubdate.PubDateHeuristic; -import nu.marginalia.pubdate.PubDateParser; +import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; +import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; +import nu.marginalia.converting.processor.pubdate.PubDateParser; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.pubdate.PubDateEffortLevel; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.model.html.HtmlStandard; import org.jsoup.nodes.Document; import java.util.Optional; diff --git a/code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java similarity index 73% rename from code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java index 45c8b091..e484e40b 100644 --- a/code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5ArticleDateTag.java @@ -1,11 +1,11 @@ -package nu.marginalia.pubdate.heuristic; +package nu.marginalia.converting.processor.pubdate.heuristic; -import nu.marginalia.model.html.HtmlStandard; -import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.pubdate.PubDateHeuristic; -import nu.marginalia.pubdate.PubDateParser; +import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; +import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; +import nu.marginalia.converting.processor.pubdate.PubDateParser; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.pubdate.PubDateEffortLevel; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.model.html.HtmlStandard; import 
org.jsoup.nodes.Document; import java.util.Optional; diff --git a/code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java similarity index 73% rename from code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java index aa09d392..0cedf842 100644 --- a/code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicHtml5ItempropDateTag.java @@ -1,11 +1,11 @@ -package nu.marginalia.pubdate.heuristic; +package nu.marginalia.converting.processor.pubdate.heuristic; -import nu.marginalia.model.html.HtmlStandard; -import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.pubdate.PubDateHeuristic; -import nu.marginalia.pubdate.PubDateParser; +import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; +import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; +import nu.marginalia.converting.processor.pubdate.PubDateParser; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.pubdate.PubDateEffortLevel; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.model.html.HtmlStandard; import org.jsoup.nodes.Document; import java.util.Optional; diff --git a/code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicJSONLD.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicJSONLD.java similarity index 89% rename from code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicJSONLD.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicJSONLD.java index 3ddf58eb..27d25208 100644 --- a/code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicJSONLD.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicJSONLD.java @@ -1,16 +1,16 @@ -package nu.marginalia.pubdate.heuristic; +package nu.marginalia.converting.processor.pubdate.heuristic; import com.google.gson.Gson; import com.google.gson.GsonBuilder; import com.google.gson.JsonSyntaxException; import com.google.gson.annotations.SerializedName; import lombok.ToString; -import nu.marginalia.model.html.HtmlStandard; -import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.pubdate.PubDateHeuristic; -import nu.marginalia.pubdate.PubDateParser; +import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; +import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; +import nu.marginalia.converting.processor.pubdate.PubDateParser; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.pubdate.PubDateEffortLevel; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.model.html.HtmlStandard; import org.jsoup.nodes.Document; import java.util.Collections; diff --git a/code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicLastModified.java 
b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicLastModified.java similarity index 75% rename from code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicLastModified.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicLastModified.java index ca42d469..0bc1a4bc 100644 --- a/code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicLastModified.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicLastModified.java @@ -1,11 +1,11 @@ -package nu.marginalia.pubdate.heuristic; +package nu.marginalia.converting.processor.pubdate.heuristic; -import nu.marginalia.model.html.HtmlStandard; -import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.pubdate.PubDateHeuristic; -import nu.marginalia.pubdate.PubDateParser; +import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; +import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; +import nu.marginalia.converting.processor.pubdate.PubDateParser; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.pubdate.PubDateEffortLevel; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.model.html.HtmlStandard; import org.jsoup.nodes.Document; import java.util.Optional; diff --git a/code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicMicrodata.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicMicrodata.java similarity index 73% rename from code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicMicrodata.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicMicrodata.java index 584375f2..04858bbd 100644 --- a/code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicMicrodata.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicMicrodata.java @@ -1,11 +1,11 @@ -package nu.marginalia.pubdate.heuristic; +package nu.marginalia.converting.processor.pubdate.heuristic; -import nu.marginalia.model.html.HtmlStandard; +import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; +import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; +import nu.marginalia.converting.processor.pubdate.PubDateParser; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.pubdate.PubDateHeuristic; -import nu.marginalia.pubdate.PubDateParser; -import nu.marginalia.pubdate.PubDateEffortLevel; +import nu.marginalia.model.html.HtmlStandard; import org.jsoup.nodes.Document; import java.util.Optional; diff --git a/code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicOpenGraph.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicOpenGraph.java similarity index 73% rename from code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicOpenGraph.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicOpenGraph.java index 74a7a654..0c1bc6d3 100644 --- a/code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicOpenGraph.java +++ 
b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicOpenGraph.java @@ -1,11 +1,11 @@ -package nu.marginalia.pubdate.heuristic; +package nu.marginalia.converting.processor.pubdate.heuristic; -import nu.marginalia.model.html.HtmlStandard; -import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.pubdate.PubDateEffortLevel; -import nu.marginalia.pubdate.PubDateHeuristic; -import nu.marginalia.pubdate.PubDateParser; +import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; +import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; +import nu.marginalia.converting.processor.pubdate.PubDateParser; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.model.html.HtmlStandard; import org.jsoup.nodes.Document; import java.util.Optional; diff --git a/code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicRDFaTag.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicRDFaTag.java similarity index 72% rename from code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicRDFaTag.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicRDFaTag.java index 1ed20019..a158bd9a 100644 --- a/code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicRDFaTag.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicRDFaTag.java @@ -1,11 +1,11 @@ -package nu.marginalia.pubdate.heuristic; +package nu.marginalia.converting.processor.pubdate.heuristic; -import nu.marginalia.model.html.HtmlStandard; -import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.pubdate.PubDateEffortLevel; -import nu.marginalia.pubdate.PubDateHeuristic; -import nu.marginalia.pubdate.PubDateParser; +import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; +import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; +import nu.marginalia.converting.processor.pubdate.PubDateParser; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.model.html.HtmlStandard; import org.jsoup.nodes.Document; import java.util.Optional; diff --git a/code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java similarity index 83% rename from code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java index 6a6d5630..16a55c5f 100644 --- a/code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicUrlPatternPass1.java @@ -1,11 +1,11 @@ -package nu.marginalia.pubdate.heuristic; +package nu.marginalia.converting.processor.pubdate.heuristic; -import nu.marginalia.model.html.HtmlStandard; -import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.pubdate.PubDateHeuristic; -import nu.marginalia.pubdate.PubDateParser; +import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; 
+import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; +import nu.marginalia.converting.processor.pubdate.PubDateParser; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.pubdate.PubDateEffortLevel; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.model.html.HtmlStandard; import org.jsoup.nodes.Document; import java.util.Optional; diff --git a/code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java similarity index 82% rename from code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java index ea3ab9d9..e5226266 100644 --- a/code/features-convert/pubdate/java/nu/marginalia/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/pubdate/heuristic/PubDateHeuristicUrlPatternPass2.java @@ -1,11 +1,11 @@ -package nu.marginalia.pubdate.heuristic; +package nu.marginalia.converting.processor.pubdate.heuristic; -import nu.marginalia.model.html.HtmlStandard; -import nu.marginalia.model.crawl.PubDate; -import nu.marginalia.pubdate.PubDateHeuristic; -import nu.marginalia.pubdate.PubDateParser; +import nu.marginalia.converting.processor.pubdate.PubDateEffortLevel; +import nu.marginalia.converting.processor.pubdate.PubDateHeuristic; +import nu.marginalia.converting.processor.pubdate.PubDateParser; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.pubdate.PubDateEffortLevel; +import nu.marginalia.model.crawl.PubDate; +import nu.marginalia.model.html.HtmlStandard; import org.jsoup.nodes.Document; import java.util.Optional; diff --git a/code/features-convert/summary-extraction/java/nu/marginalia/summary/SummaryExtractor.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/summary/SummaryExtractor.java similarity index 94% rename from code/features-convert/summary-extraction/java/nu/marginalia/summary/SummaryExtractor.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/summary/SummaryExtractor.java index 0e422390..7a9bd3da 100644 --- a/code/features-convert/summary-extraction/java/nu/marginalia/summary/SummaryExtractor.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/summary/SummaryExtractor.java @@ -1,8 +1,8 @@ -package nu.marginalia.summary; +package nu.marginalia.converting.processor.summary; import com.google.inject.Inject; import com.google.inject.name.Named; -import nu.marginalia.summary.heuristic.*; +import nu.marginalia.converting.processor.summary.heuristic.*; import org.apache.commons.lang3.StringUtils; import org.jsoup.nodes.Document; diff --git a/code/features-convert/summary-extraction/java/nu/marginalia/summary/heuristic/DomFilterHeuristic.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/DomFilterHeuristic.java similarity index 87% rename from code/features-convert/summary-extraction/java/nu/marginalia/summary/heuristic/DomFilterHeuristic.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/DomFilterHeuristic.java index a06d4408..30d9ccc9 100644 --- 
a/code/features-convert/summary-extraction/java/nu/marginalia/summary/heuristic/DomFilterHeuristic.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/DomFilterHeuristic.java @@ -1,4 +1,4 @@ -package nu.marginalia.summary.heuristic; +package nu.marginalia.converting.processor.summary.heuristic; import com.google.inject.Inject; import com.google.inject.name.Named; @@ -20,7 +20,7 @@ public class DomFilterHeuristic implements SummaryHeuristic { var filter = new SummarizingDOMFilter(); - doc.filter(filter); + doc.body().filter(filter); return filter.getSummary( maxSummaryLength+32, diff --git a/code/features-convert/summary-extraction/java/nu/marginalia/summary/heuristic/FallbackHeuristic.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/FallbackHeuristic.java similarity index 92% rename from code/features-convert/summary-extraction/java/nu/marginalia/summary/heuristic/FallbackHeuristic.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/FallbackHeuristic.java index caf37137..53d5c656 100644 --- a/code/features-convert/summary-extraction/java/nu/marginalia/summary/heuristic/FallbackHeuristic.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/FallbackHeuristic.java @@ -1,4 +1,4 @@ -package nu.marginalia.summary.heuristic; +package nu.marginalia.converting.processor.summary.heuristic; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; diff --git a/code/features-convert/summary-extraction/java/nu/marginalia/summary/heuristic/HeuristicTextUtil.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/HeuristicTextUtil.java similarity index 98% rename from code/features-convert/summary-extraction/java/nu/marginalia/summary/heuristic/HeuristicTextUtil.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/HeuristicTextUtil.java index 6beac2eb..3c7bfa9f 100644 --- a/code/features-convert/summary-extraction/java/nu/marginalia/summary/heuristic/HeuristicTextUtil.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/HeuristicTextUtil.java @@ -1,4 +1,4 @@ -package nu.marginalia.summary.heuristic; +package nu.marginalia.converting.processor.summary.heuristic; import org.apache.commons.lang3.StringUtils; diff --git a/code/features-convert/summary-extraction/java/nu/marginalia/summary/heuristic/MetaDescriptionHeuristic.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/MetaDescriptionHeuristic.java similarity index 83% rename from code/features-convert/summary-extraction/java/nu/marginalia/summary/heuristic/MetaDescriptionHeuristic.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/MetaDescriptionHeuristic.java index d48b6c3b..4ccdc09b 100644 --- a/code/features-convert/summary-extraction/java/nu/marginalia/summary/heuristic/MetaDescriptionHeuristic.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/MetaDescriptionHeuristic.java @@ -1,4 +1,4 @@ -package nu.marginalia.summary.heuristic; +package nu.marginalia.converting.processor.summary.heuristic; import org.jsoup.nodes.Document; diff --git 
a/code/features-convert/summary-extraction/java/nu/marginalia/summary/heuristic/OpenGraphDescriptionHeuristic.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/OpenGraphDescriptionHeuristic.java similarity index 83% rename from code/features-convert/summary-extraction/java/nu/marginalia/summary/heuristic/OpenGraphDescriptionHeuristic.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/OpenGraphDescriptionHeuristic.java index 70f56bd3..4bcfd8e6 100644 --- a/code/features-convert/summary-extraction/java/nu/marginalia/summary/heuristic/OpenGraphDescriptionHeuristic.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/OpenGraphDescriptionHeuristic.java @@ -1,4 +1,4 @@ -package nu.marginalia.summary.heuristic; +package nu.marginalia.converting.processor.summary.heuristic; import org.jsoup.nodes.Document; diff --git a/code/features-convert/summary-extraction/java/nu/marginalia/summary/heuristic/SummarizingDOMFilter.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/SummarizingDOMFilter.java similarity index 97% rename from code/features-convert/summary-extraction/java/nu/marginalia/summary/heuristic/SummarizingDOMFilter.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/SummarizingDOMFilter.java index f72b0eae..ab327744 100644 --- a/code/features-convert/summary-extraction/java/nu/marginalia/summary/heuristic/SummarizingDOMFilter.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/SummarizingDOMFilter.java @@ -1,4 +1,4 @@ -package nu.marginalia.summary.heuristic; +package nu.marginalia.converting.processor.summary.heuristic; import com.google.common.base.Strings; import org.apache.commons.lang3.StringUtils; @@ -10,7 +10,6 @@ import org.jsoup.select.NodeFilter; import java.util.*; import java.util.function.Function; -import static nu.marginalia.summary.heuristic.HeuristicTextUtil.countOccurrencesOfAnyWord; import static org.jsoup.internal.StringUtil.isActuallyWhitespace; import static org.jsoup.internal.StringUtil.isInvisibleChar; @@ -107,8 +106,8 @@ public class SummarizingDOMFilter implements NodeFilter { if (wholeText.length() > 128) return 0; - return countOccurrencesOfAnyWord(wholeText, importantWords) - - countOccurrencesOfAnyWord(wholeText, badWords); + return HeuristicTextUtil.countOccurrencesOfAnyWord(wholeText, importantWords) + - HeuristicTextUtil.countOccurrencesOfAnyWord(wholeText, badWords); }); if (cnt > 0) { diff --git a/code/features-convert/summary-extraction/java/nu/marginalia/summary/heuristic/SummaryHeuristic.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/SummaryHeuristic.java similarity index 73% rename from code/features-convert/summary-extraction/java/nu/marginalia/summary/heuristic/SummaryHeuristic.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/SummaryHeuristic.java index 54b1c33a..c3cef4bb 100644 --- a/code/features-convert/summary-extraction/java/nu/marginalia/summary/heuristic/SummaryHeuristic.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/SummaryHeuristic.java @@ -1,4 +1,4 @@ -package nu.marginalia.summary.heuristic; +package nu.marginalia.converting.processor.summary.heuristic; import 
org.jsoup.nodes.Document; diff --git a/code/features-convert/summary-extraction/java/nu/marginalia/summary/heuristic/TagDensityHeuristic.java b/code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/TagDensityHeuristic.java similarity index 96% rename from code/features-convert/summary-extraction/java/nu/marginalia/summary/heuristic/TagDensityHeuristic.java rename to code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/TagDensityHeuristic.java index 170afec0..dfea3709 100644 --- a/code/features-convert/summary-extraction/java/nu/marginalia/summary/heuristic/TagDensityHeuristic.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/processor/summary/heuristic/TagDensityHeuristic.java @@ -1,4 +1,4 @@ -package nu.marginalia.summary.heuristic; +package nu.marginalia.converting.processor.summary.heuristic; import com.google.inject.Inject; import com.google.inject.name.Named; diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java index 32a0ec62..b7cf244b 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/SideloaderProcessing.java @@ -7,11 +7,12 @@ import nu.marginalia.converting.model.GeneratorType; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.processor.DocumentClass; import nu.marginalia.converting.processor.plugin.HtmlDocumentProcessorPlugin; -import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.keyword.LinkTexts; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.UrlIndexingState; +import nu.marginalia.model.crawldata.CrawledDocument; import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.model.idx.DocumentFlags; import nu.marginalia.model.idx.DocumentMetadata; @@ -37,6 +38,7 @@ public class SideloaderProcessing { DomainLinks domainLinks, GeneratorType type, DocumentClass documentClass, + LinkTexts linkTexts, int pubYear, int size) throws URISyntaxException { var crawledDoc = new CrawledDocument( @@ -64,12 +66,12 @@ public class SideloaderProcessing { var ret = new ProcessedDocument(); try { - var details = htmlProcessorPlugin.createDetails(crawledDoc, documentClass); + var details = htmlProcessorPlugin.createDetails(crawledDoc, linkTexts, documentClass); ret.words = details.words(); for (String keyword : extraKeywords) - ret.words.add(keyword, WordFlags.Subjects.asBit()); + ret.words.addMeta(keyword, WordFlags.Subjects.asBit()); if (type == GeneratorType.WIKI) { ret.words.addAllSyntheticTerms(List.of("generator:wiki")); diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/dirtree/DirtreeSideloader.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/dirtree/DirtreeSideloader.java index 252f9086..f82fa02b 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/dirtree/DirtreeSideloader.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/dirtree/DirtreeSideloader.java @@ -8,6 +8,7 @@ import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.processor.DocumentClass; import 
nu.marginalia.converting.sideload.SideloadSource;
 import nu.marginalia.converting.sideload.SideloaderProcessing;
+import nu.marginalia.keyword.LinkTexts;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.crawl.DomainIndexingState;
@@ -86,6 +87,7 @@ public class DirtreeSideloader implements SideloadSource, AutoCloseable {
                 .processDocument(url, body, extraKeywords, new DomainLinks(),
                         GeneratorType.DOCS,
                         DocumentClass.NORMAL,
+                        new LinkTexts(),
                         LocalDate.now().getYear(),
                         10_000);
     }
diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java
index 17c83250..dae8f499 100644
--- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/encyclopedia/EncyclopediaMarginaliaNuSideloader.java
@@ -28,7 +28,9 @@ import java.net.URISyntaxException;
 import java.net.URLEncoder;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Path;
-import java.sql.*;
+import java.sql.Connection;
+import java.sql.DriverManager;
+import java.sql.SQLException;
 import java.time.LocalDate;
 import java.util.Iterator;
 import java.util.List;
@@ -135,16 +137,10 @@ public class EncyclopediaMarginaliaNuSideloader implements SideloadSource, AutoC
                 domainLinks,
                 GeneratorType.WIKI,
                 DocumentClass.SIDELOAD,
+                anchorTextKeywords.getAnchorTextKeywords(domainLinks, new EdgeUrl(fullUrl)),
                 LocalDate.now().getYear(),
                 10_000_000);
 
-        // Add anchor text keywords
-        if (doc.isProcessedFully()) {
-            doc.words.addAnchorTerms(
-                    anchorTextKeywords.getAnchorTextKeywords(domainLinks, doc.url)
-            );
-        }
-
         return doc;
     }
diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/reddit/RedditSideloader.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/reddit/RedditSideloader.java
index 2d2dc9a5..c2536f77 100644
--- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/reddit/RedditSideloader.java
+++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/reddit/RedditSideloader.java
@@ -142,6 +142,12 @@ public class RedditSideloader implements SideloadSource {
             extraKeywords.add(author);
         }
 
+        List<EdgeUrl> urls = List.of(
+                new EdgeUrl("https://old.reddit.com/r/" + permalink),
+                new EdgeUrl("https://www.reddit.com/r/" + permalink),
+                new EdgeUrl("https://reddit.com/r/" + permalink)
+        );
+
         var doc = sideloaderProcessing
                 .processDocument(fullUrl, fullHtml,
@@ -149,24 +155,14 @@
                         extraKeywords,
                         domainLinks,
                         GeneratorType.FORUM,
                         DocumentClass.SIDELOAD,
+                        anchorTextKeywords.getAnchorTextKeywords(domainLinks, urls),
                         pubYear,
                         10_000_000);
 
         if (doc.isProcessedFully()) {
-            for (String url : List.of(
-                    STR."https://old.reddit.com/r/\{permalink}",
-                    STR."https://www.reddit.com/r/\{permalink}",
-                    STR."https://reddit.com/r/\{permalink}"
-            )) {
-                EdgeUrl.parse(url)
-                        .map(parsed -> anchorTextKeywords.getAnchorTextKeywords(domainLinks, parsed))
-                        .filter(parsed -> !parsed.isEmpty())
-                        .ifPresent(doc.words::addAnchorTerms);
-            }
-
-            for (var keyword : extraKeywords) {
-                doc.words.add(keyword, WordFlags.Subjects.asBit());
+            for (var keyword : extraKeywords) {
+                doc.words.addMeta(keyword, WordFlags.Subjects.asBit());
             }
 
             // Insert topology information
diff --git
a/code/processes/converting-process/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java index 53be14aa..7baabee6 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/stackexchange/StackexchangeSideloader.java @@ -1,10 +1,14 @@ package nu.marginalia.converting.sideload.stackexchange; import lombok.SneakyThrows; -import nu.marginalia.converting.model.*; +import nu.marginalia.converting.model.GeneratorType; +import nu.marginalia.converting.model.ProcessedDocument; +import nu.marginalia.converting.model.ProcessedDocumentDetails; +import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.sideload.SideloadSource; import nu.marginalia.integration.stackexchange.sqlite.StackExchangePostsDb; import nu.marginalia.keyword.DocumentKeywordExtractor; +import nu.marginalia.keyword.LinkTexts; import nu.marginalia.language.sentence.ThreadLocalSentenceExtractorProvider; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; @@ -122,7 +126,7 @@ public class StackexchangeSideloader implements SideloadSource { var dld = sentenceExtractorProvider.get().extractSentences(doc); ret.url = url; - ret.words = keywordExtractor.extractKeywords(dld, url); + ret.words = keywordExtractor.extractKeywords(dld, new LinkTexts(), url); ret.words.addAllSyntheticTerms(List.of( "site:" + domainName, "site:" + url.domain.topDomain, diff --git a/code/processes/converting-process/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java b/code/processes/converting-process/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java index 791f0665..a645e485 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/sideload/warc/WarcSideloader.java @@ -10,6 +10,7 @@ import nu.marginalia.converting.model.ProcessedDomain; import nu.marginalia.converting.processor.DocumentClass; import nu.marginalia.converting.sideload.SideloadSource; import nu.marginalia.converting.sideload.SideloaderProcessing; +import nu.marginalia.keyword.LinkTexts; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.DomainIndexingState; @@ -138,6 +139,7 @@ public class WarcSideloader implements SideloadSource, AutoCloseable { new DomainLinks(), GeneratorType.DOCS, DocumentClass.SIDELOAD, + new LinkTexts(), LocalDate.now().getYear(), // TODO: This should be the actual year of the document 10_000)); } diff --git a/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java b/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java index 14972693..1f305246 100644 --- a/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java +++ b/code/processes/converting-process/java/nu/marginalia/converting/writer/ConverterBatchWriter.java @@ -1,50 +1,50 @@ package nu.marginalia.converting.writer; -import gnu.trove.list.TLongList; -import gnu.trove.list.array.TLongArrayList; import lombok.SneakyThrows; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.model.ProcessedDomain; import 
nu.marginalia.converting.sideload.SideloadSource; -import nu.marginalia.io.processed.DocumentRecordParquetFileWriter; -import nu.marginalia.io.processed.DomainLinkRecordParquetFileWriter; -import nu.marginalia.io.processed.DomainRecordParquetFileWriter; import nu.marginalia.io.processed.ProcessedDataFileNames; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.crawl.HtmlFeature; -import nu.marginalia.model.processed.DocumentRecord; -import nu.marginalia.model.processed.DomainLinkRecord; -import nu.marginalia.model.processed.DomainRecord; +import nu.marginalia.model.processed.SlopDocumentRecord; +import nu.marginalia.model.processed.SlopDomainLinkRecord; +import nu.marginalia.model.processed.SlopDomainRecord; +import nu.marginalia.sequence.VarintCodedSequence; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.file.Files; import java.nio.file.Path; import java.util.*; -import java.util.concurrent.Callable; -import java.util.concurrent.ForkJoinPool; -import java.util.concurrent.Future; /** Writer for a single batch of converter parquet files */ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriterIf { - private final DomainRecordParquetFileWriter domainWriter; - private final DomainLinkRecordParquetFileWriter domainLinkWriter; - private final DocumentRecordParquetFileWriter documentWriter; + private final SlopDomainRecord.Writer domainWriter; + private final SlopDomainLinkRecord.Writer domainLinkWriter; + private final SlopDocumentRecord.Writer documentWriter; private static final Logger logger = LoggerFactory.getLogger(ConverterBatchWriter.class); public ConverterBatchWriter(Path basePath, int batchNumber) throws IOException { - domainWriter = new DomainRecordParquetFileWriter( - ProcessedDataFileNames.domainFileName(basePath, batchNumber) - ); - domainLinkWriter = new DomainLinkRecordParquetFileWriter( - ProcessedDataFileNames.domainLinkFileName(basePath, batchNumber) - ); - documentWriter = new DocumentRecordParquetFileWriter( - ProcessedDataFileNames.documentFileName(basePath, batchNumber) - ); + if (!Files.exists(ProcessedDataFileNames.domainFileName(basePath))) { + Files.createDirectory(ProcessedDataFileNames.domainFileName(basePath)); + } + domainWriter = new SlopDomainRecord.Writer(ProcessedDataFileNames.domainFileName(basePath), batchNumber); + + if (!Files.exists(ProcessedDataFileNames.domainLinkFileName(basePath))) { + Files.createDirectory(ProcessedDataFileNames.domainLinkFileName(basePath)); + } + domainLinkWriter = new SlopDomainLinkRecord.Writer(ProcessedDataFileNames.domainLinkFileName(basePath), batchNumber); + + if (!Files.exists(ProcessedDataFileNames.documentFileName(basePath))) { + Files.createDirectory(ProcessedDataFileNames.documentFileName(basePath)); + } + documentWriter = new SlopDocumentRecord.Writer(ProcessedDataFileNames.documentFileName(basePath), batchNumber); } @Override @@ -57,39 +57,23 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter var domain = sideloadSource.getDomain(); writeDomainData(domain); - writeDocumentData(domain.domain, sideloadSource.getDocumentsStream()); } @Override @SneakyThrows public void writeProcessedDomain(ProcessedDomain domain) { - var results = ForkJoinPool.commonPool().invokeAll( - writeTasks(domain) - ); - - for (var result : results) { - if (result.state() == Future.State.FAILED) { - 
logger.warn("Parquet writing job failed", result.exceptionNow());
+        try {
+            if (domain.documents != null) {
+                writeDocumentData(domain.domain, domain.documents.iterator());
             }
+            writeLinkData(domain);
+            writeDomainData(domain);
+        }
+        catch (IOException e) {
+            logger.error("Data writing job failed", e);
         }
-    }
 
-    private List<Callable<Object>> writeTasks(ProcessedDomain domain) {
-        return List.of(
-                () -> writeDocumentData(domain),
-                () -> writeLinkData(domain),
-                () -> writeDomainData(domain)
-        );
-    }
-
-    private Object writeDocumentData(ProcessedDomain domain) throws IOException {
-        if (domain.documents == null)
-            return this;
-
-        writeDocumentData(domain.domain, domain.documents.iterator());
-
-        return this;
     }
 
     private void writeDocumentData(EdgeDomain domain,
@@ -101,53 +85,49 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
 
         String domainName = domain.toString();
 
+        ByteBuffer workArea = ByteBuffer.allocate(16384);
+
         while (documentIterator.hasNext()) {
             var document = documentIterator.next();
-            if (document.details == null) {
-                new DocumentRecord(
-                        domainName,
-                        document.url.toString(),
-                        ordinal,
-                        document.state.toString(),
-                        document.stateReason,
-                        null,
-                        null,
-                        0,
-                        null,
-                        0,
-                        0L,
-                        -15,
-                        0L,
-                        null,
-                        null,
-                        null);
-            }
-            else {
-                var wb = document.words.build();
-                List<String> words = Arrays.asList(wb.keywords);
-                TLongList metas = new TLongArrayList(wb.metadata);
-
-                documentWriter.write(new DocumentRecord(
-                        domainName,
-                        document.url.toString(),
-                        ordinal,
-                        document.state.toString(),
-                        document.stateReason,
-                        document.details.title,
-                        document.details.description,
-                        HtmlFeature.encode(document.details.features),
-                        document.details.standard.name(),
-                        document.details.length,
-                        document.details.hashCode,
-                        (float) document.details.quality,
-                        document.details.metadata.encode(),
-                        document.details.pubYear,
-                        words,
-                        metas
-                ));
+            if (document.details == null || document.words == null) {
+                continue;
             }
+            var wb = document.words.build(workArea);
+
+            List<VarintCodedSequence> spanSequences = new ArrayList<>(wb.spans.size());
+            byte[] spanCodes = new byte[wb.spans.size()];
+
+            for (int i = 0; i < wb.spans.size(); i++) {
+                var span = wb.spans.get(i);
+
+                spanCodes[i] = span.code();
+                spanSequences.add(span.spans());
+            }
+
+            documentWriter.write(new SlopDocumentRecord(
+                    domainName,
+                    document.url.toString(),
+                    ordinal,
+                    document.state.toString(),
+                    document.stateReason,
+                    document.details.title,
+                    document.details.description,
+                    HtmlFeature.encode(document.details.features),
+                    document.details.standard.name(),
+                    document.details.length,
+                    document.details.hashCode,
+                    (float) document.details.quality,
+                    document.details.metadata.encode(),
+                    document.details.pubYear,
+                    wb.keywords,
+                    wb.metadata,
+                    wb.positions,
+                    spanCodes,
+                    spanSequences
+            ));
+
+            ordinal++;
         }
@@ -172,7 +152,7 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
                 continue;
             }
 
-            domainLinkWriter.write(new DomainLinkRecord(
+            domainLinkWriter.write(new SlopDomainLinkRecord(
                     from,
                     dest.toString()
             ));
@@ -180,7 +160,7 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
         }
 
         if (domain.redirect != null) {
-            domainLinkWriter.write(new DomainLinkRecord(
+            domainLinkWriter.write(new SlopDomainLinkRecord(
                     from,
                     domain.redirect.toString()
             ));
@@ -195,13 +175,13 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
         List<String> feeds = getFeedUrls(domain);
 
         domainWriter.write(
-                new DomainRecord(
+                new SlopDomainRecord(
                         domain.domain.toString(),
metadata.known(), metadata.good(), metadata.visited(), - Optional.ofNullable(domain.state).map(DomainIndexingState::toString).orElse(null), - Optional.ofNullable(domain.redirect).map(EdgeDomain::toString).orElse(null), + Optional.ofNullable(domain.state).map(DomainIndexingState::toString).orElse(""), + Optional.ofNullable(domain.redirect).map(EdgeDomain::toString).orElse(""), domain.ip, feeds ) diff --git a/code/features-convert/reddit-json/java/nu/marginalia/integration/reddit/RedditEntryReader.java b/code/processes/converting-process/java/nu/marginalia/integration/reddit/RedditEntryReader.java similarity index 100% rename from code/features-convert/reddit-json/java/nu/marginalia/integration/reddit/RedditEntryReader.java rename to code/processes/converting-process/java/nu/marginalia/integration/reddit/RedditEntryReader.java diff --git a/code/features-convert/reddit-json/java/nu/marginalia/integration/reddit/db/RedditDb.java b/code/processes/converting-process/java/nu/marginalia/integration/reddit/db/RedditDb.java similarity index 100% rename from code/features-convert/reddit-json/java/nu/marginalia/integration/reddit/db/RedditDb.java rename to code/processes/converting-process/java/nu/marginalia/integration/reddit/db/RedditDb.java diff --git a/code/features-convert/reddit-json/java/nu/marginalia/integration/reddit/model/ProcessableRedditComment.java b/code/processes/converting-process/java/nu/marginalia/integration/reddit/model/ProcessableRedditComment.java similarity index 100% rename from code/features-convert/reddit-json/java/nu/marginalia/integration/reddit/model/ProcessableRedditComment.java rename to code/processes/converting-process/java/nu/marginalia/integration/reddit/model/ProcessableRedditComment.java diff --git a/code/features-convert/reddit-json/java/nu/marginalia/integration/reddit/model/ProcessableRedditSubmission.java b/code/processes/converting-process/java/nu/marginalia/integration/reddit/model/ProcessableRedditSubmission.java similarity index 100% rename from code/features-convert/reddit-json/java/nu/marginalia/integration/reddit/model/ProcessableRedditSubmission.java rename to code/processes/converting-process/java/nu/marginalia/integration/reddit/model/ProcessableRedditSubmission.java diff --git a/code/features-convert/reddit-json/java/nu/marginalia/integration/reddit/model/RawRedditComment.java b/code/processes/converting-process/java/nu/marginalia/integration/reddit/model/RawRedditComment.java similarity index 100% rename from code/features-convert/reddit-json/java/nu/marginalia/integration/reddit/model/RawRedditComment.java rename to code/processes/converting-process/java/nu/marginalia/integration/reddit/model/RawRedditComment.java diff --git a/code/features-convert/reddit-json/java/nu/marginalia/integration/reddit/model/RawRedditSubmission.java b/code/processes/converting-process/java/nu/marginalia/integration/reddit/model/RawRedditSubmission.java similarity index 100% rename from code/features-convert/reddit-json/java/nu/marginalia/integration/reddit/model/RawRedditSubmission.java rename to code/processes/converting-process/java/nu/marginalia/integration/reddit/model/RawRedditSubmission.java diff --git a/code/features-convert/stackexchange-xml/java/nu/marginalia/integration/stackexchange/model/StackExchangeComment.java b/code/processes/converting-process/java/nu/marginalia/integration/stackexchange/model/StackExchangeComment.java similarity index 100% rename from 
code/features-convert/stackexchange-xml/java/nu/marginalia/integration/stackexchange/model/StackExchangeComment.java rename to code/processes/converting-process/java/nu/marginalia/integration/stackexchange/model/StackExchangeComment.java diff --git a/code/features-convert/stackexchange-xml/java/nu/marginalia/integration/stackexchange/model/StackExchangePost.java b/code/processes/converting-process/java/nu/marginalia/integration/stackexchange/model/StackExchangePost.java similarity index 100% rename from code/features-convert/stackexchange-xml/java/nu/marginalia/integration/stackexchange/model/StackExchangePost.java rename to code/processes/converting-process/java/nu/marginalia/integration/stackexchange/model/StackExchangePost.java diff --git a/code/features-convert/stackexchange-xml/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java b/code/processes/converting-process/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java similarity index 100% rename from code/features-convert/stackexchange-xml/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java rename to code/processes/converting-process/java/nu/marginalia/integration/stackexchange/sqlite/StackExchangePostsDb.java diff --git a/code/features-convert/stackexchange-xml/java/nu/marginalia/integration/stackexchange/xml/StackExchange7zXmlEventReaderSource.java b/code/processes/converting-process/java/nu/marginalia/integration/stackexchange/xml/StackExchange7zXmlEventReaderSource.java similarity index 100% rename from code/features-convert/stackexchange-xml/java/nu/marginalia/integration/stackexchange/xml/StackExchange7zXmlEventReaderSource.java rename to code/processes/converting-process/java/nu/marginalia/integration/stackexchange/xml/StackExchange7zXmlEventReaderSource.java diff --git a/code/features-convert/stackexchange-xml/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlCommentReader.java b/code/processes/converting-process/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlCommentReader.java similarity index 100% rename from code/features-convert/stackexchange-xml/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlCommentReader.java rename to code/processes/converting-process/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlCommentReader.java diff --git a/code/features-convert/stackexchange-xml/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlIterator.java b/code/processes/converting-process/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlIterator.java similarity index 100% rename from code/features-convert/stackexchange-xml/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlIterator.java rename to code/processes/converting-process/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlIterator.java diff --git a/code/features-convert/stackexchange-xml/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlPostReader.java b/code/processes/converting-process/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlPostReader.java similarity index 100% rename from code/features-convert/stackexchange-xml/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlPostReader.java rename to code/processes/converting-process/java/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlPostReader.java diff --git a/code/features-convert/stackexchange-xml/java/nu/marginalia/integration/stackexchange/xml/XmlEventReaderSource.java 
b/code/processes/converting-process/java/nu/marginalia/integration/stackexchange/xml/XmlEventReaderSource.java similarity index 100% rename from code/features-convert/stackexchange-xml/java/nu/marginalia/integration/stackexchange/xml/XmlEventReaderSource.java rename to code/processes/converting-process/java/nu/marginalia/integration/stackexchange/xml/XmlEventReaderSource.java diff --git a/code/process-models/processed-data/build.gradle b/code/processes/converting-process/model/build.gradle similarity index 69% rename from code/process-models/processed-data/build.gradle rename to code/processes/converting-process/model/build.gradle index 04ee95de..14beb987 100644 --- a/code/process-models/processed-data/build.gradle +++ b/code/processes/converting-process/model/build.gradle @@ -12,17 +12,23 @@ java { } apply from: "$rootProject.projectDir/srcsets.gradle" +jar.archiveBaseName = 'converting-process-model' + dependencies { implementation libs.bundles.slf4j implementation project(':third-party:parquet-floor') + implementation project(':code:libraries:coded-sequence') + implementation libs.slop implementation libs.notnull + implementation libs.roaringbitmap implementation libs.trove implementation libs.bundles.parquet testImplementation libs.bundles.slf4j.test testImplementation libs.bundles.junit testImplementation libs.mockito + testImplementation project(':code:libraries:test-helpers') } diff --git a/code/processes/converting-process/model/java/nu/marginalia/io/processed/ProcessedDataFileNames.java b/code/processes/converting-process/model/java/nu/marginalia/io/processed/ProcessedDataFileNames.java new file mode 100644 index 00000000..44b56bc3 --- /dev/null +++ b/code/processes/converting-process/model/java/nu/marginalia/io/processed/ProcessedDataFileNames.java @@ -0,0 +1,16 @@ +package nu.marginalia.io.processed; + +import java.nio.file.Path; + +public class ProcessedDataFileNames { + public static Path documentFileName(Path base) { + return base.resolve("document"); + } + public static Path domainFileName(Path base) { + return base.resolve("domains"); + } + public static Path domainLinkFileName(Path base) { + return base.resolve("domain-link"); + } + +} diff --git a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDocumentRecord.java b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDocumentRecord.java new file mode 100644 index 00000000..dacb1f60 --- /dev/null +++ b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDocumentRecord.java @@ -0,0 +1,334 @@ +package nu.marginalia.model.processed; + +import lombok.Builder; +import nu.marginalia.sequence.VarintCodedSequence; +import nu.marginalia.sequence.slop.VarintCodedSequenceArrayColumn; +import nu.marginalia.slop.SlopTable; +import nu.marginalia.slop.column.array.ByteArrayColumn; +import nu.marginalia.slop.column.array.ObjectArrayColumn; +import nu.marginalia.slop.column.dynamic.VarintColumn; +import nu.marginalia.slop.column.primitive.FloatColumn; +import nu.marginalia.slop.column.primitive.IntColumn; +import nu.marginalia.slop.column.primitive.LongColumn; +import nu.marginalia.slop.column.string.EnumColumn; +import nu.marginalia.slop.column.string.StringColumn; +import nu.marginalia.slop.column.string.TxtStringColumn; +import nu.marginalia.slop.desc.StorageType; +import org.jetbrains.annotations.Nullable; + +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Path; +import java.util.Arrays; +import 
java.util.List;
+import java.util.Objects;
+
+public record SlopDocumentRecord(
+        String domain,
+        String url,
+        int ordinal,
+        String state,
+        String stateReason,
+        String title,
+        String description,
+        int htmlFeatures,
+        String htmlStandard,
+        int length,
+        long hash,
+        float quality,
+        long documentMetadata,
+        Integer pubYear,
+        List<String> words,
+        byte[] metas,
+        List<VarintCodedSequence> positions,
+        byte[] spanCodes,
+        List<VarintCodedSequence> spans
+) {
+
+    public SlopDocumentRecord {
+        if (spanCodes.length != spans.size())
+            throw new IllegalArgumentException("Span codes and spans must have the same length");
+        if (metas.length != words.size() || metas.length != positions.size())
+            throw new IllegalArgumentException("Metas, words and positions must have the same length");
+    }
+
+    @Builder
+    public record KeywordsProjection(
+            String domain,
+            int ordinal,
+            int htmlFeatures,
+            long documentMetadata,
+            int length,
+            List<String> words,
+            byte[] metas,
+            List<VarintCodedSequence> positions,
+            byte[] spanCodes,
+            List<VarintCodedSequence> spans)
+    {
+        // Override the equals method since records don't generate default equals that deal with array fields properly
+        @Override
+        public boolean equals(Object o) {
+            if (this == o) return true;
+            if (!(o instanceof KeywordsProjection that)) return false;
+
+            return length == that.length && ordinal == that.ordinal && htmlFeatures == that.htmlFeatures && documentMetadata == that.documentMetadata && Arrays.equals(metas, that.metas) && Objects.equals(domain, that.domain) && Arrays.equals(spanCodes, that.spanCodes) && Objects.equals(words, that.words) && Objects.equals(spans, that.spans) && Objects.equals(positions, that.positions);
+        }
+
+        @Override
+        public int hashCode() {
+            int result = Objects.hashCode(domain);
+            result = 31 * result + ordinal;
+            result = 31 * result + htmlFeatures;
+            result = 31 * result + Long.hashCode(documentMetadata);
+            result = 31 * result + length;
+            result = 31 * result + Objects.hashCode(words);
+            result = 31 * result + Arrays.hashCode(metas);
+            result = 31 * result + Objects.hashCode(positions);
+            result = 31 * result + Arrays.hashCode(spanCodes);
+            result = 31 * result + Objects.hashCode(spans);
+            return result;
+        }
+    }
+
+    public record MetadataProjection(
+            String domain,
+            String url,
+            int ordinal,
+            String title,
+            String description,
+            int htmlFeatures,
+            String htmlStandard,
+            int length,
+            long hash,
+            float quality,
+            Integer pubYear
+    ) {
+
+    }
+
+    // Basic information
+    private static final TxtStringColumn domainsColumn = new TxtStringColumn("domain", StandardCharsets.UTF_8, StorageType.GZIP);
+    private static final TxtStringColumn urlsColumn = new TxtStringColumn("url", StandardCharsets.UTF_8, StorageType.GZIP);
+    private static final VarintColumn ordinalsColumn = new VarintColumn("ordinal", StorageType.PLAIN);
+    private static final EnumColumn statesColumn = new EnumColumn("state", StandardCharsets.US_ASCII, StorageType.PLAIN);
+    private static final StringColumn stateReasonsColumn = new StringColumn("stateReason", StandardCharsets.US_ASCII, StorageType.GZIP);
+
+    // Document metadata
+    private static final StringColumn titlesColumn = new StringColumn("title", StandardCharsets.UTF_8, StorageType.GZIP);
+    private static final StringColumn descriptionsColumn = new StringColumn("description", StandardCharsets.UTF_8, StorageType.GZIP);
+    private static final EnumColumn htmlStandardsColumn = new EnumColumn("htmlStandard", StandardCharsets.UTF_8, StorageType.PLAIN);
+    private static final IntColumn htmlFeaturesColumn = new IntColumn("htmlFeatures", StorageType.PLAIN);
+    private static final IntColumn lengthsColumn = new IntColumn("length", StorageType.PLAIN);
+    private static final IntColumn pubYearColumn = new IntColumn("pubYear", StorageType.PLAIN);
+    private static final LongColumn hashesColumn = new LongColumn("hash", StorageType.PLAIN);
+    private static final FloatColumn qualitiesColumn = new FloatColumn("quality", StorageType.PLAIN);
+    private static final LongColumn domainMetadata = new LongColumn("domainMetadata", StorageType.PLAIN);
+
+    // Keyword-level columns, these are enumerated by the counts column
+
+    private static final ObjectArrayColumn<String> keywordsColumn = new StringColumn("keywords", StandardCharsets.UTF_8, StorageType.ZSTD).asArray();
+    private static final ByteArrayColumn termMetaColumn = new ByteArrayColumn("termMetadata", StorageType.ZSTD);
+    private static final VarintCodedSequenceArrayColumn termPositionsColumn = new VarintCodedSequenceArrayColumn("termPositions", StorageType.ZSTD);
+
+    // Spans columns
+
+    private static final ByteArrayColumn spanCodesColumn = new ByteArrayColumn("spanCodes", StorageType.ZSTD);
+    private static final VarintCodedSequenceArrayColumn spansColumn = new VarintCodedSequenceArrayColumn("spans", StorageType.ZSTD);
+
+    public static class KeywordsProjectionReader extends SlopTable {
+        private final TxtStringColumn.Reader domainsReader;
+        private final VarintColumn.Reader ordinalsReader;
+        private final IntColumn.Reader htmlFeaturesReader;
+        private final LongColumn.Reader domainMetadataReader;
+        private final IntColumn.Reader lengthsReader;
+
+        private final ObjectArrayColumn<String>.Reader keywordsReader;
+        private final ByteArrayColumn.Reader termMetaReader;
+        private final VarintCodedSequenceArrayColumn.Reader termPositionsReader;
+
+        private final ByteArrayColumn.Reader spanCodesReader;
+        private final VarintCodedSequenceArrayColumn.Reader spansReader;
+
+        public KeywordsProjectionReader(SlopTable.Ref<SlopDocumentRecord> pageRef) throws IOException {
+            super(pageRef);
+            domainsReader = domainsColumn.open(this);
+            ordinalsReader = ordinalsColumn.open(this);
+            htmlFeaturesReader = htmlFeaturesColumn.open(this);
+            domainMetadataReader = domainMetadata.open(this);
+            lengthsReader = lengthsColumn.open(this);
+
+            keywordsReader = keywordsColumn.open(this);
+            termMetaReader = termMetaColumn.open(this);
+            termPositionsReader = termPositionsColumn.open(this);
+
+            spanCodesReader = spanCodesColumn.open(this);
+            spansReader = spansColumn.open(this);
+        }
+
+        public boolean hasMore() throws IOException {
+            return domainsReader.hasRemaining();
+        }
+
+        @Nullable
+        public KeywordsProjection next() throws IOException {
+            String domain = domainsReader.get();
+            int ordinal = ordinalsReader.get();
+            int htmlFeatures = htmlFeaturesReader.get();
+            long documentMetadata = domainMetadataReader.get();
+            int length = lengthsReader.get();
+
+            List<String> words = keywordsReader.get();
+            List<VarintCodedSequence> positions = termPositionsReader.get();
+            byte[] metas = termMetaReader.get();
+            byte[] spanCodes = spanCodesReader.get();
+            List<VarintCodedSequence> spans = spansReader.get();
+
+            return new KeywordsProjection(
+                    domain,
+                    ordinal,
+                    htmlFeatures,
+                    documentMetadata,
+                    length,
+                    words,
+                    metas,
+                    positions,
+                    spanCodes,
+                    spans
+            );
+        }
+
+    }
+
+    public static class MetadataReader extends SlopTable {
+        private final TxtStringColumn.Reader domainsReader;
+        private final TxtStringColumn.Reader urlsReader;
+        private final VarintColumn.Reader ordinalsReader;
+        private final StringColumn.Reader titlesReader;
+        private final StringColumn.Reader descriptionsReader;
+
+        private final IntColumn.Reader htmlFeaturesReader;
+        private final EnumColumn.Reader htmlStandardsReader;
+        private final IntColumn.Reader lengthsReader;
+        private final LongColumn.Reader hashesReader;
+        private final FloatColumn.Reader qualitiesReader;
+        private final IntColumn.Reader pubYearReader;
+
+        public MetadataReader(SlopTable.Ref<SlopDocumentRecord> pageRef) throws IOException{
+            super(pageRef);
+
+            this.domainsReader = domainsColumn.open(this);
+            this.urlsReader = urlsColumn.open(this);
+            this.ordinalsReader = ordinalsColumn.open(this);
+            this.titlesReader = titlesColumn.open(this);
+            this.descriptionsReader = descriptionsColumn.open(this);
+            this.htmlFeaturesReader = htmlFeaturesColumn.open(this);
+            this.htmlStandardsReader = htmlStandardsColumn.open(this);
+            this.lengthsReader = lengthsColumn.open(this);
+            this.hashesReader = hashesColumn.open(this);
+            this.qualitiesReader = qualitiesColumn.open(this);
+            this.pubYearReader = pubYearColumn.open(this);
+        }
+
+        public MetadataReader(Path baseDir, int page) throws IOException {
+            this(new Ref<>(baseDir, page));
+        }
+
+        public boolean hasMore() throws IOException {
+            return domainsReader.hasRemaining();
+        }
+
+        public MetadataProjection next() throws IOException {
+            int pubYear = pubYearReader.get();
+            return new MetadataProjection(
+                    domainsReader.get(),
+                    urlsReader.get(),
+                    ordinalsReader.get(),
+                    titlesReader.get(),
+                    descriptionsReader.get(),
+                    htmlFeaturesReader.get(),
+                    htmlStandardsReader.get(),
+                    lengthsReader.get(),
+                    hashesReader.get(),
+                    qualitiesReader.get(),
+                    pubYear < 0 ? null : pubYear
+            );
+        }
+
+    }
+
+    public static class Writer extends SlopTable {
+        private final TxtStringColumn.Writer domainsWriter;
+        private final TxtStringColumn.Writer urlsWriter;
+        private final VarintColumn.Writer ordinalsWriter;
+        private final EnumColumn.Writer statesWriter;
+        private final StringColumn.Writer stateReasonsWriter;
+        private final StringColumn.Writer titlesWriter;
+        private final StringColumn.Writer descriptionsWriter;
+        private final IntColumn.Writer htmlFeaturesWriter;
+        private final EnumColumn.Writer htmlStandardsWriter;
+        private final IntColumn.Writer lengthsWriter;
+        private final LongColumn.Writer hashesWriter;
+        private final FloatColumn.Writer qualitiesWriter;
+        private final LongColumn.Writer domainMetadataWriter;
+        private final IntColumn.Writer pubYearWriter;
+        private final ObjectArrayColumn<String>.Writer keywordsWriter;
+        private final ByteArrayColumn.Writer termMetaWriter;
+        private final VarintCodedSequenceArrayColumn.Writer termPositionsWriter;
+        private final ByteArrayColumn.Writer spansCodesWriter;
+        private final VarintCodedSequenceArrayColumn.Writer spansWriter;
+
+        public Writer(Path baseDir, int page) throws IOException {
+            super(baseDir, page);
+
+            domainsWriter = domainsColumn.create(this);
+            urlsWriter = urlsColumn.create(this);
+            ordinalsWriter = ordinalsColumn.create(this);
+            statesWriter = statesColumn.create(this);
+            stateReasonsWriter = stateReasonsColumn.create(this);
+            titlesWriter = titlesColumn.create(this);
+            descriptionsWriter = descriptionsColumn.create(this);
+            htmlFeaturesWriter = htmlFeaturesColumn.create(this);
+            htmlStandardsWriter = htmlStandardsColumn.create(this);
+            lengthsWriter = lengthsColumn.create(this);
+            hashesWriter = hashesColumn.create(this);
+            qualitiesWriter = qualitiesColumn.create(this);
+            domainMetadataWriter = domainMetadata.create(this);
+            pubYearWriter = pubYearColumn.create(this);
+
+            keywordsWriter = keywordsColumn.create(this);
+            termMetaWriter = termMetaColumn.create(this);
+            termPositionsWriter = termPositionsColumn.create(this);
+
+            spansCodesWriter = spanCodesColumn.create(this);
+            spansWriter = spansColumn.create(this);
+        }
+
+        public void write(SlopDocumentRecord record) throws IOException {
+            domainsWriter.put(record.domain());
+            urlsWriter.put(record.url());
+            ordinalsWriter.put(record.ordinal());
+            statesWriter.put(record.state());
+            stateReasonsWriter.put(record.stateReason());
+            titlesWriter.put(record.title());
+            descriptionsWriter.put(record.description());
+            htmlFeaturesWriter.put(record.htmlFeatures());
+            htmlStandardsWriter.put(record.htmlStandard());
+            lengthsWriter.put(record.length());
+            hashesWriter.put(record.hash());
+            qualitiesWriter.put(record.quality());
+            domainMetadataWriter.put(record.documentMetadata());
+
+            if (record.pubYear == null) {
+                pubYearWriter.put(-1);
+            } else {
+                pubYearWriter.put(record.pubYear());
+            }
+
+            keywordsWriter.put(record.words());
+            termMetaWriter.put(record.metas());
+            termPositionsWriter.put(record.positions());
+            spansCodesWriter.put(record.spanCodes());
+            spansWriter.put(record.spans());
+        }
+    }
+}
diff --git a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainLinkRecord.java b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainLinkRecord.java
new file mode 100644
index 00000000..9a43dbf0
--- /dev/null
+++ b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainLinkRecord.java
@@ -0,0 +1,79 @@
+package nu.marginalia.model.processed;
+
+import nu.marginalia.slop.SlopTable;
+import nu.marginalia.slop.column.string.TxtStringColumn;
+import nu.marginalia.slop.desc.StorageType;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.util.function.BiConsumer;
+
+public record SlopDomainLinkRecord(
+        String source,
+        String dest)
+{
+    private static final TxtStringColumn sourcesColumn = new TxtStringColumn("source", StandardCharsets.UTF_8, StorageType.GZIP);
+    private static final TxtStringColumn destsColumn = new TxtStringColumn("dest", StandardCharsets.UTF_8, StorageType.GZIP);
+
+    public static Reader reader(Path baseDir, int page) throws IOException {
+        return new Reader(baseDir, page);
+    }
+
+    public static class Reader extends SlopTable {
+        private final TxtStringColumn.Reader sourcesReader;
+        private final TxtStringColumn.Reader destsReader;
+
+        public Reader(SlopTable.Ref<SlopDomainLinkRecord> ref) throws IOException {
+            super(ref);
+
+            sourcesReader = sourcesColumn.open(this);
+            destsReader = destsColumn.open(this);
+        }
+
+        public Reader(Path baseDir, int page) throws IOException {
+            this(new Ref<>(baseDir, page));
+        }
+
+        public boolean hasMore() throws IOException {
+            return sourcesReader.hasRemaining();
+        }
+
+        public void forEach(BiConsumer<String, String> recordConsumer) throws IOException {
+            while (hasMore()) {
+                recordConsumer.accept(sourcesReader.get(), destsReader.get());
+            }
+        }
+
+        public SlopDomainLinkRecord next() throws IOException {
+
+            return new SlopDomainLinkRecord(
+                    sourcesReader.get(),
+                    destsReader.get()
+            );
+        }
+    }
+
+    public static class Writer extends SlopTable {
+        private final TxtStringColumn.Writer sourcesWriter;
+        private final TxtStringColumn.Writer destsWriter;
+
+        public Writer(Path baseDir, int page) throws IOException {
+            super(baseDir, page);
+
+            sourcesWriter = sourcesColumn.create(this);
+            destsWriter = destsColumn.create(this);
+        }
+
+        public void write(SlopDomainLinkRecord record) throws IOException {
+            sourcesWriter.put(record.source());
+            destsWriter.put(record.dest());
+        }
+
+        @Override
+        public void close() throws IOException {
+            sourcesWriter.close();
+            destsWriter.close();
+        }
+    }
+}
diff --git a/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainRecord.java b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainRecord.java
new file mode 100644
index 00000000..820d0c7f
--- /dev/null
+++ b/code/processes/converting-process/model/java/nu/marginalia/model/processed/SlopDomainRecord.java
@@ -0,0 +1,189 @@
+package nu.marginalia.model.processed;
+
+import nu.marginalia.slop.SlopTable;
+import nu.marginalia.slop.column.array.ObjectArrayColumn;
+import nu.marginalia.slop.column.primitive.IntColumn;
+import nu.marginalia.slop.column.string.EnumColumn;
+import nu.marginalia.slop.column.string.TxtStringColumn;
+import nu.marginalia.slop.desc.StorageType;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.util.List;
+import java.util.function.Consumer;
+
+public record SlopDomainRecord(
+        String domain,
+        int knownUrls,
+        int goodUrls,
+        int visitedUrls,
+        String state,
+        String redirectDomain,
+        String ip,
+        List<String> rssFeeds)
+{
+
+    public record DomainWithIpProjection(
+            String domain,
+            String ip)
+    {}
+
+    private static final TxtStringColumn domainsColumn = new TxtStringColumn("domain", StandardCharsets.UTF_8, StorageType.GZIP);
+    private static final EnumColumn statesColumn = new EnumColumn("state", StandardCharsets.US_ASCII, StorageType.PLAIN);
+    private static final TxtStringColumn redirectDomainsColumn = new TxtStringColumn("redirectDomain", StandardCharsets.UTF_8, StorageType.GZIP);
+    private static final TxtStringColumn ipColumn = new TxtStringColumn("ip", StandardCharsets.US_ASCII, StorageType.GZIP);
+
+    private static final IntColumn knownUrlsColumn = new IntColumn("knownUrls", StorageType.PLAIN);
+    private static final IntColumn goodUrlsColumn = new IntColumn("goodUrls", StorageType.PLAIN);
+    private static final IntColumn visitedUrlsColumn = new IntColumn("visitedUrls", StorageType.PLAIN);
+
+    private static final ObjectArrayColumn<String> rssFeedsColumn = new TxtStringColumn("rssFeeds", StandardCharsets.UTF_8, StorageType.GZIP).asArray();
+
+
+    public static class DomainNameReader extends SlopTable {
+        private final TxtStringColumn.Reader domainsReader;
+
+        public DomainNameReader(Path baseDir, int page) throws IOException {
+            this(new Ref<>(baseDir, page));
+        }
+
+        public DomainNameReader(SlopTable.Ref<SlopDomainRecord> ref) throws IOException {
+            super(ref);
+
+            domainsReader = domainsColumn.open(this);
+        }
+
+        public boolean hasMore() throws IOException {
+            return domainsReader.hasRemaining();
+        }
+
+        public String next() throws IOException {
+            return domainsReader.get();
+        }
+    }
+
+    public static class DomainWithIpReader extends SlopTable {
+        private final TxtStringColumn.Reader domainsReader;
+        private final TxtStringColumn.Reader ipReader;
+
+        public DomainWithIpReader(SlopTable.Ref<SlopDomainRecord> ref) throws IOException {
+            super(ref);
+
+            domainsReader = domainsColumn.open(this);
+            ipReader = ipColumn.open(this);
+        }
+
+        public DomainWithIpReader(Path baseDir, int page) throws IOException {
+            this(new Ref<>(baseDir, page));
+        }
+
+        public boolean hasMore() throws IOException {
+            return domainsReader.hasRemaining();
+        }
+
+        public DomainWithIpProjection next() throws IOException {
+
+            return new DomainWithIpProjection(
+                    domainsReader.get(),
+                    ipReader.get()
+            );
+        }
+    }
+
+    public static class Reader extends SlopTable {
+        private final TxtStringColumn.Reader domainsReader;
+        private final EnumColumn.Reader statesReader;
+        private final TxtStringColumn.Reader redirectReader;
+        private final TxtStringColumn.Reader ipReader;
+
+        private final IntColumn.Reader knownUrlsReader;
+        private final IntColumn.Reader goodUrlsReader;
+        private final IntColumn.Reader visitedUrlsReader;
+
+        private final ObjectArrayColumn<String>.Reader rssFeedsReader;
+
+        public Reader(SlopTable.Ref<SlopDomainRecord> ref) throws IOException {
+            super(ref);
+
+            domainsReader = domainsColumn.open(this);
+            statesReader = statesColumn.open(this);
+            redirectReader = redirectDomainsColumn.open(this);
+            ipReader = ipColumn.open(this);
+
+            knownUrlsReader = knownUrlsColumn.open(this);
+            goodUrlsReader = goodUrlsColumn.open(this);
+            visitedUrlsReader = visitedUrlsColumn.open(this);
+
+            rssFeedsReader = rssFeedsColumn.open(this);
+        }
+
+        public Reader(Path baseDir, int page) throws IOException {
+            this(new Ref<>(baseDir, page));
+        }
+
+        public boolean hasMore() throws IOException {
+            return domainsReader.hasRemaining();
+        }
+
+        public void forEach(Consumer<SlopDomainRecord> recordConsumer) throws IOException {
+            while (hasMore()) {
+                recordConsumer.accept(next());
+            }
+        }
+
+        public SlopDomainRecord next() throws IOException {
+            return new SlopDomainRecord(
+                    domainsReader.get(),
+                    knownUrlsReader.get(),
+                    goodUrlsReader.get(),
+                    visitedUrlsReader.get(),
+                    statesReader.get(),
+                    redirectReader.get(),
+                    ipReader.get(),
+                    rssFeedsReader.get()
+            );
+        }
+    }
+
+    public static class Writer extends SlopTable {
+        private final TxtStringColumn.Writer domainsWriter;
+        private final EnumColumn.Writer statesWriter;
+        private final TxtStringColumn.Writer redirectWriter;
+        private final TxtStringColumn.Writer ipWriter;
+
+        private final IntColumn.Writer knownUrlsWriter;
+        private final IntColumn.Writer goodUrlsWriter;
+        private final IntColumn.Writer visitedUrlsWriter;
+
+        private final ObjectArrayColumn<String>.Writer rssFeedsWriter;
+
+        public Writer(Path baseDir, int page) throws IOException {
+            super(baseDir, page);
+
+            domainsWriter = domainsColumn.create(this);
+            statesWriter = statesColumn.create(this);
+            redirectWriter = redirectDomainsColumn.create(this);
+            ipWriter = ipColumn.create(this);
+
+            knownUrlsWriter = knownUrlsColumn.create(this);
+            goodUrlsWriter = goodUrlsColumn.create(this);
+            visitedUrlsWriter = visitedUrlsColumn.create(this);
+
+            rssFeedsWriter = rssFeedsColumn.create(this);
+        }
+
+        public void write(SlopDomainRecord record) throws IOException {
+            domainsWriter.put(record.domain());
+            statesWriter.put(record.state());
+            redirectWriter.put(record.redirectDomain());
+            ipWriter.put(record.ip());
+
+            knownUrlsWriter.put(record.knownUrls());
+            goodUrlsWriter.put(record.goodUrls());
+            visitedUrlsWriter.put(record.visitedUrls());
+
+            rssFeedsWriter.put(record.rssFeeds());
+        }
+    }
+}
diff --git a/code/process-models/work-log/java/nu/marginalia/worklog/BatchingWorkLog.java b/code/processes/converting-process/model/java/nu/marginalia/worklog/BatchingWorkLog.java
similarity index 100%
rename from code/process-models/work-log/java/nu/marginalia/worklog/BatchingWorkLog.java
rename to code/processes/converting-process/model/java/nu/marginalia/worklog/BatchingWorkLog.java
diff --git a/code/process-models/work-log/java/nu/marginalia/worklog/BatchingWorkLogImpl.java b/code/processes/converting-process/model/java/nu/marginalia/worklog/BatchingWorkLogImpl.java
similarity index 100%
rename from code/process-models/work-log/java/nu/marginalia/worklog/BatchingWorkLogImpl.java
rename to code/processes/converting-process/model/java/nu/marginalia/worklog/BatchingWorkLogImpl.java
diff --git a/code/process-models/work-log/java/nu/marginalia/worklog/BatchingWorkLogInspector.java b/code/processes/converting-process/model/java/nu/marginalia/worklog/BatchingWorkLogInspector.java
similarity index 100%
rename from code/process-models/work-log/java/nu/marginalia/worklog/BatchingWorkLogInspector.java
rename to code/processes/converting-process/model/java/nu/marginalia/worklog/BatchingWorkLogInspector.java
diff --git a/code/processes/converting-process/model/readme.md b/code/processes/converting-process/model/readme.md
new file mode 100644
index 00000000..ef2b50e4
--- /dev/null
+++ b/code/processes/converting-process/model/readme.md
@@ -0,0 +1,9 @@
+The processed-data package contains models and logic for
+reading and writing [Slop](https://github.com/MarginaliaSearch/SlopData) tables with the output from the
+[converting-process](../../converting-process).
+
+Main models:
+
+* [SlopDocumentRecord](java/nu/marginalia/model/processed/SlopDocumentRecord.java)
+* [SlopDomainLinkRecord](java/nu/marginalia/model/processed/SlopDomainLinkRecord.java)
+* [SlopDomainRecord](java/nu/marginalia/model/processed/SlopDomainRecord.java)
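For orientation, the sketch below shows the round-trip pattern the three models in the readme share, using SlopDomainLinkRecord as the smallest case. It is an illustration rather than part of this changeset: the class name, temp directory, and domain values are hypothetical, and only the record API defined earlier in this diff is assumed.

// A minimal sketch, assuming the SlopDomainLinkRecord API as defined above.
import nu.marginalia.model.processed.SlopDomainLinkRecord;

import java.nio.file.Files;
import java.nio.file.Path;

class SlopRoundTripSketch {
    public static void main(String[] args) throws Exception {
        Path tableDir = Files.createTempDirectory("slop-sketch"); // hypothetical location

        // The writer appends one value per column for each record written to page 0
        try (var writer = new SlopDomainLinkRecord.Writer(tableDir, 0)) {
            writer.write(new SlopDomainLinkRecord("www.marginalia.nu", "search.marginalia.nu"));
        }

        // The reader advances all column readers in lockstep, reassembling whole records
        try (var reader = new SlopDomainLinkRecord.Reader(tableDir, 0)) {
            while (reader.hasMore()) {
                System.out.println(reader.next()); // SlopDomainLinkRecord[source=..., dest=...]
            }
        }
    }
}

The same Writer/Reader nesting recurs in SlopDocumentRecord and SlopDomainRecord; only the column sets differ.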
diff --git a/code/processes/converting-process/model/test/nu/marginalia/model/processed/SlopDocumentRecordTest.java b/code/processes/converting-process/model/test/nu/marginalia/model/processed/SlopDocumentRecordTest.java
new file mode 100644
index 00000000..35195cc1
--- /dev/null
+++ b/code/processes/converting-process/model/test/nu/marginalia/model/processed/SlopDocumentRecordTest.java
@@ -0,0 +1,101 @@
+package nu.marginalia.model.processed;
+
+import nu.marginalia.sequence.VarintCodedSequence;
+import nu.marginalia.slop.SlopTable;
+import nu.marginalia.test.TestUtil;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+
+public class SlopDocumentRecordTest {
+    private Path testDir;
+
+    @BeforeEach
+    void setUp() throws IOException {
+        testDir = Files.createTempDirectory(getClass().getSimpleName());
+    }
+
+    @AfterEach
+    void tearDown() throws IOException {
+        TestUtil.clearTempDir(testDir);
+    }
+
+    @Test
+    public void test() throws IOException {
+        ByteBuffer workArea = ByteBuffer.allocate(1024);
+        var record = new SlopDocumentRecord("example.com", "https://example.com/foo", 1, "OK", "",
+                "test",
+                "testtest",
+                1,
+                "HTML3",
+                100,
+                0xF00BAAL,
+                0.5f,
+                0xBEEFL,
+                null,
+                List.of("test1", "test2"),
+                new byte[] { 2, 3},
+                List.of(VarintCodedSequence.generate(1, 3, 5), VarintCodedSequence.generate(2, 4, 6)),
+                new byte[] { 'a', 'b' },
+                List.of(VarintCodedSequence.generate(2, 3, 5), VarintCodedSequence.generate(3, 4, 6))
+        );
+
+        try (var writer = new SlopDocumentRecord.Writer(testDir, 0)) {
+            writer.write(record);
+        }
+
+        try (var keywordReader = new SlopDocumentRecord.KeywordsProjectionReader(new SlopTable.Ref<>(testDir, 0))) {
+            assertTrue(keywordReader.hasMore());
+            var readRecord = keywordReader.next();
+            assertFalse(keywordReader.hasMore());
+
+            var expected = new SlopDocumentRecord.KeywordsProjection(
+                    record.domain(),
+                    record.ordinal(),
+                    record.htmlFeatures(),
+                    record.documentMetadata(),
+                    record.length(),
+                    record.words(),
+                    record.metas(),
+                    record.positions(),
+                    record.spanCodes(),
+                    record.spans()
+            );
+
+            Assertions.assertEquals(expected, readRecord);
+        }
+
+        try (var docDataReader = new SlopDocumentRecord.MetadataReader(testDir, 0)) {
+            assertTrue(docDataReader.hasMore());
+            var readRecord = docDataReader.next();
+            assertFalse(docDataReader.hasMore());
+
+            var expected2 = new SlopDocumentRecord.MetadataProjection(
+                    record.domain(),
+                    record.url(),
+                    record.ordinal(),
+                    record.title(),
+                    record.description(),
+                    record.htmlFeatures(),
+                    record.htmlStandard(),
+                    record.length(),
+                    record.hash(),
+                    record.quality(),
+                    record.pubYear()
+            );
+
+            Assertions.assertEquals(expected2, readRecord);
+        }
+    }
+}
diff --git a/code/processes/converting-process/model/test/nu/marginalia/model/processed/SlopDomainLinkRecordTest.java b/code/processes/converting-process/model/test/nu/marginalia/model/processed/SlopDomainLinkRecordTest.java
new file mode 100644
index 00000000..19979f71
--- /dev/null
+++ b/code/processes/converting-process/model/test/nu/marginalia/model/processed/SlopDomainLinkRecordTest.java
@@ -0,0 +1,43 @@
+package nu.marginalia.model.processed;
+
+import nu.marginalia.test.TestUtil;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class SlopDomainLinkRecordTest {
+    private Path testDir;
+
+    @BeforeEach
+    void setUp() throws IOException {
+        testDir = Files.createTempDirectory(getClass().getSimpleName());
+    }
+
+    @AfterEach
+    void tearDown() {
+        TestUtil.clearTempDir(testDir);
+    }
+
+    @Test
+    public void test() throws IOException {
+        var record = new SlopDomainLinkRecord("source", "dest");
+
+        try (var writer = new SlopDomainLinkRecord.Writer(testDir, 0)) {
+            writer.write(record);
+        }
+
+        try (var reader = new SlopDomainLinkRecord.Reader(testDir, 0)) {
+            assertTrue(reader.hasMore());
+            var readRecord = reader.next();
+            assertFalse(reader.hasMore());
+
+            assertEquals(record, readRecord);
+        }
+    }
+}
\ No newline at end of file
diff --git a/code/processes/converting-process/model/test/nu/marginalia/model/processed/SlopDomainRecordTest.java b/code/processes/converting-process/model/test/nu/marginalia/model/processed/SlopDomainRecordTest.java
new file mode 100644
index 00000000..f4d7e0f0
--- /dev/null
+++ b/code/processes/converting-process/model/test/nu/marginalia/model/processed/SlopDomainRecordTest.java
@@ -0,0 +1,70 @@
+package nu.marginalia.model.processed;
+
+import nu.marginalia.test.TestUtil;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class SlopDomainRecordTest {
+
+    private Path testDir;
+
+    @BeforeEach
+    void setUp() throws IOException {
+        testDir = Files.createTempDirectory(getClass().getSimpleName());
+    }
+
+    @AfterEach
+    void tearDown() throws IOException {
+        TestUtil.clearTempDir(testDir);
+    }
+
+    @Test
+    public void testWriteRead() throws IOException {
+        var record = new SlopDomainRecord(
+                "domain",
+                1, 2, 3,
+                "state",
+                "redirectDomain",
+                "192.168.0.1",
List.of("rss1", "rss2") + ); + + try (var writer = new SlopDomainRecord.Writer(testDir, 0)) { + writer.write(record); + } + + try (var reader = new SlopDomainRecord.Reader(testDir, 0)) { + assertTrue(reader.hasMore()); + var readRecord = reader.next(); + assertFalse(reader.hasMore()); + + Assertions.assertEquals(record, readRecord); + } + + try (var dwrReader = new SlopDomainRecord.DomainWithIpReader(testDir, 0)) { + assertTrue(dwrReader.hasMore()); + var readRecord = dwrReader.next(); + assertFalse(dwrReader.hasMore()); + + Assertions.assertEquals(new SlopDomainRecord.DomainWithIpProjection("domain", "192.168.0.1"), readRecord); + } + + try (var dnReader = new SlopDomainRecord.DomainNameReader(testDir, 0)) { + assertTrue(dnReader.hasMore()); + var readRecord = dnReader.next(); + assertFalse(dnReader.hasMore()); + + Assertions.assertEquals("domain", readRecord); + } + } +} \ No newline at end of file diff --git a/code/process-models/work-log/test/nu/marginalia/worklog/BatchingWorkLogImplTest.java b/code/processes/converting-process/model/test/nu/marginalia/worklog/BatchingWorkLogImplTest.java similarity index 100% rename from code/process-models/work-log/test/nu/marginalia/worklog/BatchingWorkLogImplTest.java rename to code/processes/converting-process/model/test/nu/marginalia/worklog/BatchingWorkLogImplTest.java diff --git a/code/features-convert/reddit-json/resources/db/reddit.sql b/code/processes/converting-process/resources/db/reddit.sql similarity index 100% rename from code/features-convert/reddit-json/resources/db/reddit.sql rename to code/processes/converting-process/resources/db/reddit.sql diff --git a/code/features-convert/stackexchange-xml/resources/db/stackexchange.sql b/code/processes/converting-process/resources/db/stackexchange.sql similarity index 100% rename from code/features-convert/stackexchange-xml/resources/db/stackexchange.sql rename to code/processes/converting-process/resources/db/stackexchange.sql diff --git a/code/features-convert/summary-extraction/test-resources/html/monadnock.html b/code/processes/converting-process/test-resources/html/monadnock.html similarity index 100% rename from code/features-convert/summary-extraction/test-resources/html/monadnock.html rename to code/processes/converting-process/test-resources/html/monadnock.html diff --git a/code/features-convert/summary-extraction/test-resources/html/readme.md b/code/processes/converting-process/test-resources/html/readme.md similarity index 100% rename from code/features-convert/summary-extraction/test-resources/html/readme.md rename to code/processes/converting-process/test-resources/html/readme.md diff --git a/code/features-convert/summary-extraction/test-resources/html/summarization/187.shtml b/code/processes/converting-process/test-resources/html/summarization/187.shtml similarity index 100% rename from code/features-convert/summary-extraction/test-resources/html/summarization/187.shtml rename to code/processes/converting-process/test-resources/html/summarization/187.shtml diff --git a/code/features-convert/summary-extraction/test-resources/html/summarization/surrey.html b/code/processes/converting-process/test-resources/html/summarization/surrey.html similarity index 100% rename from code/features-convert/summary-extraction/test-resources/html/summarization/surrey.html rename to code/processes/converting-process/test-resources/html/summarization/surrey.html diff --git a/code/features-convert/summary-extraction/test-resources/html/summarization/surrey.html.1 
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/summarization/surrey.html.1
rename to code/processes/converting-process/test-resources/html/summarization/surrey.html.1
diff --git a/code/features-convert/summary-extraction/test-resources/html/theregister.html b/code/processes/converting-process/test-resources/html/theregister.html
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/theregister.html
rename to code/processes/converting-process/test-resources/html/theregister.html
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/index b/code/processes/converting-process/test-resources/html/work-set/index
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/index
rename to code/processes/converting-process/test-resources/html/work-set/index
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1021546012 b/code/processes/converting-process/test-resources/html/work-set/url--1021546012
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1021546012
rename to code/processes/converting-process/test-resources/html/work-set/url--1021546012
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1028592943 b/code/processes/converting-process/test-resources/html/work-set/url--1028592943
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1028592943
rename to code/processes/converting-process/test-resources/html/work-set/url--1028592943
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1081293162 b/code/processes/converting-process/test-resources/html/work-set/url--1081293162
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1081293162
rename to code/processes/converting-process/test-resources/html/work-set/url--1081293162
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1105046394 b/code/processes/converting-process/test-resources/html/work-set/url--1105046394
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1105046394
rename to code/processes/converting-process/test-resources/html/work-set/url--1105046394
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1146923296 b/code/processes/converting-process/test-resources/html/work-set/url--1146923296
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1146923296
rename to code/processes/converting-process/test-resources/html/work-set/url--1146923296
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1194694074 b/code/processes/converting-process/test-resources/html/work-set/url--1194694074
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1194694074
rename to code/processes/converting-process/test-resources/html/work-set/url--1194694074
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1207898281 b/code/processes/converting-process/test-resources/html/work-set/url--1207898281
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1207898281
rename to code/processes/converting-process/test-resources/html/work-set/url--1207898281
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1268145073 b/code/processes/converting-process/test-resources/html/work-set/url--1268145073
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1268145073
rename to code/processes/converting-process/test-resources/html/work-set/url--1268145073
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1294876331 b/code/processes/converting-process/test-resources/html/work-set/url--1294876331
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1294876331
rename to code/processes/converting-process/test-resources/html/work-set/url--1294876331
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1314767420 b/code/processes/converting-process/test-resources/html/work-set/url--1314767420
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1314767420
rename to code/processes/converting-process/test-resources/html/work-set/url--1314767420
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1316269786 b/code/processes/converting-process/test-resources/html/work-set/url--1316269786
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1316269786
rename to code/processes/converting-process/test-resources/html/work-set/url--1316269786
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1316766580 b/code/processes/converting-process/test-resources/html/work-set/url--1316766580
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1316766580
rename to code/processes/converting-process/test-resources/html/work-set/url--1316766580
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1319968043 b/code/processes/converting-process/test-resources/html/work-set/url--1319968043
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1319968043
rename to code/processes/converting-process/test-resources/html/work-set/url--1319968043
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1338576987 b/code/processes/converting-process/test-resources/html/work-set/url--1338576987
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1338576987
rename to code/processes/converting-process/test-resources/html/work-set/url--1338576987
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1341909571 b/code/processes/converting-process/test-resources/html/work-set/url--1341909571
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1341909571
rename to code/processes/converting-process/test-resources/html/work-set/url--1341909571
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1369578579 b/code/processes/converting-process/test-resources/html/work-set/url--1369578579
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1369578579
rename to code/processes/converting-process/test-resources/html/work-set/url--1369578579
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1437315645 b/code/processes/converting-process/test-resources/html/work-set/url--1437315645
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1437315645
rename to code/processes/converting-process/test-resources/html/work-set/url--1437315645
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1458954960 b/code/processes/converting-process/test-resources/html/work-set/url--1458954960
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1458954960
rename to code/processes/converting-process/test-resources/html/work-set/url--1458954960
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1475681345 b/code/processes/converting-process/test-resources/html/work-set/url--1475681345
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1475681345
rename to code/processes/converting-process/test-resources/html/work-set/url--1475681345
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1498328446 b/code/processes/converting-process/test-resources/html/work-set/url--1498328446
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1498328446
rename to code/processes/converting-process/test-resources/html/work-set/url--1498328446
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1507779664 b/code/processes/converting-process/test-resources/html/work-set/url--1507779664
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1507779664
rename to code/processes/converting-process/test-resources/html/work-set/url--1507779664
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1540303379 b/code/processes/converting-process/test-resources/html/work-set/url--1540303379
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1540303379
rename to code/processes/converting-process/test-resources/html/work-set/url--1540303379
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--154898476 b/code/processes/converting-process/test-resources/html/work-set/url--154898476
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--154898476
rename to code/processes/converting-process/test-resources/html/work-set/url--154898476
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1552059399 b/code/processes/converting-process/test-resources/html/work-set/url--1552059399
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1552059399
rename to code/processes/converting-process/test-resources/html/work-set/url--1552059399
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1557688340 b/code/processes/converting-process/test-resources/html/work-set/url--1557688340
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1557688340
rename to code/processes/converting-process/test-resources/html/work-set/url--1557688340
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1584145751 b/code/processes/converting-process/test-resources/html/work-set/url--1584145751
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1584145751
rename to code/processes/converting-process/test-resources/html/work-set/url--1584145751
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1605151204 b/code/processes/converting-process/test-resources/html/work-set/url--1605151204
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1605151204
rename to code/processes/converting-process/test-resources/html/work-set/url--1605151204
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--162269247 b/code/processes/converting-process/test-resources/html/work-set/url--162269247
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--162269247
rename to code/processes/converting-process/test-resources/html/work-set/url--162269247
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1624294488 b/code/processes/converting-process/test-resources/html/work-set/url--1624294488
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1624294488
rename to code/processes/converting-process/test-resources/html/work-set/url--1624294488
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--164108285 b/code/processes/converting-process/test-resources/html/work-set/url--164108285
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--164108285
rename to code/processes/converting-process/test-resources/html/work-set/url--164108285
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1645688243 b/code/processes/converting-process/test-resources/html/work-set/url--1645688243
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1645688243
rename to code/processes/converting-process/test-resources/html/work-set/url--1645688243
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1658004609 b/code/processes/converting-process/test-resources/html/work-set/url--1658004609
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1658004609
rename to code/processes/converting-process/test-resources/html/work-set/url--1658004609
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1658558834 b/code/processes/converting-process/test-resources/html/work-set/url--1658558834
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1658558834
rename to code/processes/converting-process/test-resources/html/work-set/url--1658558834
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1698664879 b/code/processes/converting-process/test-resources/html/work-set/url--1698664879
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1698664879
rename to code/processes/converting-process/test-resources/html/work-set/url--1698664879
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--169975195 b/code/processes/converting-process/test-resources/html/work-set/url--169975195
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--169975195
rename to code/processes/converting-process/test-resources/html/work-set/url--169975195
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1701203332 b/code/processes/converting-process/test-resources/html/work-set/url--1701203332
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1701203332
rename to code/processes/converting-process/test-resources/html/work-set/url--1701203332
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--17281998 b/code/processes/converting-process/test-resources/html/work-set/url--17281998
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--17281998
rename to code/processes/converting-process/test-resources/html/work-set/url--17281998
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1742070028 b/code/processes/converting-process/test-resources/html/work-set/url--1742070028
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1742070028
rename to code/processes/converting-process/test-resources/html/work-set/url--1742070028
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1745376814 b/code/processes/converting-process/test-resources/html/work-set/url--1745376814
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1745376814
rename to code/processes/converting-process/test-resources/html/work-set/url--1745376814
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1749889035 b/code/processes/converting-process/test-resources/html/work-set/url--1749889035
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1749889035
rename to code/processes/converting-process/test-resources/html/work-set/url--1749889035
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--176177364 b/code/processes/converting-process/test-resources/html/work-set/url--176177364
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--176177364
rename to code/processes/converting-process/test-resources/html/work-set/url--176177364
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--177014197 b/code/processes/converting-process/test-resources/html/work-set/url--177014197
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--177014197
rename to code/processes/converting-process/test-resources/html/work-set/url--177014197
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1794527707 b/code/processes/converting-process/test-resources/html/work-set/url--1794527707
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1794527707
rename to code/processes/converting-process/test-resources/html/work-set/url--1794527707
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1797740201 b/code/processes/converting-process/test-resources/html/work-set/url--1797740201
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1797740201
rename to code/processes/converting-process/test-resources/html/work-set/url--1797740201
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1799098579 b/code/processes/converting-process/test-resources/html/work-set/url--1799098579
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1799098579
rename to code/processes/converting-process/test-resources/html/work-set/url--1799098579
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1959637826 b/code/processes/converting-process/test-resources/html/work-set/url--1959637826
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1959637826
rename to code/processes/converting-process/test-resources/html/work-set/url--1959637826
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1971916964 b/code/processes/converting-process/test-resources/html/work-set/url--1971916964
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1971916964
rename to code/processes/converting-process/test-resources/html/work-set/url--1971916964
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--1985840368 b/code/processes/converting-process/test-resources/html/work-set/url--1985840368
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--1985840368
rename to code/processes/converting-process/test-resources/html/work-set/url--1985840368
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--2012610859 b/code/processes/converting-process/test-resources/html/work-set/url--2012610859
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--2012610859
rename to code/processes/converting-process/test-resources/html/work-set/url--2012610859
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--202178680 b/code/processes/converting-process/test-resources/html/work-set/url--202178680
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--202178680
rename to code/processes/converting-process/test-resources/html/work-set/url--202178680
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--2043528727 b/code/processes/converting-process/test-resources/html/work-set/url--2043528727
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--2043528727
rename to code/processes/converting-process/test-resources/html/work-set/url--2043528727
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--2081757477 b/code/processes/converting-process/test-resources/html/work-set/url--2081757477
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--2081757477
rename to code/processes/converting-process/test-resources/html/work-set/url--2081757477
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--2103982576 b/code/processes/converting-process/test-resources/html/work-set/url--2103982576
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--2103982576
rename to code/processes/converting-process/test-resources/html/work-set/url--2103982576
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--2111558769 b/code/processes/converting-process/test-resources/html/work-set/url--2111558769
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--2111558769
rename to code/processes/converting-process/test-resources/html/work-set/url--2111558769
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--213168798 b/code/processes/converting-process/test-resources/html/work-set/url--213168798
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--213168798
rename to code/processes/converting-process/test-resources/html/work-set/url--213168798
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--232544032 b/code/processes/converting-process/test-resources/html/work-set/url--232544032
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--232544032
rename to code/processes/converting-process/test-resources/html/work-set/url--232544032
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--253010011 b/code/processes/converting-process/test-resources/html/work-set/url--253010011
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--253010011
rename to code/processes/converting-process/test-resources/html/work-set/url--253010011
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--274250994 b/code/processes/converting-process/test-resources/html/work-set/url--274250994
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--274250994
rename to code/processes/converting-process/test-resources/html/work-set/url--274250994
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--332442790 b/code/processes/converting-process/test-resources/html/work-set/url--332442790
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--332442790
rename to code/processes/converting-process/test-resources/html/work-set/url--332442790
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--353437903 b/code/processes/converting-process/test-resources/html/work-set/url--353437903
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--353437903
rename to code/processes/converting-process/test-resources/html/work-set/url--353437903
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--364546777 b/code/processes/converting-process/test-resources/html/work-set/url--364546777
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--364546777
rename to code/processes/converting-process/test-resources/html/work-set/url--364546777
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--379129416 b/code/processes/converting-process/test-resources/html/work-set/url--379129416
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--379129416
rename to code/processes/converting-process/test-resources/html/work-set/url--379129416
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--399428149 b/code/processes/converting-process/test-resources/html/work-set/url--399428149
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--399428149
rename to code/processes/converting-process/test-resources/html/work-set/url--399428149
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--425233170 b/code/processes/converting-process/test-resources/html/work-set/url--425233170
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--425233170
rename to code/processes/converting-process/test-resources/html/work-set/url--425233170
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--434612307 b/code/processes/converting-process/test-resources/html/work-set/url--434612307
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--434612307
rename to code/processes/converting-process/test-resources/html/work-set/url--434612307
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--439772328 b/code/processes/converting-process/test-resources/html/work-set/url--439772328
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--439772328
rename to code/processes/converting-process/test-resources/html/work-set/url--439772328
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--458002611 b/code/processes/converting-process/test-resources/html/work-set/url--458002611
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--458002611
rename to code/processes/converting-process/test-resources/html/work-set/url--458002611
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--506010305 b/code/processes/converting-process/test-resources/html/work-set/url--506010305
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--506010305
rename to code/processes/converting-process/test-resources/html/work-set/url--506010305
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--546773534 b/code/processes/converting-process/test-resources/html/work-set/url--546773534
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--546773534
rename to code/processes/converting-process/test-resources/html/work-set/url--546773534
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--551288516 b/code/processes/converting-process/test-resources/html/work-set/url--551288516
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--551288516
rename to code/processes/converting-process/test-resources/html/work-set/url--551288516
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--602577763 b/code/processes/converting-process/test-resources/html/work-set/url--602577763
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--602577763
rename to code/processes/converting-process/test-resources/html/work-set/url--602577763
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--611668054 b/code/processes/converting-process/test-resources/html/work-set/url--611668054
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--611668054
rename to code/processes/converting-process/test-resources/html/work-set/url--611668054
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--634771245 b/code/processes/converting-process/test-resources/html/work-set/url--634771245
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--634771245
rename to code/processes/converting-process/test-resources/html/work-set/url--634771245
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--639320493 b/code/processes/converting-process/test-resources/html/work-set/url--639320493
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--639320493
rename to code/processes/converting-process/test-resources/html/work-set/url--639320493
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--643179018 b/code/processes/converting-process/test-resources/html/work-set/url--643179018
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--643179018
rename to code/processes/converting-process/test-resources/html/work-set/url--643179018
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--663772351 b/code/processes/converting-process/test-resources/html/work-set/url--663772351
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--663772351
rename to code/processes/converting-process/test-resources/html/work-set/url--663772351
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--670789152 b/code/processes/converting-process/test-resources/html/work-set/url--670789152
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--670789152
rename to code/processes/converting-process/test-resources/html/work-set/url--670789152
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--6797317 b/code/processes/converting-process/test-resources/html/work-set/url--6797317
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--6797317
rename to code/processes/converting-process/test-resources/html/work-set/url--6797317
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--700978490 b/code/processes/converting-process/test-resources/html/work-set/url--700978490
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--700978490
rename to code/processes/converting-process/test-resources/html/work-set/url--700978490
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--708035332 b/code/processes/converting-process/test-resources/html/work-set/url--708035332
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--708035332
rename to code/processes/converting-process/test-resources/html/work-set/url--708035332
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--804917062 b/code/processes/converting-process/test-resources/html/work-set/url--804917062
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--804917062
rename to code/processes/converting-process/test-resources/html/work-set/url--804917062
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--819771302 b/code/processes/converting-process/test-resources/html/work-set/url--819771302
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--819771302
rename to code/processes/converting-process/test-resources/html/work-set/url--819771302
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--840796372 b/code/processes/converting-process/test-resources/html/work-set/url--840796372
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--840796372
rename to code/processes/converting-process/test-resources/html/work-set/url--840796372
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--841445362 b/code/processes/converting-process/test-resources/html/work-set/url--841445362
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--841445362
rename to code/processes/converting-process/test-resources/html/work-set/url--841445362
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--862385354 b/code/processes/converting-process/test-resources/html/work-set/url--862385354
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--862385354
rename to code/processes/converting-process/test-resources/html/work-set/url--862385354
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--879796466 b/code/processes/converting-process/test-resources/html/work-set/url--879796466
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--879796466
rename to code/processes/converting-process/test-resources/html/work-set/url--879796466
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--89134993 b/code/processes/converting-process/test-resources/html/work-set/url--89134993
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--89134993
rename to code/processes/converting-process/test-resources/html/work-set/url--89134993
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--905197876 b/code/processes/converting-process/test-resources/html/work-set/url--905197876
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--905197876
rename to code/processes/converting-process/test-resources/html/work-set/url--905197876
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--920328354 b/code/processes/converting-process/test-resources/html/work-set/url--920328354
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--920328354
rename to code/processes/converting-process/test-resources/html/work-set/url--920328354
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--952827759 b/code/processes/converting-process/test-resources/html/work-set/url--952827759
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--952827759
rename to code/processes/converting-process/test-resources/html/work-set/url--952827759
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--964018507 b/code/processes/converting-process/test-resources/html/work-set/url--964018507
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--964018507
rename to code/processes/converting-process/test-resources/html/work-set/url--964018507
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url--972614909 b/code/processes/converting-process/test-resources/html/work-set/url--972614909
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url--972614909
rename to code/processes/converting-process/test-resources/html/work-set/url--972614909
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-10088520 b/code/processes/converting-process/test-resources/html/work-set/url-10088520
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-10088520
rename to code/processes/converting-process/test-resources/html/work-set/url-10088520
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1013281103 b/code/processes/converting-process/test-resources/html/work-set/url-1013281103
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1013281103
rename to code/processes/converting-process/test-resources/html/work-set/url-1013281103
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1019241851 b/code/processes/converting-process/test-resources/html/work-set/url-1019241851
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1019241851
rename to code/processes/converting-process/test-resources/html/work-set/url-1019241851
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1059944953 b/code/processes/converting-process/test-resources/html/work-set/url-1059944953
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1059944953
rename to code/processes/converting-process/test-resources/html/work-set/url-1059944953
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1118681302 b/code/processes/converting-process/test-resources/html/work-set/url-1118681302
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1118681302
rename to code/processes/converting-process/test-resources/html/work-set/url-1118681302
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1179298706 b/code/processes/converting-process/test-resources/html/work-set/url-1179298706
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1179298706
rename to code/processes/converting-process/test-resources/html/work-set/url-1179298706
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1191749784 b/code/processes/converting-process/test-resources/html/work-set/url-1191749784
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1191749784
rename to code/processes/converting-process/test-resources/html/work-set/url-1191749784
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1207094790 b/code/processes/converting-process/test-resources/html/work-set/url-1207094790
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1207094790
rename to code/processes/converting-process/test-resources/html/work-set/url-1207094790
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1213989666 b/code/processes/converting-process/test-resources/html/work-set/url-1213989666
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1213989666
rename to code/processes/converting-process/test-resources/html/work-set/url-1213989666
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1222442301 b/code/processes/converting-process/test-resources/html/work-set/url-1222442301
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1222442301
rename to code/processes/converting-process/test-resources/html/work-set/url-1222442301
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-130332455 b/code/processes/converting-process/test-resources/html/work-set/url-130332455
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-130332455
rename to code/processes/converting-process/test-resources/html/work-set/url-130332455
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1311055461 b/code/processes/converting-process/test-resources/html/work-set/url-1311055461
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1311055461
rename to code/processes/converting-process/test-resources/html/work-set/url-1311055461
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1391842722 b/code/processes/converting-process/test-resources/html/work-set/url-1391842722
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1391842722
rename to code/processes/converting-process/test-resources/html/work-set/url-1391842722
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1457388763 b/code/processes/converting-process/test-resources/html/work-set/url-1457388763
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1457388763
rename to code/processes/converting-process/test-resources/html/work-set/url-1457388763
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1506356272 b/code/processes/converting-process/test-resources/html/work-set/url-1506356272
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1506356272
rename to code/processes/converting-process/test-resources/html/work-set/url-1506356272
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1511762169 b/code/processes/converting-process/test-resources/html/work-set/url-1511762169
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1511762169
rename to code/processes/converting-process/test-resources/html/work-set/url-1511762169
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1534640058 b/code/processes/converting-process/test-resources/html/work-set/url-1534640058
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1534640058
rename to code/processes/converting-process/test-resources/html/work-set/url-1534640058
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1551513871 b/code/processes/converting-process/test-resources/html/work-set/url-1551513871
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1551513871
rename to code/processes/converting-process/test-resources/html/work-set/url-1551513871
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1567632447 b/code/processes/converting-process/test-resources/html/work-set/url-1567632447
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1567632447
rename to code/processes/converting-process/test-resources/html/work-set/url-1567632447
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1623049502 b/code/processes/converting-process/test-resources/html/work-set/url-1623049502
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1623049502
rename to code/processes/converting-process/test-resources/html/work-set/url-1623049502
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-163919330 b/code/processes/converting-process/test-resources/html/work-set/url-163919330
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-163919330
rename to code/processes/converting-process/test-resources/html/work-set/url-163919330
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1661398327 b/code/processes/converting-process/test-resources/html/work-set/url-1661398327
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1661398327
rename to code/processes/converting-process/test-resources/html/work-set/url-1661398327
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1724309925 b/code/processes/converting-process/test-resources/html/work-set/url-1724309925
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1724309925
rename to code/processes/converting-process/test-resources/html/work-set/url-1724309925
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1736807128 b/code/processes/converting-process/test-resources/html/work-set/url-1736807128
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1736807128
rename to code/processes/converting-process/test-resources/html/work-set/url-1736807128
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1739031345 b/code/processes/converting-process/test-resources/html/work-set/url-1739031345
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1739031345
rename to code/processes/converting-process/test-resources/html/work-set/url-1739031345
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1755745765 b/code/processes/converting-process/test-resources/html/work-set/url-1755745765
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1755745765
rename to code/processes/converting-process/test-resources/html/work-set/url-1755745765
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1802811100 b/code/processes/converting-process/test-resources/html/work-set/url-1802811100
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1802811100
rename to code/processes/converting-process/test-resources/html/work-set/url-1802811100
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1805364707 b/code/processes/converting-process/test-resources/html/work-set/url-1805364707
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1805364707
rename to code/processes/converting-process/test-resources/html/work-set/url-1805364707
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1832702370 b/code/processes/converting-process/test-resources/html/work-set/url-1832702370
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1832702370
rename to code/processes/converting-process/test-resources/html/work-set/url-1832702370
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1853114311 b/code/processes/converting-process/test-resources/html/work-set/url-1853114311
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1853114311
rename to code/processes/converting-process/test-resources/html/work-set/url-1853114311
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1924872844 b/code/processes/converting-process/test-resources/html/work-set/url-1924872844
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1924872844
rename to code/processes/converting-process/test-resources/html/work-set/url-1924872844
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-197772804 b/code/processes/converting-process/test-resources/html/work-set/url-197772804
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-197772804
rename to code/processes/converting-process/test-resources/html/work-set/url-197772804
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1984259912 b/code/processes/converting-process/test-resources/html/work-set/url-1984259912
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1984259912
rename to code/processes/converting-process/test-resources/html/work-set/url-1984259912
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-1990903988 b/code/processes/converting-process/test-resources/html/work-set/url-1990903988
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-1990903988
rename to code/processes/converting-process/test-resources/html/work-set/url-1990903988
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-2039310951 b/code/processes/converting-process/test-resources/html/work-set/url-2039310951
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-2039310951
rename to code/processes/converting-process/test-resources/html/work-set/url-2039310951
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-2040857056 b/code/processes/converting-process/test-resources/html/work-set/url-2040857056
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-2040857056
rename to code/processes/converting-process/test-resources/html/work-set/url-2040857056
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-2052613093 b/code/processes/converting-process/test-resources/html/work-set/url-2052613093
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-2052613093
rename to code/processes/converting-process/test-resources/html/work-set/url-2052613093
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-2063899866 b/code/processes/converting-process/test-resources/html/work-set/url-2063899866
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-2063899866
rename to code/processes/converting-process/test-resources/html/work-set/url-2063899866
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-2115548255 b/code/processes/converting-process/test-resources/html/work-set/url-2115548255
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-2115548255
rename to code/processes/converting-process/test-resources/html/work-set/url-2115548255
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-2127148436 b/code/processes/converting-process/test-resources/html/work-set/url-2127148436
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-2127148436
rename to code/processes/converting-process/test-resources/html/work-set/url-2127148436
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-2133781904 b/code/processes/converting-process/test-resources/html/work-set/url-2133781904
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-2133781904
rename to code/processes/converting-process/test-resources/html/work-set/url-2133781904
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-225690385 b/code/processes/converting-process/test-resources/html/work-set/url-225690385
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-225690385
rename to code/processes/converting-process/test-resources/html/work-set/url-225690385
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-226401955 b/code/processes/converting-process/test-resources/html/work-set/url-226401955
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-226401955
rename to code/processes/converting-process/test-resources/html/work-set/url-226401955
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-262970770 b/code/processes/converting-process/test-resources/html/work-set/url-262970770
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-262970770
rename to code/processes/converting-process/test-resources/html/work-set/url-262970770
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-30106798 b/code/processes/converting-process/test-resources/html/work-set/url-30106798
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-30106798
rename to code/processes/converting-process/test-resources/html/work-set/url-30106798
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-302167335 b/code/processes/converting-process/test-resources/html/work-set/url-302167335
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-302167335
rename to code/processes/converting-process/test-resources/html/work-set/url-302167335
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-327999153 b/code/processes/converting-process/test-resources/html/work-set/url-327999153
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-327999153
rename to code/processes/converting-process/test-resources/html/work-set/url-327999153
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-332568225 b/code/processes/converting-process/test-resources/html/work-set/url-332568225
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-332568225
rename to code/processes/converting-process/test-resources/html/work-set/url-332568225
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-343223418 b/code/processes/converting-process/test-resources/html/work-set/url-343223418
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-343223418
rename to code/processes/converting-process/test-resources/html/work-set/url-343223418
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-383103932 b/code/processes/converting-process/test-resources/html/work-set/url-383103932
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-383103932
rename to code/processes/converting-process/test-resources/html/work-set/url-383103932
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-412929678 b/code/processes/converting-process/test-resources/html/work-set/url-412929678
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-412929678
rename to code/processes/converting-process/test-resources/html/work-set/url-412929678
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-475213997 b/code/processes/converting-process/test-resources/html/work-set/url-475213997
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-475213997
rename to code/processes/converting-process/test-resources/html/work-set/url-475213997
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-483403121 b/code/processes/converting-process/test-resources/html/work-set/url-483403121
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-483403121
rename to code/processes/converting-process/test-resources/html/work-set/url-483403121
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-488667993 b/code/processes/converting-process/test-resources/html/work-set/url-488667993
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-488667993
rename to code/processes/converting-process/test-resources/html/work-set/url-488667993
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-50815201 b/code/processes/converting-process/test-resources/html/work-set/url-50815201
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-50815201
rename to code/processes/converting-process/test-resources/html/work-set/url-50815201
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-522685905 b/code/processes/converting-process/test-resources/html/work-set/url-522685905
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-522685905
rename to code/processes/converting-process/test-resources/html/work-set/url-522685905
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-570714305 b/code/processes/converting-process/test-resources/html/work-set/url-570714305
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-570714305
rename to code/processes/converting-process/test-resources/html/work-set/url-570714305
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-58733529 b/code/processes/converting-process/test-resources/html/work-set/url-58733529
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-58733529
rename to code/processes/converting-process/test-resources/html/work-set/url-58733529
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-616518304 b/code/processes/converting-process/test-resources/html/work-set/url-616518304
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-616518304
rename to code/processes/converting-process/test-resources/html/work-set/url-616518304
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-662169426 b/code/processes/converting-process/test-resources/html/work-set/url-662169426
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-662169426
rename to code/processes/converting-process/test-resources/html/work-set/url-662169426
diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-677278788 b/code/processes/converting-process/test-resources/html/work-set/url-677278788
similarity index 100%
rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-677278788
code/processes/converting-process/test-resources/html/work-set/url-677278788 diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-690486170 b/code/processes/converting-process/test-resources/html/work-set/url-690486170 similarity index 100% rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-690486170 rename to code/processes/converting-process/test-resources/html/work-set/url-690486170 diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-709693331 b/code/processes/converting-process/test-resources/html/work-set/url-709693331 similarity index 100% rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-709693331 rename to code/processes/converting-process/test-resources/html/work-set/url-709693331 diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-734531556 b/code/processes/converting-process/test-resources/html/work-set/url-734531556 similarity index 100% rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-734531556 rename to code/processes/converting-process/test-resources/html/work-set/url-734531556 diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-767530276 b/code/processes/converting-process/test-resources/html/work-set/url-767530276 similarity index 100% rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-767530276 rename to code/processes/converting-process/test-resources/html/work-set/url-767530276 diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-783154014 b/code/processes/converting-process/test-resources/html/work-set/url-783154014 similarity index 100% rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-783154014 rename to code/processes/converting-process/test-resources/html/work-set/url-783154014 diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-796905237 b/code/processes/converting-process/test-resources/html/work-set/url-796905237 similarity index 100% rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-796905237 rename to code/processes/converting-process/test-resources/html/work-set/url-796905237 diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-800099955 b/code/processes/converting-process/test-resources/html/work-set/url-800099955 similarity index 100% rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-800099955 rename to code/processes/converting-process/test-resources/html/work-set/url-800099955 diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-804101946 b/code/processes/converting-process/test-resources/html/work-set/url-804101946 similarity index 100% rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-804101946 rename to code/processes/converting-process/test-resources/html/work-set/url-804101946 diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-830664902 b/code/processes/converting-process/test-resources/html/work-set/url-830664902 similarity index 100% rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-830664902 rename to code/processes/converting-process/test-resources/html/work-set/url-830664902 diff --git 
a/code/features-convert/summary-extraction/test-resources/html/work-set/url-876060686 b/code/processes/converting-process/test-resources/html/work-set/url-876060686 similarity index 100% rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-876060686 rename to code/processes/converting-process/test-resources/html/work-set/url-876060686 diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-892584998 b/code/processes/converting-process/test-resources/html/work-set/url-892584998 similarity index 100% rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-892584998 rename to code/processes/converting-process/test-resources/html/work-set/url-892584998 diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-942458463 b/code/processes/converting-process/test-resources/html/work-set/url-942458463 similarity index 100% rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-942458463 rename to code/processes/converting-process/test-resources/html/work-set/url-942458463 diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-952036171 b/code/processes/converting-process/test-resources/html/work-set/url-952036171 similarity index 100% rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-952036171 rename to code/processes/converting-process/test-resources/html/work-set/url-952036171 diff --git a/code/features-convert/summary-extraction/test-resources/html/work-set/url-968207276 b/code/processes/converting-process/test-resources/html/work-set/url-968207276 similarity index 100% rename from code/features-convert/summary-extraction/test-resources/html/work-set/url-968207276 rename to code/processes/converting-process/test-resources/html/work-set/url-968207276 diff --git a/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java index 61de3c38..06b839eb 100644 --- a/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java +++ b/code/processes/converting-process/test/nu/marginalia/converting/ConvertingIntegrationTest.java @@ -3,21 +3,21 @@ package nu.marginalia.converting; import com.google.inject.Guice; import com.google.inject.Injector; -import nu.marginalia.model.html.HtmlStandard; import nu.marginalia.converting.model.ProcessedDocument; import nu.marginalia.converting.processor.DomainProcessor; -import nu.marginalia.crawling.io.SerializableCrawlDataStream; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.crawling.model.SerializableCrawlData; +import nu.marginalia.io.crawldata.SerializableCrawlDataStream; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.UrlIndexingState; +import nu.marginalia.model.crawldata.CrawledDocument; +import nu.marginalia.model.crawldata.CrawledDomain; +import nu.marginalia.model.crawldata.SerializableCrawlData; +import nu.marginalia.model.html.HtmlStandard; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import java.io.*; +import java.io.IOException; import java.nio.file.Path; import java.time.LocalTime; import java.util.*; diff --git 
diff --git a/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
index 85651501..0e935276 100644
--- a/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
+++ b/code/processes/converting-process/test/nu/marginalia/converting/CrawlingThenConvertingIntegrationTest.java
@@ -12,14 +12,14 @@ import nu.marginalia.crawl.retreival.DomainProber;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
-import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream;
-import nu.marginalia.crawling.model.CrawledDocument;
-import nu.marginalia.crawling.model.CrawledDomain;
-import nu.marginalia.crawling.model.SerializableCrawlData;
-import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
+import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.crawl.DomainIndexingState;
+import nu.marginalia.model.crawldata.CrawledDocument;
+import nu.marginalia.model.crawldata.CrawledDomain;
+import nu.marginalia.model.crawldata.SerializableCrawlData;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
+import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
 import org.junit.jupiter.api.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
diff --git a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java
index 355921ea..a9b60211 100644
--- a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java
+++ b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/JavadocSpecializationTest.java
@@ -1,7 +1,7 @@
 package nu.marginalia.converting.processor.plugin.specialization;
 
 import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
-import nu.marginalia.summary.SummaryExtractor;
+import nu.marginalia.converting.processor.summary.SummaryExtractor;
 import nu.marginalia.test.CommonTestData;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.BeforeAll;
diff --git a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java
index 7aab1759..6d72bb51 100644
--- a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java
+++ b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/LemmySpecializationTest.java
@@ -1,7 +1,7 @@
 package nu.marginalia.converting.processor.plugin.specialization;
 
 import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
-import nu.marginalia.summary.SummaryExtractor;
+import nu.marginalia.converting.processor.summary.SummaryExtractor;
 import nu.marginalia.test.CommonTestData;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.BeforeAll;
diff --git a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/WikiSpecializationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/WikiSpecializationTest.java
index 1fc23148..63d43296 100644
--- a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/WikiSpecializationTest.java
+++ b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/WikiSpecializationTest.java
@@ -1,6 +1,6 @@
 package nu.marginalia.converting.processor.plugin.specialization;
 
-import nu.marginalia.summary.SummaryExtractor;
+import nu.marginalia.converting.processor.summary.SummaryExtractor;
 import nu.marginalia.test.CommonTestData;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.BeforeAll;
diff --git a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecializationTest.java b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecializationTest.java
index 40914ba8..581dea3c 100644
--- a/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecializationTest.java
+++ b/code/processes/converting-process/test/nu/marginalia/converting/processor/plugin/specialization/XenForoSpecializationTest.java
@@ -1,7 +1,7 @@
 package nu.marginalia.converting.processor.plugin.specialization;
 
 import nu.marginalia.converting.processor.logic.DocumentGeneratorExtractor;
-import nu.marginalia.summary.SummaryExtractor;
+import nu.marginalia.converting.processor.summary.SummaryExtractor;
 import nu.marginalia.test.CommonTestData;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.BeforeAll;
diff --git a/code/features-convert/pubdate/test/nu/marginalia/pubdate/PubDateSnifferTest.java b/code/processes/converting-process/test/nu/marginalia/converting/processor/pubdate/PubDateSnifferTest.java
similarity index 98%
rename from code/features-convert/pubdate/test/nu/marginalia/pubdate/PubDateSnifferTest.java
rename to code/processes/converting-process/test/nu/marginalia/converting/processor/pubdate/PubDateSnifferTest.java
index efd320e8..c0ef172c 100644
--- a/code/features-convert/pubdate/test/nu/marginalia/pubdate/PubDateSnifferTest.java
+++ b/code/processes/converting-process/test/nu/marginalia/converting/processor/pubdate/PubDateSnifferTest.java
@@ -1,9 +1,11 @@
-package nu.marginalia.pubdate;
+package nu.marginalia.converting.processor.pubdate;
 
 import nu.marginalia.WmsaHome;
+import nu.marginalia.converting.processor.pubdate.PubDateParser;
+import nu.marginalia.converting.processor.pubdate.PubDateSniffer;
+import nu.marginalia.converting.processor.pubdate.heuristic.PubDateHeuristicDOMParsingPass2;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.model.html.HtmlStandard;
-import nu.marginalia.pubdate.heuristic.PubDateHeuristicDOMParsingPass2;
 import org.jsoup.Jsoup;
 import org.junit.jupiter.api.Test;
diff --git a/code/features-convert/pubdate/test/nu/marginalia/pubdate/PubDateTest.java b/code/processes/converting-process/test/nu/marginalia/converting/processor/pubdate/PubDateTest.java
similarity index 88%
rename from code/features-convert/pubdate/test/nu/marginalia/pubdate/PubDateTest.java
rename to code/processes/converting-process/test/nu/marginalia/converting/processor/pubdate/PubDateTest.java
index 64bd1f73..a9eb5cb3 100644
--- a/code/features-convert/pubdate/test/nu/marginalia/pubdate/PubDateTest.java
+++ b/code/processes/converting-process/test/nu/marginalia/converting/processor/pubdate/PubDateTest.java
@@ -1,4 +1,4 @@
-package nu.marginalia.pubdate;
+package nu.marginalia.converting.processor.pubdate;
 
 import nu.marginalia.model.crawl.PubDate;
 import org.junit.jupiter.api.Test;
diff --git a/code/features-convert/summary-extraction/test/nu/marginalia/summary/SummaryExtractorTest.java b/code/processes/converting-process/test/nu/marginalia/converting/processor/summary/SummaryExtractorTest.java
similarity index 93%
rename from code/features-convert/summary-extraction/test/nu/marginalia/summary/SummaryExtractorTest.java
rename to code/processes/converting-process/test/nu/marginalia/converting/processor/summary/SummaryExtractorTest.java
index cabe558f..2b4dc30e 100644
--- a/code/features-convert/summary-extraction/test/nu/marginalia/summary/SummaryExtractorTest.java
+++ b/code/processes/converting-process/test/nu/marginalia/converting/processor/summary/SummaryExtractorTest.java
@@ -1,13 +1,12 @@
-package nu.marginalia.summary;
+package nu.marginalia.converting.processor.summary;
 
 import lombok.SneakyThrows;
 import nu.marginalia.WmsaHome;
+import nu.marginalia.converting.processor.summary.heuristic.*;
 import nu.marginalia.keyword.DocumentKeywordExtractor;
+import nu.marginalia.keyword.LinkTexts;
 import nu.marginalia.language.sentence.SentenceExtractor;
 import nu.marginalia.model.EdgeUrl;
-import nu.marginalia.segmentation.NgramLexicon;
-import nu.marginalia.summary.heuristic.*;
-import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.junit.jupiter.api.Assertions;
@@ -26,9 +25,7 @@ class SummaryExtractorTest {
     @BeforeEach
     public void setUp() {
-        keywordExtractor = new DocumentKeywordExtractor(
-                new TermFrequencyDict(WmsaHome.getLanguageModels()),
-                new NgramLexicon(WmsaHome.getLanguageModels()));
+        keywordExtractor = new DocumentKeywordExtractor();
         setenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels());
 
         summaryExtractor = new SummaryExtractor(255,
@@ -42,7 +39,7 @@ class SummaryExtractorTest {
     @SneakyThrows
     Set<String> getImportantWords(Document doc) {
         var dld = setenceExtractor.extractSentences(doc);
-        var keywords = keywordExtractor.extractKeywords(dld, new EdgeUrl(
+        var keywords = keywordExtractor.extractKeywords(dld, new LinkTexts(), new EdgeUrl(
                 "https://www.marginalia.nu/"
         ));
         System.out.println(keywords.importantWords);
diff --git a/code/features-convert/summary-extraction/test/nu/marginalia/summary/heuristic/HeuristicTextUtilTest.java b/code/processes/converting-process/test/nu/marginalia/converting/processor/summary/heuristic/HeuristicTextUtilTest.java
similarity index 93%
rename from code/features-convert/summary-extraction/test/nu/marginalia/summary/heuristic/HeuristicTextUtilTest.java
rename to code/processes/converting-process/test/nu/marginalia/converting/processor/summary/heuristic/HeuristicTextUtilTest.java
index 9ea11fac..38da765e 100644
--- a/code/features-convert/summary-extraction/test/nu/marginalia/summary/heuristic/HeuristicTextUtilTest.java
+++ b/code/processes/converting-process/test/nu/marginalia/converting/processor/summary/heuristic/HeuristicTextUtilTest.java
@@ -1,5 +1,6 @@
-package nu.marginalia.summary.heuristic;
+package nu.marginalia.converting.processor.summary.heuristic;
 
+import nu.marginalia.converting.processor.summary.heuristic.HeuristicTextUtil;
 import org.junit.jupiter.api.Test;
 
 import java.util.Set;
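The SummaryExtractorTest hunk above captures an API change worth calling out: DocumentKeywordExtractor no longer takes TermFrequencyDict/NgramLexicon constructor arguments, and extractKeywords() has grown a LinkTexts parameter. A minimal sketch distilled from the updated test; the classes and calls are the ones shown in the hunk, but the intermediate types (hence `var`) and the public importantWords field are inferred from the test code rather than the class definitions:

```java
import nu.marginalia.WmsaHome;
import nu.marginalia.keyword.DocumentKeywordExtractor;
import nu.marginalia.keyword.LinkTexts;
import nu.marginalia.language.sentence.SentenceExtractor;
import nu.marginalia.model.EdgeUrl;
import org.jsoup.Jsoup;

class KeywordExtractionSketch {
    public static void main(String[] args) throws Exception {
        // Per the diff, the extractor is now default-constructed; the term
        // frequency and ngram models are no longer passed in by the caller.
        var keywordExtractor = new DocumentKeywordExtractor();
        var sentenceExtractor = new SentenceExtractor(WmsaHome.getLanguageModels());

        var doc = Jsoup.parse("<html><body><p>Hello search engine world</p></body></html>");
        var dld = sentenceExtractor.extractSentences(doc);

        // extractKeywords() now takes a LinkTexts argument; the updated test
        // passes an empty one.
        var keywords = keywordExtractor.extractKeywords(dld, new LinkTexts(),
                new EdgeUrl("https://www.marginalia.nu/"));

        System.out.println(keywords.importantWords);
    }
}
```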
diff --git a/code/features-convert/reddit-json/test/nu/marginalia/integration/reddit/RedditEntryReaderTest.java b/code/processes/converting-process/test/nu/marginalia/integration/reddit/RedditEntryReaderTest.java
similarity index 100%
rename from code/features-convert/reddit-json/test/nu/marginalia/integration/reddit/RedditEntryReaderTest.java
rename to code/processes/converting-process/test/nu/marginalia/integration/reddit/RedditEntryReaderTest.java
diff --git a/code/features-convert/reddit-json/test/nu/marginalia/integration/reddit/db/RedditDbTest.java b/code/processes/converting-process/test/nu/marginalia/integration/reddit/db/RedditDbTest.java
similarity index 100%
rename from code/features-convert/reddit-json/test/nu/marginalia/integration/reddit/db/RedditDbTest.java
rename to code/processes/converting-process/test/nu/marginalia/integration/reddit/db/RedditDbTest.java
diff --git a/code/features-convert/stackexchange-xml/test/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlCommentReaderTest.java b/code/processes/converting-process/test/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlCommentReaderTest.java
similarity index 100%
rename from code/features-convert/stackexchange-xml/test/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlCommentReaderTest.java
rename to code/processes/converting-process/test/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlCommentReaderTest.java
diff --git a/code/features-convert/stackexchange-xml/test/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlPostReaderTest.java b/code/processes/converting-process/test/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlPostReaderTest.java
similarity index 100%
rename from code/features-convert/stackexchange-xml/test/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlPostReaderTest.java
rename to code/processes/converting-process/test/nu/marginalia/integration/stackexchange/xml/StackExchangeXmlPostReaderTest.java
diff --git a/code/features-convert/stackexchange-xml/test/nu/marginalia/integration/stackexchange/xml/StringXmlTestEventReader.java b/code/processes/converting-process/test/nu/marginalia/integration/stackexchange/xml/StringXmlTestEventReader.java
similarity index 100%
rename from code/features-convert/stackexchange-xml/test/nu/marginalia/integration/stackexchange/xml/StringXmlTestEventReader.java
rename to code/processes/converting-process/test/nu/marginalia/integration/stackexchange/xml/StringXmlTestEventReader.java
diff --git a/code/processes/crawling-process/build.gradle b/code/processes/crawling-process/build.gradle
index 5105543d..2d34904f 100644
--- a/code/processes/crawling-process/build.gradle
+++ b/code/processes/crawling-process/build.gradle
@@ -29,18 +29,18 @@ dependencies {
     implementation project(':code:common:service')
     implementation project(':code:libraries:blocking-thread-pool')
     implementation project(':code:index:api')
-    implementation project(':code:process-mqapi')
+    implementation project(':code:processes:process-mq-api')
     implementation project(':code:libraries:message-queue')
     implementation project(':code:libraries:language-processing')
     implementation project(':code:libraries:easy-lsh')
 
-    implementation project(':code:process-models:crawling-model')
-    implementation project(':code:process-models:crawl-spec')
+    implementation project(':code:processes:crawling-process:model')
+    implementation project(':code:processes:crawling-process:model')
 
-    implementation project(':code:features-convert:anchor-keywords')
-    implementation project(':code:features-crawl:crawl-blocklist')
-    implementation project(':code:features-crawl:link-parser')
-    implementation project(':code:features-crawl:content-type')
+    implementation project(':code:processes:converting-process:ft-anchor-keywords')
+    implementation project(':code:processes:crawling-process:ft-crawl-blocklist')
+    implementation project(':code:processes:crawling-process:ft-link-parser')
+    implementation project(':code:processes:crawling-process:ft-content-type')
     implementation project(':third-party:commons-codec')
 
     implementation libs.bundles.slf4j
diff --git a/code/features-crawl/content-type/build.gradle b/code/processes/crawling-process/ft-content-type/build.gradle
similarity index 100%
rename from code/features-crawl/content-type/build.gradle
rename to code/processes/crawling-process/ft-content-type/build.gradle
diff --git a/code/features-crawl/content-type/java/nu/marginalia/contenttype/ContentType.java b/code/processes/crawling-process/ft-content-type/java/nu/marginalia/contenttype/ContentType.java
similarity index 100%
rename from code/features-crawl/content-type/java/nu/marginalia/contenttype/ContentType.java
rename to code/processes/crawling-process/ft-content-type/java/nu/marginalia/contenttype/ContentType.java
diff --git a/code/features-crawl/content-type/java/nu/marginalia/contenttype/ContentTypeParser.java b/code/processes/crawling-process/ft-content-type/java/nu/marginalia/contenttype/ContentTypeParser.java
similarity index 100%
rename from code/features-crawl/content-type/java/nu/marginalia/contenttype/ContentTypeParser.java
rename to code/processes/crawling-process/ft-content-type/java/nu/marginalia/contenttype/ContentTypeParser.java
diff --git a/code/features-crawl/content-type/java/nu/marginalia/contenttype/DocumentBodyToString.java b/code/processes/crawling-process/ft-content-type/java/nu/marginalia/contenttype/DocumentBodyToString.java
similarity index 51%
rename from code/features-crawl/content-type/java/nu/marginalia/contenttype/DocumentBodyToString.java
rename to code/processes/crawling-process/ft-content-type/java/nu/marginalia/contenttype/DocumentBodyToString.java
index 7fe604f4..8187871e 100644
--- a/code/features-crawl/content-type/java/nu/marginalia/contenttype/DocumentBodyToString.java
+++ b/code/processes/crawling-process/ft-content-type/java/nu/marginalia/contenttype/DocumentBodyToString.java
@@ -1,31 +1,46 @@
 package nu.marginalia.contenttype;
 
-import java.nio.charset.*;
+import java.nio.charset.Charset;
+import java.nio.charset.IllegalCharsetNameException;
+import java.nio.charset.StandardCharsets;
+import java.nio.charset.UnsupportedCharsetException;
+import java.util.Map;
+import java.util.concurrent.ConcurrentHashMap;
 
 public class DocumentBodyToString {
+    private static final Map<ContentType, Charset> charsetMap = new ConcurrentHashMap<>();
 
     /** Get the string data from a document body, given the content type and charset */
     public static String getStringData(ContentType type, byte[] data) {
-        Charset charset;
+        final Charset charset;
+
+        if (type.charset() == null || type.charset().isBlank()) {
+            charset = StandardCharsets.UTF_8;
+        } else {
+            charset = charsetMap.computeIfAbsent(type, DocumentBodyToString::computeCharset);
+        }
+
+        return new String(data, charset);
+    }
+
+    private static Charset computeCharset(ContentType type) {
         try {
             if (type.charset() == null || type.charset().isBlank())
-                charset = StandardCharsets.UTF_8;
+                return StandardCharsets.UTF_8;
             else {
-                charset = Charset.forName(type.charset());
+                return Charset.forName(type.charset());
             }
         }
         catch (IllegalCharsetNameException ex) {
             // Fall back to UTF-8 if we don't understand what this is.  It's *probably* fine?  Maybe?
-            charset = StandardCharsets.UTF_8;
+            return StandardCharsets.UTF_8;
         }
         catch (UnsupportedCharsetException ex) {
             // This is usually like Macintosh Latin
             // (https://en.wikipedia.org/wiki/Macintosh_Latin_encoding)
             //
             // It's close enough to 8859-1 to serve
-            charset = StandardCharsets.ISO_8859_1;
+            return StandardCharsets.ISO_8859_1;
        }
-
-        return new String(data, charset);
     }
 }
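The DocumentBodyToString rewrite above splits charset resolution out of getStringData() and memoizes it in a ConcurrentHashMap, so Charset.forName() and its exception-driven fallbacks run once per distinct content type rather than once per document. A self-contained sketch of the same resolve-once-and-cache pattern; it is keyed by charset name here for brevity, whereas the actual change keys by the whole ContentType and short-circuits blank charsets to UTF-8 before consulting the cache:

```java
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

class CharsetCacheSketch {
    private static final Map<String, Charset> cache = new ConcurrentHashMap<>();

    static Charset resolve(String name) {
        if (name == null || name.isBlank())
            return StandardCharsets.UTF_8;
        // computeIfAbsent() runs the (possibly throwing) lookup at most once
        // per distinct name; later calls are a plain map read.
        return cache.computeIfAbsent(name, CharsetCacheSketch::compute);
    }

    private static Charset compute(String name) {
        try {
            return Charset.forName(name);
        }
        catch (IllegalCharsetNameException ex) {
            return StandardCharsets.UTF_8;      // nonsense name: assume UTF-8
        }
        catch (UnsupportedCharsetException ex) {
            return StandardCharsets.ISO_8859_1; // known but unsupported: 8859-1 is close enough
        }
    }

    public static void main(String[] args) {
        System.out.println(resolve("ISO-8859-1")); // resolves normally
        System.out.println(resolve(""));           // blank: UTF-8 fallback
        System.out.println(resolve("no such"));    // illegal name: UTF-8 fallback
    }
}
```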
diff --git a/code/features-crawl/content-type/test/nu/marginalia/contenttype/ContentTypeParserTest.java b/code/processes/crawling-process/ft-content-type/test/nu/marginalia/contenttype/ContentTypeParserTest.java
similarity index 100%
rename from code/features-crawl/content-type/test/nu/marginalia/contenttype/ContentTypeParserTest.java
rename to code/processes/crawling-process/ft-content-type/test/nu/marginalia/contenttype/ContentTypeParserTest.java
diff --git a/code/features-crawl/content-type/test/nu/marginalia/contenttype/DocumentBodyToStringTest.java b/code/processes/crawling-process/ft-content-type/test/nu/marginalia/contenttype/DocumentBodyToStringTest.java
similarity index 100%
rename from code/features-crawl/content-type/test/nu/marginalia/contenttype/DocumentBodyToStringTest.java
rename to code/processes/crawling-process/ft-content-type/test/nu/marginalia/contenttype/DocumentBodyToStringTest.java
diff --git a/code/features-crawl/crawl-blocklist/build.gradle b/code/processes/crawling-process/ft-crawl-blocklist/build.gradle
similarity index 100%
rename from code/features-crawl/crawl-blocklist/build.gradle
rename to code/processes/crawling-process/ft-crawl-blocklist/build.gradle
diff --git a/code/features-crawl/crawl-blocklist/java/nu/marginalia/ip_blocklist/GeoIpBlocklist.java b/code/processes/crawling-process/ft-crawl-blocklist/java/nu/marginalia/ip_blocklist/GeoIpBlocklist.java
similarity index 100%
rename from code/features-crawl/crawl-blocklist/java/nu/marginalia/ip_blocklist/GeoIpBlocklist.java
rename to code/processes/crawling-process/ft-crawl-blocklist/java/nu/marginalia/ip_blocklist/GeoIpBlocklist.java
diff --git a/code/features-crawl/crawl-blocklist/java/nu/marginalia/ip_blocklist/InetAddressCache.java b/code/processes/crawling-process/ft-crawl-blocklist/java/nu/marginalia/ip_blocklist/InetAddressCache.java
similarity index 100%
rename from code/features-crawl/crawl-blocklist/java/nu/marginalia/ip_blocklist/InetAddressCache.java
rename to code/processes/crawling-process/ft-crawl-blocklist/java/nu/marginalia/ip_blocklist/InetAddressCache.java
diff --git a/code/features-crawl/crawl-blocklist/java/nu/marginalia/ip_blocklist/IpBlockList.java b/code/processes/crawling-process/ft-crawl-blocklist/java/nu/marginalia/ip_blocklist/IpBlockList.java
similarity index 100%
rename from code/features-crawl/crawl-blocklist/java/nu/marginalia/ip_blocklist/IpBlockList.java
rename to code/processes/crawling-process/ft-crawl-blocklist/java/nu/marginalia/ip_blocklist/IpBlockList.java
diff --git a/code/features-crawl/crawl-blocklist/java/nu/marginalia/ip_blocklist/UrlBlocklist.java b/code/processes/crawling-process/ft-crawl-blocklist/java/nu/marginalia/ip_blocklist/UrlBlocklist.java
similarity index 100%
rename from code/features-crawl/crawl-blocklist/java/nu/marginalia/ip_blocklist/UrlBlocklist.java
rename to code/processes/crawling-process/ft-crawl-blocklist/java/nu/marginalia/ip_blocklist/UrlBlocklist.java
diff --git a/code/features-crawl/crawl-blocklist/readme.md b/code/processes/crawling-process/ft-crawl-blocklist/readme.md
similarity index 100%
rename from code/features-crawl/crawl-blocklist/readme.md
rename to code/processes/crawling-process/ft-crawl-blocklist/readme.md
diff --git a/code/features-crawl/crawl-blocklist/test/nu/marginalia/ip_blocklist/UrlBlocklistTest.java b/code/processes/crawling-process/ft-crawl-blocklist/test/nu/marginalia/ip_blocklist/UrlBlocklistTest.java
similarity index 100%
rename from code/features-crawl/crawl-blocklist/test/nu/marginalia/ip_blocklist/UrlBlocklistTest.java
rename to code/processes/crawling-process/ft-crawl-blocklist/test/nu/marginalia/ip_blocklist/UrlBlocklistTest.java
diff --git a/code/features-crawl/link-parser/build.gradle b/code/processes/crawling-process/ft-link-parser/build.gradle
similarity index 100%
rename from code/features-crawl/link-parser/build.gradle
rename to code/processes/crawling-process/ft-link-parser/build.gradle
diff --git a/code/features-crawl/link-parser/java/nu/marginalia/link_parser/FeedExtractor.java b/code/processes/crawling-process/ft-link-parser/java/nu/marginalia/link_parser/FeedExtractor.java
similarity index 100%
rename from code/features-crawl/link-parser/java/nu/marginalia/link_parser/FeedExtractor.java
rename to code/processes/crawling-process/ft-link-parser/java/nu/marginalia/link_parser/FeedExtractor.java
diff --git a/code/features-crawl/link-parser/java/nu/marginalia/link_parser/LinkParser.java b/code/processes/crawling-process/ft-link-parser/java/nu/marginalia/link_parser/LinkParser.java
similarity index 100%
rename from code/features-crawl/link-parser/java/nu/marginalia/link_parser/LinkParser.java
rename to code/processes/crawling-process/ft-link-parser/java/nu/marginalia/link_parser/LinkParser.java
diff --git a/code/features-crawl/link-parser/readme.md b/code/processes/crawling-process/ft-link-parser/readme.md
similarity index 100%
rename from code/features-crawl/link-parser/readme.md
rename to code/processes/crawling-process/ft-link-parser/readme.md
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
index 7ee043d5..d8d49f3b 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/CrawlerMain.java
@@ -21,16 +21,16 @@ import nu.marginalia.crawl.spec.DbCrawlSpecProvider;
 import nu.marginalia.crawl.spec.ParquetCrawlSpecProvider;
 import nu.marginalia.crawl.warc.WarcArchiverFactory;
 import nu.marginalia.crawl.warc.WarcArchiverIf;
-import nu.marginalia.crawling.io.CrawledDomainReader;
-import nu.marginalia.crawling.io.CrawlerOutputFile;
-import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
 import nu.marginalia.crawlspec.CrawlSpecFileNames;
+import nu.marginalia.io.crawldata.CrawledDomainReader;
+import nu.marginalia.io.crawldata.CrawlerOutputFile;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
 import nu.marginalia.mq.MessageQueueFactory;
 import nu.marginalia.mq.MqMessage;
 import nu.marginalia.mq.inbox.MqInboxResponse;
 import nu.marginalia.mq.inbox.MqSingleShotInbox;
+import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
 import nu.marginalia.process.control.ProcessHeartbeatImpl;
 import nu.marginalia.process.log.WorkLog;
 import nu.marginalia.service.ProcessMainClass;
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java
index 65e1529b..8b34cb77 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlDataReference.java
@@ -2,9 +2,9 @@ package nu.marginalia.crawl.retreival;
 
 import com.google.common.hash.HashFunction;
 import com.google.common.hash.Hashing;
-import nu.marginalia.crawling.io.SerializableCrawlDataStream;
-import nu.marginalia.crawling.model.CrawledDocument;
+import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
 import nu.marginalia.lsh.EasyLSH;
+import nu.marginalia.model.crawldata.CrawledDocument;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawledDocumentFactory.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawledDocumentFactory.java
index 37f84d58..c7fee792 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawledDocumentFactory.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawledDocumentFactory.java
@@ -1,9 +1,9 @@
 package nu.marginalia.crawl.retreival;
 
-import nu.marginalia.crawling.body.HttpFetchResult;
-import nu.marginalia.crawling.model.CrawledDocument;
-import nu.marginalia.crawling.model.CrawlerDocumentStatus;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.body.HttpFetchResult;
+import nu.marginalia.model.crawldata.CrawledDocument;
+import nu.marginalia.model.crawldata.CrawlerDocumentStatus;
 
 import java.time.LocalDateTime;
 import java.util.Objects;
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
index b7345050..457c524c 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerRetreiver.java
@@ -9,13 +9,13 @@ import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.crawl.retreival.revisit.CrawlerRevisitor;
 import nu.marginalia.crawl.retreival.revisit.DocumentWithReference;
 import nu.marginalia.crawl.retreival.sitemap.SitemapFetcher;
-import nu.marginalia.crawling.body.HttpFetchResult;
-import nu.marginalia.crawling.model.CrawledDomain;
-import nu.marginalia.crawling.model.CrawlerDomainStatus;
 import nu.marginalia.ip_blocklist.UrlBlocklist;
 import nu.marginalia.link_parser.LinkParser;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.body.HttpFetchResult;
+import nu.marginalia.model.crawldata.CrawledDomain;
+import nu.marginalia.model.crawldata.CrawlerDomainStatus;
 import nu.marginalia.model.crawlspec.CrawlSpecRecord;
 import org.jsoup.Jsoup;
 import org.slf4j.Logger;
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java
index ab1ce5ef..1468d6ed 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/CrawlerWarcResynchronizer.java
@@ -1,9 +1,9 @@
 package nu.marginalia.crawl.retreival;
 
-import nu.marginalia.crawling.body.DocumentBodyExtractor;
-import nu.marginalia.crawling.body.HttpFetchResult;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.body.DocumentBodyExtractor;
+import nu.marginalia.model.body.HttpFetchResult;
 import org.jsoup.Jsoup;
 import org.netpreserve.jwarc.*;
 import org.slf4j.Logger;
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainProber.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainProber.java
index 57147aec..3ec9b8da 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainProber.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/DomainProber.java
@@ -4,10 +4,10 @@ import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import nu.marginalia.crawl.retreival.fetcher.FetchResultState;
 import nu.marginalia.crawl.retreival.fetcher.HttpFetcher;
-import nu.marginalia.crawling.model.CrawlerDomainStatus;
 import nu.marginalia.ip_blocklist.IpBlockList;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.crawldata.CrawlerDomainStatus;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProber.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProber.java
index 96e2eaa7..c9997017 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProber.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/ContentTypeProber.java
@@ -1,7 +1,7 @@
 package nu.marginalia.crawl.retreival.fetcher;
 
-import nu.marginalia.crawling.body.ContentTypeLogic;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.body.ContentTypeLogic;
 import okhttp3.OkHttpClient;
 import okhttp3.Request;
 import org.slf4j.Logger;
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java
index fd3dd0dd..42723d5c 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcher.java
@@ -4,9 +4,9 @@ import com.google.inject.ImplementedBy;
 import crawlercommons.robots.SimpleRobotRules;
 import nu.marginalia.crawl.retreival.RateLimitException;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
-import nu.marginalia.crawling.body.HttpFetchResult;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.body.HttpFetchResult;
 
 import java.util.List;
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
index 6ec3cd73..40725b0f 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/HttpFetcherImpl.java
@@ -12,11 +12,11 @@ import nu.marginalia.crawl.retreival.fetcher.socket.FastTerminatingSocketFactory
 import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
 import nu.marginalia.crawl.retreival.fetcher.socket.NoSecuritySSL;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
-import nu.marginalia.crawling.body.ContentTypeLogic;
-import nu.marginalia.crawling.body.DocumentBodyExtractor;
-import nu.marginalia.crawling.body.HttpFetchResult;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.body.ContentTypeLogic;
+import nu.marginalia.model.body.DocumentBodyExtractor;
+import nu.marginalia.model.body.HttpFetchResult;
 import okhttp3.ConnectionPool;
 import okhttp3.Dispatcher;
 import okhttp3.OkHttpClient;
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java
index 180811cf..1d4a4372 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/fetcher/warc/WarcRecorder.java
@@ -2,10 +2,10 @@ package nu.marginalia.crawl.retreival.fetcher.warc;
 
 import nu.marginalia.crawl.retreival.DomainProber;
 import nu.marginalia.crawl.retreival.fetcher.ContentTags;
-import nu.marginalia.crawling.body.HttpFetchResult;
 import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.body.HttpFetchResult;
 import okhttp3.OkHttpClient;
 import okhttp3.Request;
 import org.netpreserve.jwarc.*;
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
index 6b32317d..f5bb863e 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/CrawlerRevisitor.java
@@ -8,9 +8,9 @@ import nu.marginalia.crawl.retreival.CrawlerRetreiver;
 import nu.marginalia.crawl.retreival.DomainCrawlFrontier;
 import nu.marginalia.crawl.retreival.fetcher.ContentTags;
 import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
-import nu.marginalia.crawling.body.HttpFetchResult;
-import nu.marginalia.crawling.model.CrawledDocument;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.body.HttpFetchResult;
+import nu.marginalia.model.crawldata.CrawledDocument;
 import org.jsoup.Jsoup;
 
 /** This class encapsulates the logic for re-visiting a domain that has already been crawled.
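The CrawlerRevisitor javadoc above names the revisit concept: when a domain is re-crawled, documents seen on a previous pass can be conditionally re-fetched instead of re-downloaded in full, and the ContentTags import in the same hunk suggests validators such as ETag and Last-Modified travel with the request. The following is a generic illustration of that idea using the standard java.net.http client, not Marginalia's actual implementation; the URL and validator value are placeholders:

```java
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

class RevisitSketch {
    public static void main(String[] args) throws Exception {
        HttpClient client = HttpClient.newHttpClient();

        // Hypothetical validator remembered from the last crawl of this URL.
        String previousEtag = "\"abc123\"";

        // Sending the validator back lets the server answer 304 Not Modified,
        // so the stored copy can be reused without transferring the body again.
        HttpRequest request = HttpRequest.newBuilder(URI.create("https://www.marginalia.nu/"))
                .header("If-None-Match", previousEtag)
                .build();

        HttpResponse<byte[]> rsp = client.send(request, HttpResponse.BodyHandlers.ofByteArray());
        if (rsp.statusCode() == 304) {
            System.out.println("Unchanged -- reuse the previously stored document");
        } else {
            System.out.println("Changed -- reprocess " + rsp.body().length + " bytes");
        }
    }
}
```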
diff --git a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java
index c604ff5b..b5589401 100644
--- a/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java
+++ b/code/processes/crawling-process/java/nu/marginalia/crawl/retreival/revisit/DocumentWithReference.java
@@ -2,10 +2,10 @@ package nu.marginalia.crawl.retreival.revisit;
 
 import nu.marginalia.crawl.retreival.CrawlDataReference;
 import nu.marginalia.crawl.retreival.fetcher.ContentTags;
-import nu.marginalia.crawling.body.DocumentBodyExtractor;
-import nu.marginalia.crawling.body.DocumentBodyResult;
-import nu.marginalia.crawling.body.HttpFetchResult;
-import nu.marginalia.crawling.model.CrawledDocument;
+import nu.marginalia.model.body.DocumentBodyExtractor;
+import nu.marginalia.model.body.DocumentBodyResult;
+import nu.marginalia.model.body.HttpFetchResult;
+import nu.marginalia.model.crawldata.CrawledDocument;
 
 import javax.annotation.Nullable;
diff --git a/code/process-models/crawling-model/build.gradle b/code/processes/crawling-process/model/build.gradle
similarity index 88%
rename from code/process-models/crawling-model/build.gradle
rename to code/processes/crawling-process/model/build.gradle
index 2a24d8bf..50103c41 100644
--- a/code/process-models/crawling-model/build.gradle
+++ b/code/processes/crawling-process/model/build.gradle
@@ -12,6 +12,8 @@ java {
     }
 }
 
+jar.archiveBaseName = 'crawling-process-model'
+
 apply from: "$rootProject.projectDir/srcsets.gradle"
 
 dependencies {
@@ -20,7 +22,7 @@ dependencies {
     implementation project(':code:common:config')
     implementation project(':code:common:process')
     implementation project(':code:index:api')
-    implementation project(':code:features-crawl:content-type')
+    implementation project(':code:processes:crawling-process:ft-content-type')
     implementation project(':code:libraries:language-processing')
     implementation project(':third-party:parquet-floor')
     implementation project(':third-party:commons-codec')
@@ -30,6 +32,7 @@ dependencies {
     implementation libs.notnull
 
     implementation libs.bundles.parquet
+    implementation libs.trove
     implementation libs.jwarc
     implementation libs.gson
     implementation libs.commons.io
diff --git a/code/process-models/crawl-spec/java/nu/marginalia/crawlspec/CrawlSpecFileNames.java b/code/processes/crawling-process/model/java/nu/marginalia/crawlspec/CrawlSpecFileNames.java
similarity index 100%
rename from code/process-models/crawl-spec/java/nu/marginalia/crawlspec/CrawlSpecFileNames.java
rename to code/processes/crawling-process/model/java/nu/marginalia/crawlspec/CrawlSpecFileNames.java
diff --git a/code/process-models/crawl-spec/java/nu/marginalia/crawlspec/CrawlSpecGenerator.java b/code/processes/crawling-process/model/java/nu/marginalia/crawlspec/CrawlSpecGenerator.java
similarity index 100%
rename from code/process-models/crawl-spec/java/nu/marginalia/crawlspec/CrawlSpecGenerator.java
rename to code/processes/crawling-process/model/java/nu/marginalia/crawlspec/CrawlSpecGenerator.java
diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawledDomainReader.java b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/CrawledDomainReader.java
similarity index 93%
rename from code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawledDomainReader.java
rename to code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/CrawledDomainReader.java
index 5aefc04c..7e359814 100644
--- a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawledDomainReader.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/CrawledDomainReader.java
@@ -1,6 +1,6 @@
-package nu.marginalia.crawling.io;
+package nu.marginalia.io.crawldata;
 
-import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream;
+import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawlerOutputFile.java b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/CrawlerOutputFile.java
similarity index 98%
rename from code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawlerOutputFile.java
rename to code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/CrawlerOutputFile.java
index 05c4797e..266a7f24 100644
--- a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/CrawlerOutputFile.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/CrawlerOutputFile.java
@@ -1,4 +1,4 @@
-package nu.marginalia.crawling.io;
+package nu.marginalia.io.crawldata;
 
 import org.apache.logging.log4j.util.Strings;
 
diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/SerializableCrawlDataStream.java
similarity index 94%
rename from code/process-models/crawling-model/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java
rename to code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/SerializableCrawlDataStream.java
index ce01ebce..1ade3836 100644
--- a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/SerializableCrawlDataStream.java
@@ -1,6 +1,6 @@
-package nu.marginalia.crawling.io;
+package nu.marginalia.io.crawldata;
 
-import nu.marginalia.crawling.model.SerializableCrawlData;
+import nu.marginalia.model.crawldata.SerializableCrawlData;
 import org.jetbrains.annotations.Nullable;
 
 import java.io.IOException;
diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java
similarity index 95%
rename from code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java
rename to code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java
index e676e351..55c5ce8e 100644
--- a/code/process-models/crawling-model/java/nu/marginalia/crawling/io/format/ParquetSerializableCrawlDataStream.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/io/crawldata/format/ParquetSerializableCrawlDataStream.java
@@ -1,14 +1,14 @@
-package nu.marginalia.crawling.io.format;
+package nu.marginalia.io.crawldata.format;
 
 import lombok.SneakyThrows;
 import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.contenttype.DocumentBodyToString;
-import nu.marginalia.crawling.io.SerializableCrawlDataStream;
-import nu.marginalia.crawling.model.*;
-import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecord;
-import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileReader;
 import nu.marginalia.hash.MurmurHash3_128;
+import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.crawldata.*;
+import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecord;
+import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
diff --git a/code/process-models/crawl-spec/java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileReader.java b/code/processes/crawling-process/model/java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileReader.java
similarity index 100%
rename from code/process-models/crawl-spec/java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileReader.java
rename to code/processes/crawling-process/model/java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileReader.java
diff --git a/code/process-models/crawl-spec/java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileWriter.java b/code/processes/crawling-process/model/java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileWriter.java
similarity index 100%
rename from code/process-models/crawl-spec/java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileWriter.java
rename to code/processes/crawling-process/model/java/nu/marginalia/io/crawlspec/CrawlSpecRecordParquetFileWriter.java
diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/body/ContentTypeLogic.java b/code/processes/crawling-process/model/java/nu/marginalia/model/body/ContentTypeLogic.java
similarity index 98%
rename from code/process-models/crawling-model/java/nu/marginalia/crawling/body/ContentTypeLogic.java
rename to code/processes/crawling-process/model/java/nu/marginalia/model/body/ContentTypeLogic.java
index 25d4c8ec..c38bcb3b 100644
--- a/code/process-models/crawling-model/java/nu/marginalia/crawling/body/ContentTypeLogic.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/model/body/ContentTypeLogic.java
@@ -1,4 +1,4 @@
-package nu.marginalia.crawling.body;
+package nu.marginalia.model.body;
 
 import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.model.EdgeUrl;
diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java b/code/processes/crawling-process/model/java/nu/marginalia/model/body/DocumentBodyExtractor.java
similarity index 96%
rename from code/process-models/crawling-model/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java
rename to code/processes/crawling-process/model/java/nu/marginalia/model/body/DocumentBodyExtractor.java
index 7c8f471c..ebd3d33e 100644
--- a/code/process-models/crawling-model/java/nu/marginalia/crawling/body/DocumentBodyExtractor.java
+++ b/code/processes/crawling-process/model/java/nu/marginalia/model/body/DocumentBodyExtractor.java
@@ -1,9 +1,9 @@
-package nu.marginalia.crawling.body;
+package nu.marginalia.model.body;
 
 import nu.marginalia.contenttype.ContentType;
 import nu.marginalia.contenttype.ContentTypeParser;
 import nu.marginalia.contenttype.DocumentBodyToString;
-import nu.marginalia.crawling.model.CrawlerDocumentStatus;
+import nu.marginalia.model.crawldata.CrawlerDocumentStatus;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
b/code/processes/crawling-process/model/java/nu/marginalia/model/body/DocumentBodyResult.java similarity index 95% rename from code/process-models/crawling-model/java/nu/marginalia/crawling/body/DocumentBodyResult.java rename to code/processes/crawling-process/model/java/nu/marginalia/model/body/DocumentBodyResult.java index 04e3fedb..a29e7093 100644 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/body/DocumentBodyResult.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/model/body/DocumentBodyResult.java @@ -1,7 +1,7 @@ -package nu.marginalia.crawling.body; +package nu.marginalia.model.body; import nu.marginalia.contenttype.ContentType; -import nu.marginalia.crawling.model.CrawlerDocumentStatus; +import nu.marginalia.model.crawldata.CrawlerDocumentStatus; import java.util.Optional; import java.util.function.BiFunction; diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/body/HttpFetchResult.java b/code/processes/crawling-process/model/java/nu/marginalia/model/body/HttpFetchResult.java similarity index 99% rename from code/process-models/crawling-model/java/nu/marginalia/crawling/body/HttpFetchResult.java rename to code/processes/crawling-process/model/java/nu/marginalia/model/body/HttpFetchResult.java index 6bafaf5c..d3fd41b0 100644 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/body/HttpFetchResult.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/model/body/HttpFetchResult.java @@ -1,11 +1,11 @@ -package nu.marginalia.crawling.body; +package nu.marginalia.model.body; import nu.marginalia.contenttype.ContentType; import okhttp3.Headers; import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; import org.netpreserve.jwarc.MessageHeaders; import org.netpreserve.jwarc.WarcResponse; -import org.jsoup.nodes.Document; import java.io.ByteArrayInputStream; import java.io.IOException; diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDocument.java b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java similarity index 98% rename from code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDocument.java rename to code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java index c809682a..f43433b9 100644 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDocument.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDocument.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawling.model; +package nu.marginalia.model.crawldata; import lombok.AllArgsConstructor; import lombok.Builder; diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDomain.java b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDomain.java similarity index 94% rename from code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDomain.java rename to code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDomain.java index adb59bda..3cb1ea51 100644 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawledDomain.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawledDomain.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawling.model; +package nu.marginalia.model.crawldata; import lombok.AllArgsConstructor; import lombok.Builder; diff --git 
a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawlerDocumentStatus.java b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawlerDocumentStatus.java similarity index 80% rename from code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawlerDocumentStatus.java rename to code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawlerDocumentStatus.java index 2369bcc6..d796c6de 100644 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawlerDocumentStatus.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawlerDocumentStatus.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawling.model; +package nu.marginalia.model.crawldata; public enum CrawlerDocumentStatus { OK, diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawlerDomainStatus.java b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawlerDomainStatus.java similarity index 64% rename from code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawlerDomainStatus.java rename to code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawlerDomainStatus.java index 12a31c52..4efc9c59 100644 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/CrawlerDomainStatus.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/CrawlerDomainStatus.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawling.model; +package nu.marginalia.model.crawldata; public enum CrawlerDomainStatus { OK, ERROR, BLOCKED, REDIRECT diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/SerializableCrawlData.java b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/SerializableCrawlData.java similarity index 63% rename from code/process-models/crawling-model/java/nu/marginalia/crawling/model/SerializableCrawlData.java rename to code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/SerializableCrawlData.java index 01ecaf8d..58d25dea 100644 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/model/SerializableCrawlData.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/model/crawldata/SerializableCrawlData.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawling.model; +package nu.marginalia.model.crawldata; public interface SerializableCrawlData { String getDomain(); diff --git a/code/process-models/crawl-spec/java/nu/marginalia/model/crawlspec/CrawlSpecRecord.java b/code/processes/crawling-process/model/java/nu/marginalia/model/crawlspec/CrawlSpecRecord.java similarity index 100% rename from code/process-models/crawl-spec/java/nu/marginalia/model/crawlspec/CrawlSpecRecord.java rename to code/processes/crawling-process/model/java/nu/marginalia/model/crawlspec/CrawlSpecRecord.java diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java b/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecord.java similarity index 97% rename from code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java rename to code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecord.java index 55deafdb..e4ce7ad9 100644 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java +++ 
b/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecord.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawling.parquet; +package nu.marginalia.parquet.crawldata; import blue.strategic.parquet.Dehydrator; import blue.strategic.parquet.Hydrator; @@ -12,7 +12,7 @@ import org.apache.parquet.schema.Types; import java.time.Instant; -import static org.apache.parquet.schema.LogicalTypeAnnotation.*; +import static org.apache.parquet.schema.LogicalTypeAnnotation.stringType; import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.*; @AllArgsConstructor diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java b/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileReader.java similarity index 97% rename from code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java rename to code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileReader.java index 362eb561..6e4ea942 100644 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileReader.java @@ -1,4 +1,4 @@ -package nu.marginalia.crawling.parquet; +package nu.marginalia.parquet.crawldata; import blue.strategic.parquet.Hydrator; import blue.strategic.parquet.HydratorSupplier; diff --git a/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java b/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java similarity index 97% rename from code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java rename to code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java index 539ff28d..36a58673 100644 --- a/code/process-models/crawling-model/java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java +++ b/code/processes/crawling-process/model/java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java @@ -1,10 +1,10 @@ -package nu.marginalia.crawling.parquet; +package nu.marginalia.parquet.crawldata; import blue.strategic.parquet.ParquetWriter; import nu.marginalia.UserAgent; -import nu.marginalia.crawling.body.DocumentBodyExtractor; -import nu.marginalia.crawling.body.DocumentBodyResult; -import nu.marginalia.crawling.body.HttpFetchResult; +import nu.marginalia.model.body.DocumentBodyExtractor; +import nu.marginalia.model.body.DocumentBodyResult; +import nu.marginalia.model.body.HttpFetchResult; import org.apache.commons.lang3.StringUtils; import org.netpreserve.jwarc.*; import org.slf4j.Logger; diff --git a/code/process-models/crawling-model/java/org/netpreserve/jwarc/WarcXCookieInformationHeader.java b/code/processes/crawling-process/model/java/org/netpreserve/jwarc/WarcXCookieInformationHeader.java similarity index 100% rename from code/process-models/crawling-model/java/org/netpreserve/jwarc/WarcXCookieInformationHeader.java rename to code/processes/crawling-process/model/java/org/netpreserve/jwarc/WarcXCookieInformationHeader.java diff --git 
a/code/process-models/crawling-model/java/org/netpreserve/jwarc/WarcXEntityRefused.java b/code/processes/crawling-process/model/java/org/netpreserve/jwarc/WarcXEntityRefused.java similarity index 100% rename from code/process-models/crawling-model/java/org/netpreserve/jwarc/WarcXEntityRefused.java rename to code/processes/crawling-process/model/java/org/netpreserve/jwarc/WarcXEntityRefused.java diff --git a/code/process-models/crawling-model/java/org/netpreserve/jwarc/WarcXResponseReference.java b/code/processes/crawling-process/model/java/org/netpreserve/jwarc/WarcXResponseReference.java similarity index 100% rename from code/process-models/crawling-model/java/org/netpreserve/jwarc/WarcXResponseReference.java rename to code/processes/crawling-process/model/java/org/netpreserve/jwarc/WarcXResponseReference.java diff --git a/code/process-models/crawling-model/readme.md b/code/processes/crawling-process/model/readme.md similarity index 75% rename from code/process-models/crawling-model/readme.md rename to code/processes/crawling-process/model/readme.md index 3bb9cb58..c48a5db9 100644 --- a/code/process-models/crawling-model/readme.md +++ b/code/processes/crawling-process/model/readme.md @@ -1,7 +1,7 @@ # Crawling Models -Contains crawl data models shared by the [crawling-process](../../processes/crawling-process/) and -[converting-process](../../processes/converting-process/). +Contains crawl data models shared by the [crawling-process](../../) and +[converting-process](../../../processes/converting-process/). To ensure backward compatibility with older versions of the data, the serialization is abstracted away from the model classes. @@ -15,27 +15,26 @@ removed in the future. ## Central Classes -* [CrawledDocument](java/nu/marginalia/crawling/model/CrawledDocument.java) -* [CrawledDomain](java/nu/marginalia/crawling/model/CrawledDomain.java) +* [CrawledDocument](java/nu/marginalia/model/crawldata/CrawledDocument.java) +* [CrawledDomain](java/nu/marginalia/model/crawldata/CrawledDomain.java) ### Serialization These serialization classes automatically negotiate the serialization format based on the file extension. -Data is accessed through a [SerializableCrawlDataStream](java/nu/marginalia/crawling/io/SerializableCrawlDataStream.java), +Data is accessed through a [SerializableCrawlDataStream](java/nu/marginalia/io/crawldata/SerializableCrawlDataStream.java), which is a somewhat enhanced Iterator that can be used to read data. -* [CrawledDomainReader](java/nu/marginalia/crawling/io/CrawledDomainReader.java) -* [CrawledDomainWriter](java/nu/marginalia/crawling/io/CrawledDomainWriter.java) +* [CrawledDomainReader](java/nu/marginalia/io/crawldata/CrawledDomainReader.java) ### Parquet Serialization -The parquet serialization is done using the [CrawledDocumentParquetRecordFileReader](java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileReader.java) -and [CrawledDocumentParquetRecordFileWriter](java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriter.java) classes, +The parquet serialization is done using the [CrawledDocumentParquetRecordFileReader](java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileReader.java) +and [CrawledDocumentParquetRecordFileWriter](java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecordFileWriter.java) classes, which read and write parquet files respectively. 
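As a concrete illustration, consuming crawl data through the stream abstraction looks roughly like the sketch below. The types are the ones linked in this readme, but the Iterator-style hasNext/next methods and the createDataStream factory name are assumptions made for the example, not something this patch pins down.

import nu.marginalia.io.crawldata.CrawledDomainReader;
import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
import nu.marginalia.model.crawldata.CrawledDocument;
import nu.marginalia.model.crawldata.CrawledDomain;
import nu.marginalia.model.crawldata.SerializableCrawlData;

import java.nio.file.Path;

class ReadCrawlDataExample {
    public static void main(String[] args) throws Exception {
        Path crawlData = Path.of("crawl-data.parquet"); // hypothetical input file

        // The reader negotiates the serialization format from the file extension,
        // as described above; createDataStream is an assumed factory method name.
        try (SerializableCrawlDataStream stream = CrawledDomainReader.createDataStream(crawlData)) {
            while (stream.hasNext()) {
                SerializableCrawlData record = stream.next();

                // Domain- and document-level records share the SerializableCrawlData
                // interface, which exposes getDomain()
                if (record instanceof CrawledDomain domain) {
                    System.out.println("domain record: " + domain.getDomain());
                }
                else if (record instanceof CrawledDocument document) {
                    System.out.println("document record for: " + document.getDomain());
                }
            }
        }
    }
}

Because the format negotiation happens behind the stream, pointing this consumer at a file in the legacy format would pick a different deserializer without changing the loop.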
-The model classes are serialized to parquet using the [CrawledDocumentParquetRecord](java/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecord.java) +The model classes are serialized to parquet using the [CrawledDocumentParquetRecord](java/nu/marginalia/parquet/crawldata/CrawledDocumentParquetRecord.java) The record has the following fields: diff --git a/code/process-models/crawling-model/test/nu/marginalia/crawling/model/CrawledDocumentTest.java b/code/processes/crawling-process/model/test/nu/marginalia/crawling/model/CrawledDocumentTest.java similarity index 94% rename from code/process-models/crawling-model/test/nu/marginalia/crawling/model/CrawledDocumentTest.java rename to code/processes/crawling-process/model/test/nu/marginalia/crawling/model/CrawledDocumentTest.java index 8612fd39..fdfe52a4 100644 --- a/code/process-models/crawling-model/test/nu/marginalia/crawling/model/CrawledDocumentTest.java +++ b/code/processes/crawling-process/model/test/nu/marginalia/crawling/model/CrawledDocumentTest.java @@ -1,8 +1,10 @@ package nu.marginalia.crawling.model; +import nu.marginalia.model.crawldata.CrawledDocument; import org.junit.jupiter.api.Test; -import static org.junit.jupiter.api.Assertions.*; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; class CrawledDocumentTest { diff --git a/code/process-models/crawling-model/test/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java b/code/processes/crawling-process/model/test/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java similarity index 90% rename from code/process-models/crawling-model/test/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java rename to code/processes/crawling-process/model/test/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java index a0352f29..0da0f6d8 100644 --- a/code/process-models/crawling-model/test/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java +++ b/code/processes/crawling-process/model/test/nu/marginalia/crawling/parquet/CrawledDocumentParquetRecordFileWriterTest.java @@ -1,9 +1,11 @@ package nu.marginalia.crawling.parquet; -import nu.marginalia.crawling.io.format.ParquetSerializableCrawlDataStream; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.crawling.model.SerializableCrawlData; +import nu.marginalia.io.crawldata.format.ParquetSerializableCrawlDataStream; +import nu.marginalia.model.crawldata.CrawledDocument; +import nu.marginalia.model.crawldata.CrawledDomain; +import nu.marginalia.model.crawldata.SerializableCrawlData; +import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecord; +import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; diff --git a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/CrawledDocumentParquetRecordFileWriterTest.java b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/CrawledDocumentParquetRecordFileWriterTest.java index d3369bcc..ebda28e1 100644 --- a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/CrawledDocumentParquetRecordFileWriterTest.java +++ 
b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/CrawledDocumentParquetRecordFileWriterTest.java @@ -1,6 +1,6 @@ package nu.marginalia.crawl.retreival.fetcher; -import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter; +import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter; import org.junit.jupiter.api.Test; import java.util.List; diff --git a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java index 206bf798..a9df80ac 100644 --- a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java +++ b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/fetcher/WarcRecorderTest.java @@ -3,15 +3,18 @@ package nu.marginalia.crawl.retreival.fetcher; import nu.marginalia.UserAgent; import nu.marginalia.crawl.retreival.fetcher.socket.IpInterceptingNetworkInterceptor; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; -import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileReader; -import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileReader; +import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter; import okhttp3.OkHttpClient; import okhttp3.Request; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import org.netpreserve.jwarc.*; +import org.netpreserve.jwarc.WarcReader; +import org.netpreserve.jwarc.WarcRequest; +import org.netpreserve.jwarc.WarcResponse; +import org.netpreserve.jwarc.WarcXResponseReference; import java.io.IOException; import java.net.URISyntaxException; diff --git a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/revisit/DocumentWithReferenceTest.java b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/revisit/DocumentWithReferenceTest.java index e711c81c..9d46ec75 100644 --- a/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/revisit/DocumentWithReferenceTest.java +++ b/code/processes/crawling-process/test/nu/marginalia/crawl/retreival/revisit/DocumentWithReferenceTest.java @@ -2,7 +2,7 @@ package nu.marginalia.crawl.retreival.revisit; import nu.marginalia.crawl.retreival.CrawlDataReference; import nu.marginalia.crawl.retreival.fetcher.ContentTags; -import nu.marginalia.crawling.model.CrawledDocument; +import nu.marginalia.model.crawldata.CrawledDocument; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.*; diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java b/code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java index af196da7..611cc8c2 100644 --- a/code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java +++ b/code/processes/crawling-process/test/nu/marginalia/crawling/HttpFetcherTest.java @@ -6,10 +6,10 @@ import nu.marginalia.crawl.retreival.fetcher.ContentTags; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; -import nu.marginalia.crawling.body.ContentTypeLogic; -import nu.marginalia.crawling.body.DocumentBodyExtractor; -import 
nu.marginalia.crawling.body.DocumentBodyResult; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.body.ContentTypeLogic; +import nu.marginalia.model.body.DocumentBodyExtractor; +import nu.marginalia.model.body.DocumentBodyResult; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java index 01534385..45986bbc 100644 --- a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java +++ b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerMockFetcherTest.java @@ -6,12 +6,12 @@ import nu.marginalia.crawl.retreival.CrawlerRetreiver; import nu.marginalia.crawl.retreival.DomainProber; import nu.marginalia.crawl.retreival.fetcher.*; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; -import nu.marginalia.crawling.body.HttpFetchResult; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawlerDocumentStatus; -import nu.marginalia.crawling.model.SerializableCrawlData; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.body.HttpFetchResult; +import nu.marginalia.model.crawldata.CrawledDocument; +import nu.marginalia.model.crawldata.CrawlerDocumentStatus; +import nu.marginalia.model.crawldata.SerializableCrawlData; import nu.marginalia.model.crawlspec.CrawlSpecRecord; import nu.marginalia.test.CommonTestData; import okhttp3.Headers; diff --git a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java index a6df0791..1b541b63 100644 --- a/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java +++ b/code/processes/crawling-process/test/nu/marginalia/crawling/retreival/CrawlerRetreiverTest.java @@ -8,15 +8,15 @@ import nu.marginalia.crawl.retreival.*; import nu.marginalia.crawl.retreival.fetcher.HttpFetcher; import nu.marginalia.crawl.retreival.fetcher.HttpFetcherImpl; import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder; -import nu.marginalia.crawling.io.CrawledDomainReader; -import nu.marginalia.crawling.io.SerializableCrawlDataStream; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.crawling.model.SerializableCrawlData; -import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter; +import nu.marginalia.io.crawldata.CrawledDomainReader; +import nu.marginalia.io.crawldata.SerializableCrawlDataStream; import nu.marginalia.model.EdgeDomain; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawldata.CrawledDocument; +import nu.marginalia.model.crawldata.CrawledDomain; +import nu.marginalia.model.crawldata.SerializableCrawlData; import nu.marginalia.model.crawlspec.CrawlSpecRecord; +import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter; import org.jetbrains.annotations.NotNull; import org.junit.jupiter.api.*; import org.netpreserve.jwarc.*; diff --git a/code/processes/index-constructor-process/build.gradle b/code/processes/index-constructor-process/build.gradle index 4653133a..6de7e773 100644 --- a/code/processes/index-constructor-process/build.gradle +++ 
b/code/processes/index-constructor-process/build.gradle @@ -21,7 +21,7 @@ tasks.distZip.enabled = false apply from: "$rootProject.projectDir/srcsets.gradle" dependencies { - implementation project(':code:process-mqapi') + implementation project(':code:processes:process-mq-api') implementation project(':code:common:process') implementation project(':code:common:service') implementation project(':code:common:db') diff --git a/code/processes/index-constructor-process/java/nu/marginalia/index/IndexConstructorMain.java b/code/processes/index-constructor-process/java/nu/marginalia/index/IndexConstructorMain.java index 47d3fba2..ef93b554 100644 --- a/code/processes/index-constructor-process/java/nu/marginalia/index/IndexConstructorMain.java +++ b/code/processes/index-constructor-process/java/nu/marginalia/index/IndexConstructorMain.java @@ -6,17 +6,14 @@ import com.google.inject.Inject; import nu.marginalia.IndexLocations; import nu.marginalia.ProcessConfiguration; import nu.marginalia.ProcessConfigurationModule; +import nu.marginalia.index.construction.full.FullIndexConstructor; +import nu.marginalia.index.construction.prio.PrioIndexConstructor; import nu.marginalia.index.domainrankings.DomainRankings; -import nu.marginalia.service.ProcessMainClass; -import nu.marginalia.storage.FileStorageService; -import nu.marginalia.index.construction.ReverseIndexConstructor; -import nu.marginalia.index.forward.ForwardIndexConverter; import nu.marginalia.index.forward.ForwardIndexFileNames; -import nu.marginalia.index.journal.reader.IndexJournalReader; +import nu.marginalia.index.forward.construction.ForwardIndexConverter; +import nu.marginalia.index.journal.IndexJournal; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.model.id.UrlIdCodec; -import nu.marginalia.model.idx.WordFlags; -import nu.marginalia.model.idx.WordMetadata; import nu.marginalia.mq.MessageQueueFactory; import nu.marginalia.mq.MqMessage; import nu.marginalia.mq.inbox.MqInboxResponse; @@ -24,7 +21,9 @@ import nu.marginalia.mq.inbox.MqSingleShotInbox; import nu.marginalia.mqapi.index.CreateIndexRequest; import nu.marginalia.mqapi.index.IndexName; import nu.marginalia.process.control.ProcessHeartbeatImpl; +import nu.marginalia.service.ProcessMainClass; import nu.marginalia.service.module.DatabaseModule; +import nu.marginalia.storage.FileStorageService; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -35,7 +34,6 @@ import java.sql.SQLException; import java.util.Optional; import java.util.UUID; import java.util.concurrent.TimeUnit; -import java.util.function.LongPredicate; import static nu.marginalia.mqapi.ProcessInboxNames.INDEX_CONSTRUCTOR_INBOX; @@ -106,70 +104,58 @@ public class IndexConstructorMain extends ProcessMainClass { heartbeat.shutDown(); } - private void createFullReverseIndex() throws SQLException, IOException { + private void createFullReverseIndex() throws IOException { Path outputFileDocs = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.NEXT); Path outputFileWords = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.NEXT); + Path outputFilePositions = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.NEXT); + Path workDir = 
IndexLocations.getIndexConstructionArea(fileStorageService); Path tmpDir = workDir.resolve("tmp"); if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir); + var constructor = new FullIndexConstructor( + outputFileDocs, + outputFileWords, + outputFilePositions, + this::addRankToIdEncoding, + tmpDir); - new ReverseIndexConstructor(outputFileDocs, outputFileWords, - IndexJournalReader::singleFile, - this::addRankToIdEncoding, tmpDir) - .createReverseIndex(heartbeat, - "createReverseIndexFull", - workDir); + constructor.createReverseIndex(heartbeat, "createReverseIndexFull", workDir); } - private void createPrioReverseIndex() throws SQLException, IOException { + private void createPrioReverseIndex() throws IOException { Path outputFileDocs = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.NEXT); Path outputFileWords = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.NEXT); + Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService); Path tmpDir = workDir.resolve("tmp"); - // The priority index only includes words that have bits indicating they are - // important to the document. This filter will act on the encoded {@see WordMetadata} - LongPredicate wordMetaFilter = getPriorityIndexWordMetaFilter(); + var constructor = new PrioIndexConstructor( + outputFileDocs, + outputFileWords, + this::addRankToIdEncoding, + tmpDir); - new ReverseIndexConstructor(outputFileDocs, outputFileWords, - (path) -> IndexJournalReader.singleFile(path).filtering(wordMetaFilter), - this::addRankToIdEncoding, tmpDir) - .createReverseIndex(heartbeat, - "createReverseIndexPrio", - workDir); - } - - private static LongPredicate getPriorityIndexWordMetaFilter() { - - long highPriorityFlags = - WordFlags.Title.asBit() - | WordFlags.Subjects.asBit() - | WordFlags.TfIdfHigh.asBit() - | WordFlags.NamesWords.asBit() - | WordFlags.UrlDomain.asBit() - | WordFlags.UrlPath.asBit() - | WordFlags.Site.asBit() - | WordFlags.ExternalLink.asBit() - | WordFlags.SiteAdjacent.asBit(); - - return r -> WordMetadata.hasAnyFlags(r, highPriorityFlags); + constructor.createReverseIndex(heartbeat, "createReverseIndexPrio", workDir); } private void createForwardIndex() throws IOException { Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService); + Path outputFileDocsId = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT); Path outputFileDocsData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT); + Path outputFileSpansData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.NEXT); ForwardIndexConverter converter = new ForwardIndexConverter(heartbeat, - IndexJournalReader.paging(workDir), outputFileDocsId, outputFileDocsData, + outputFileSpansData, + IndexJournal.findJournal(workDir).orElseThrow(), domainRankings ); diff --git a/code/processes/loading-process/build.gradle b/code/processes/loading-process/build.gradle index 7131d4ea..0d9d51c1 100644 --- a/code/processes/loading-process/build.gradle +++ 
b/code/processes/loading-process/build.gradle @@ -21,7 +21,7 @@ apply from: "$rootProject.projectDir/srcsets.gradle" dependencies { implementation project(':code:common:process') - implementation project(':code:process-mqapi') + implementation project(':code:processes:process-mq-api') implementation project(':code:index:api') implementation project(':code:common:model') implementation project(':code:common:db') @@ -31,18 +31,20 @@ dependencies { implementation project(':code:index:index-journal') implementation project(':code:libraries:message-queue') implementation project(':code:libraries:language-processing') + implementation project(':code:libraries:coded-sequence') implementation project(':third-party:commons-codec') + implementation project(':third-party:parquet-floor') testImplementation project(':code:services-application:search-service') - implementation project(':code:process-models:crawling-model') - implementation project(':code:process-models:processed-data') - implementation project(':code:process-models:work-log') - implementation project(':code:features-convert:keyword-extraction') + implementation project(':code:processes:crawling-process:model') + implementation project(':code:processes:converting-process:model') + implementation project(':code:processes:converting-process:ft-keyword-extraction') implementation project(':code:functions:link-graph:partition') implementation libs.bundles.slf4j + implementation libs.slop implementation libs.guava implementation dependencies.create(libs.guice.get()) { exclude group: 'com.google.guava' @@ -50,6 +52,7 @@ dependencies { implementation libs.gson implementation libs.commons.lang3 implementation libs.zstd + implementation libs.roaringbitmap implementation libs.trove implementation libs.bundles.mariadb @@ -59,6 +62,7 @@ dependencies { testImplementation libs.bundles.selenium testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4') + testImplementation libs.commons.codec testImplementation 'org.testcontainers:mariadb:1.17.4' testImplementation 'org.testcontainers:junit-jupiter:1.17.4' testImplementation project(':code:libraries:test-helpers') diff --git a/code/processes/loading-process/java/nu/marginalia/loading/LoaderIndexJournalWriter.java b/code/processes/loading-process/java/nu/marginalia/loading/LoaderIndexJournalWriter.java index 2dee50fa..08c016db 100644 --- a/code/processes/loading-process/java/nu/marginalia/loading/LoaderIndexJournalWriter.java +++ b/code/processes/loading-process/java/nu/marginalia/loading/LoaderIndexJournalWriter.java @@ -4,93 +4,59 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import lombok.SneakyThrows; import nu.marginalia.IndexLocations; +import nu.marginalia.index.journal.IndexJournal; +import nu.marginalia.index.journal.IndexJournalSlopWriter; +import nu.marginalia.model.processed.SlopDocumentRecord; import nu.marginalia.storage.FileStorageService; -import nu.marginalia.hash.MurmurHash3_128; -import nu.marginalia.index.journal.model.IndexJournalEntryData; -import nu.marginalia.index.journal.model.IndexJournalEntryHeader; -import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl; -import nu.marginalia.index.journal.writer.IndexJournalWriter; -import nu.marginalia.keyword.model.DocumentKeywords; -import nu.marginalia.model.idx.DocumentMetadata; -import nu.marginalia.index.journal.IndexJournalFileNames; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; -import java.nio.file.Files; -import java.sql.SQLException; 
+import java.nio.file.Path; -import static nu.marginalia.index.journal.model.IndexJournalEntryData.MAX_LENGTH; @Singleton public class LoaderIndexJournalWriter { - private final IndexJournalWriter indexWriter; private static final Logger logger = LoggerFactory.getLogger(LoaderIndexJournalWriter.class); + private final Path journalPath; - private final MurmurHash3_128 hasher = new MurmurHash3_128(); - private final long[] buffer = new long[MAX_LENGTH * 2]; - + private IndexJournalSlopWriter currentWriter = null; + private long recordsWritten = 0; + private int page; @Inject - public LoaderIndexJournalWriter(FileStorageService fileStorageService) throws IOException, SQLException { + public LoaderIndexJournalWriter(FileStorageService fileStorageService) throws IOException { var indexArea = IndexLocations.getIndexConstructionArea(fileStorageService); - var existingIndexFiles = IndexJournalFileNames.findJournalFiles(indexArea); - for (var existingFile : existingIndexFiles) { - Files.delete(existingFile); - } + journalPath = IndexJournal.allocateName(indexArea); + page = IndexJournal.numPages(journalPath); - indexWriter = new IndexJournalWriterPagingImpl(indexArea); + switchToNextVersion(); + + logger.info("Creating Journal Writer {}", indexArea); } - public void putWords(long combinedId, - int features, - DocumentMetadata metadata, - DocumentKeywords wordSet) { + private void switchToNextVersion() throws IOException { + if (currentWriter != null) { + currentWriter.close(); + } - putWords(combinedId, features, metadata.encode(), wordSet); + currentWriter = new IndexJournalSlopWriter(journalPath, page++); } @SneakyThrows - public void putWords(long combinedId, - int features, - long metadata, - DocumentKeywords wordSet) { - - if (wordSet.isEmpty()) { - logger.info("Skipping zero-length word set for {}", combinedId); - return; - } - - if (combinedId <= 0) { - logger.warn("Bad ID: {}", combinedId); - return; - } - - var pointer = wordSet.newPointer(); - - while (pointer.hasMore()) { - int i = 0; - - while (i < buffer.length - && pointer.advancePointer()) - { - final long hashedKeyword = hasher.hashKeyword(pointer.getKeyword()); - - buffer[i++] = hashedKeyword; - buffer[i++] = pointer.getMetadata(); - } - - var entry = new IndexJournalEntryData(i, buffer); - var header = new IndexJournalEntryHeader(combinedId, features, metadata); - - indexWriter.put(header, entry); + public void putWords(long header, SlopDocumentRecord.KeywordsProjection data) + { + if (++recordsWritten > 200_000) { + recordsWritten = 0; + switchToNextVersion(); } + currentWriter.put(header, data); } - public void close() throws Exception { - indexWriter.close(); + public void close() throws IOException { + currentWriter.close(); } } diff --git a/code/processes/loading-process/java/nu/marginalia/loading/LoaderInputData.java b/code/processes/loading-process/java/nu/marginalia/loading/LoaderInputData.java index 21f878f0..b874bf05 100644 --- a/code/processes/loading-process/java/nu/marginalia/loading/LoaderInputData.java +++ b/code/processes/loading-process/java/nu/marginalia/loading/LoaderInputData.java @@ -1,6 +1,10 @@ package nu.marginalia.loading; import nu.marginalia.io.processed.ProcessedDataFileNames; +import nu.marginalia.model.processed.SlopDocumentRecord; +import nu.marginalia.model.processed.SlopDomainLinkRecord; +import nu.marginalia.model.processed.SlopDomainRecord; +import nu.marginalia.slop.SlopTable; import nu.marginalia.worklog.BatchingWorkLogInspector; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ 
-39,26 +43,35 @@ public class LoaderInputData { lastGoodBatch.put(singleSource, lastBatch); } - public Collection listDomainFiles() { - List pathsAll = new ArrayList<>(); + public Collection> listDomainPages() { + List> pathsAll = new ArrayList<>(); + for (var source : sourceDirectories) { - pathsAll.addAll(ProcessedDataFileNames.listDomainFiles(source, lastGoodBatch.get(source))); + for (int i = 0; i < lastGoodBatch.get(source); i++) { + pathsAll.add(new SlopTable.Ref<>(ProcessedDataFileNames.domainFileName(source), i)); + } } return pathsAll; } - public Collection listDomainLinkFiles() { - List pathsAll = new ArrayList<>(); + public Collection> listDomainLinkPages() { + List> pathsAll = new ArrayList<>(); + for (var source : sourceDirectories) { - pathsAll.addAll(ProcessedDataFileNames.listDomainLinkFiles(source, lastGoodBatch.get(source))); + for (int i = 0; i < lastGoodBatch.get(source); i++) { + pathsAll.add(new SlopTable.Ref<>(ProcessedDataFileNames.domainLinkFileName(source), i)); + } } return pathsAll; } - public Collection listDocumentFiles() { - List pathsAll = new ArrayList<>(); + public Collection> listDocumentFiles() { + List> pathsAll = new ArrayList<>(); + for (var source : sourceDirectories) { - pathsAll.addAll(ProcessedDataFileNames.listDocumentFiles(source, lastGoodBatch.get(source))); + for (int i = 0; i < lastGoodBatch.get(source); i++) { + pathsAll.add(new SlopTable.Ref<>(ProcessedDataFileNames.documentFileName(source), i)); + } } return pathsAll; } diff --git a/code/processes/loading-process/java/nu/marginalia/loading/LoaderMain.java b/code/processes/loading-process/java/nu/marginalia/loading/LoaderMain.java index 617088de..4171337f 100644 --- a/code/processes/loading-process/java/nu/marginalia/loading/LoaderMain.java +++ b/code/processes/loading-process/java/nu/marginalia/loading/LoaderMain.java @@ -8,8 +8,6 @@ import lombok.Getter; import lombok.SneakyThrows; import nu.marginalia.ProcessConfiguration; import nu.marginalia.ProcessConfigurationModule; -import nu.marginalia.service.ProcessMainClass; -import nu.marginalia.storage.FileStorageService; import nu.marginalia.linkdb.docs.DocumentDbWriter; import nu.marginalia.loading.documents.DocumentLoaderService; import nu.marginalia.loading.documents.KeywordLoaderService; @@ -22,7 +20,9 @@ import nu.marginalia.mq.MqMessageState; import nu.marginalia.mq.inbox.MqInboxResponse; import nu.marginalia.mq.inbox.MqSingleShotInbox; import nu.marginalia.process.control.ProcessHeartbeatImpl; +import nu.marginalia.service.ProcessMainClass; import nu.marginalia.service.module.DatabaseModule; +import nu.marginalia.storage.FileStorageService; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -45,7 +45,6 @@ public class LoaderMain extends ProcessMainClass { private final MessageQueueFactory messageQueueFactory; private final FileStorageService fileStorageService; private final DocumentDbWriter documentDbWriter; - private final LoaderIndexJournalWriter journalWriter; private final DomainLoaderService domainService; private final DomainLinksLoaderService linksService; private final KeywordLoaderService keywordLoaderService; @@ -79,7 +78,6 @@ public class LoaderMain extends ProcessMainClass { MessageQueueFactory messageQueueFactory, FileStorageService fileStorageService, DocumentDbWriter documentDbWriter, - LoaderIndexJournalWriter journalWriter, DomainLoaderService domainService, DomainLinksLoaderService linksService, KeywordLoaderService keywordLoaderService, @@ -92,7 +90,6 @@ public class LoaderMain extends ProcessMainClass { 
this.messageQueueFactory = messageQueueFactory; this.fileStorageService = fileStorageService; this.documentDbWriter = documentDbWriter; - this.journalWriter = journalWriter; this.domainService = domainService; this.linksService = linksService; this.keywordLoaderService = keywordLoaderService; @@ -106,7 +103,7 @@ public class LoaderMain extends ProcessMainClass { void run(LoadRequest instructions) { LoaderInputData inputData = instructions.getInputData(); - DomainIdRegistry domainIdRegistry = domainService.getOrCreateDomainIds(inputData); + DomainIdRegistry domainIdRegistry = domainService.getOrCreateDomainIds(heartbeat, inputData); try { var results = ForkJoinPool.commonPool() @@ -132,7 +129,7 @@ public class LoaderMain extends ProcessMainClass { logger.error("Error", ex); } finally { - journalWriter.close(); + keywordLoaderService.close(); documentDbWriter.close(); heartbeat.shutDown(); } diff --git a/code/processes/loading-process/java/nu/marginalia/loading/documents/DocumentLoaderService.java b/code/processes/loading-process/java/nu/marginalia/loading/documents/DocumentLoaderService.java index 5909a9aa..d96f1149 100644 --- a/code/processes/loading-process/java/nu/marginalia/loading/documents/DocumentLoaderService.java +++ b/code/processes/loading-process/java/nu/marginalia/loading/documents/DocumentLoaderService.java @@ -3,22 +3,22 @@ package nu.marginalia.loading.documents; import com.google.inject.Inject; import com.google.inject.Singleton; import lombok.SneakyThrows; -import nu.marginalia.io.processed.DocumentRecordParquetFileReader; import nu.marginalia.linkdb.docs.DocumentDbWriter; import nu.marginalia.linkdb.model.DocdbUrlDetail; import nu.marginalia.loading.LoaderInputData; import nu.marginalia.loading.domains.DomainIdRegistry; import nu.marginalia.model.EdgeUrl; import nu.marginalia.model.id.UrlIdCodec; -import nu.marginalia.model.processed.DocumentRecordMetadataProjection; +import nu.marginalia.model.processed.SlopDocumentRecord; import nu.marginalia.process.control.ProcessHeartbeat; +import nu.marginalia.slop.SlopTable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; -import java.nio.file.Path; import java.sql.SQLException; import java.util.ArrayList; +import java.util.Collection; import java.util.List; @Singleton @@ -38,18 +38,27 @@ public class DocumentLoaderService { LoaderInputData inputData) throws IOException, SQLException { - var documentFiles = inputData.listDocumentFiles(); + Collection> pageRefs = inputData.listDocumentFiles(); try (var taskHeartbeat = processHeartbeat.createAdHocTaskHeartbeat("DOCUMENTS")) { int processed = 0; - for (var file : documentFiles) { - taskHeartbeat.progress("LOAD", processed++, documentFiles.size()); + for (var pageRef : pageRefs) { + taskHeartbeat.progress("LOAD", processed++, pageRefs.size()); - loadDocumentsFromFile(domainIdRegistry, file); + try (var reader = new SlopDocumentRecord.MetadataReader(pageRef); + LinkdbLoader loader = new LinkdbLoader(domainIdRegistry)) + { + while (reader.hasMore()) { + loader.accept(reader.next()); + } + } } - taskHeartbeat.progress("LOAD", processed, documentFiles.size()); + taskHeartbeat.progress("LOAD", processed, pageRefs.size()); + } catch (IOException e) { + logger.error("Failed to load documents", e); + throw e; } logger.info("Finished"); @@ -57,19 +66,6 @@ public class DocumentLoaderService { return true; } - private void loadDocumentsFromFile(DomainIdRegistry domainIdRegistry, Path file) - throws SQLException, IOException - { - try (var stream = 
DocumentRecordParquetFileReader.streamMetadataProjection(file); - LinkdbLoader loader = new LinkdbLoader(domainIdRegistry) - ) - { - logger.info("Loading document meta from {}", file); - - stream.forEach(loader::accept); - } - } - class LinkdbLoader implements AutoCloseable { private final DomainIdRegistry domainIdRegistry; private final List details = new ArrayList<>(1000); @@ -79,25 +75,31 @@ public class DocumentLoaderService { } @SneakyThrows - public void accept(DocumentRecordMetadataProjection projection) + public void accept(SlopDocumentRecord.MetadataProjection projection) { long urlId = UrlIdCodec.encodeId( - domainIdRegistry.getDomainId(projection.domain), - projection.ordinal + domainIdRegistry.getDomainId(projection.domain()), + projection.ordinal() ); - details.add(new DocdbUrlDetail( + var parsedUrl = EdgeUrl.parse(projection.url()); + if (parsedUrl.isEmpty()) { + logger.error("Failed to parse URL: {}", projection.url()); + return; + } + + documentDbWriter.add(new DocdbUrlDetail( urlId, - new EdgeUrl(projection.url), - projection.title, - projection.description, - projection.quality, - projection.htmlStandard, - projection.htmlFeatures, - projection.pubYear, - projection.hash, - projection.getLength() + parsedUrl.get(), + projection.title(), + projection.description(), + projection.quality(), + projection.htmlStandard(), + projection.htmlFeatures(), + projection.pubYear(), + projection.hash(), + projection.length() )); if (details.size() > 100) { diff --git a/code/processes/loading-process/java/nu/marginalia/loading/documents/KeywordLoaderService.java b/code/processes/loading-process/java/nu/marginalia/loading/documents/KeywordLoaderService.java index 516eb189..fadbd64c 100644 --- a/code/processes/loading-process/java/nu/marginalia/loading/documents/KeywordLoaderService.java +++ b/code/processes/loading-process/java/nu/marginalia/loading/documents/KeywordLoaderService.java @@ -2,19 +2,18 @@ package nu.marginalia.loading.documents; import com.google.inject.Inject; import com.google.inject.Singleton; -import nu.marginalia.io.processed.DocumentRecordParquetFileReader; -import nu.marginalia.keyword.model.DocumentKeywords; import nu.marginalia.loading.LoaderIndexJournalWriter; import nu.marginalia.loading.LoaderInputData; import nu.marginalia.loading.domains.DomainIdRegistry; import nu.marginalia.model.id.UrlIdCodec; -import nu.marginalia.model.processed.DocumentRecordKeywordsProjection; +import nu.marginalia.model.processed.SlopDocumentRecord; import nu.marginalia.process.control.ProcessHeartbeat; +import nu.marginalia.slop.SlopTable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; -import java.nio.file.Path; +import java.util.Collection; @Singleton public class KeywordLoaderService { @@ -31,47 +30,41 @@ public class KeywordLoaderService { LoaderInputData inputData) throws IOException { try (var task = heartbeat.createAdHocTaskHeartbeat("KEYWORDS")) { - var documentFiles = inputData.listDocumentFiles(); + Collection> documentFiles = inputData.listDocumentFiles(); int processed = 0; - for (var file : documentFiles) { + for (SlopTable.Ref pageRef : documentFiles) { task.progress("LOAD", processed++, documentFiles.size()); - loadKeywordsFromFile(domainIdRegistry, file); + try (var keywordsReader = new SlopDocumentRecord.KeywordsProjectionReader(pageRef)) { + logger.info("Loading keywords from {}", pageRef); + + while (keywordsReader.hasMore()) { + var projection = keywordsReader.next(); + + long combinedId = UrlIdCodec.encodeId( + 
domainIdRegistry.getDomainId(projection.domain()), + projection.ordinal()); + + writer.putWords(combinedId, projection); + } + } } task.progress("LOAD", processed, documentFiles.size()); } + catch (IOException e) { + logger.error("Failed to load keywords", e); + throw e; + } logger.info("Finished"); return true; } - private void loadKeywordsFromFile(DomainIdRegistry domainIdRegistry, Path file) throws IOException { - try (var stream = DocumentRecordParquetFileReader.streamKeywordsProjection(file)) { - logger.info("Loading keywords from {}", file); - stream.filter(DocumentRecordKeywordsProjection::hasKeywords) - .forEach(proj -> insertKeywords(domainIdRegistry, proj)); - } - } - - private void insertKeywords(DomainIdRegistry domainIdRegistry, - DocumentRecordKeywordsProjection projection) - { - long combinedId = UrlIdCodec.encodeId( - domainIdRegistry.getDomainId(projection.domain), - projection.ordinal); - - var words = new DocumentKeywords( - projection.words.toArray(String[]::new), - projection.metas.toArray() - ); - - writer.putWords(combinedId, - projection.htmlFeatures, - projection.documentMetadata, - words); + public void close() throws IOException { + writer.close(); } } \ No newline at end of file diff --git a/code/processes/loading-process/java/nu/marginalia/loading/domains/DomainLoaderService.java b/code/processes/loading-process/java/nu/marginalia/loading/domains/DomainLoaderService.java index 6739f8e7..66389062 100644 --- a/code/processes/loading-process/java/nu/marginalia/loading/domains/DomainLoaderService.java +++ b/code/processes/loading-process/java/nu/marginalia/loading/domains/DomainLoaderService.java @@ -4,13 +4,13 @@ import com.google.inject.Inject; import com.google.inject.Singleton; import com.zaxxer.hikari.HikariDataSource; import nu.marginalia.ProcessConfiguration; -import nu.marginalia.io.processed.DomainLinkRecordParquetFileReader; -import nu.marginalia.io.processed.DomainRecordParquetFileReader; import nu.marginalia.loading.LoaderInputData; import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.processed.DomainRecord; -import nu.marginalia.model.processed.DomainWithIp; +import nu.marginalia.model.processed.SlopDomainLinkRecord; +import nu.marginalia.model.processed.SlopDomainRecord; +import nu.marginalia.process.control.ProcessHeartbeat; import nu.marginalia.process.control.ProcessHeartbeatImpl; +import nu.marginalia.slop.SlopTable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -18,7 +18,9 @@ import java.io.IOException; import java.sql.Connection; import java.sql.PreparedStatement; import java.sql.SQLException; -import java.util.*; +import java.util.Collection; +import java.util.HashSet; +import java.util.Set; @Singleton public class DomainLoaderService { @@ -35,97 +37,134 @@ public class DomainLoaderService { this.nodeId = processConfiguration.node(); } - /** Read the domain names from each parquet file + enum Steps { + PREP_DATA, + UPDATE_AFFINITY_AND_IP, + FETCH_ALL, + DONE + } + + /** Read the domain names from each input file * compare with SQL domain database, fetch those * that exist, insert those that don't. 
*/ - public DomainIdRegistry getOrCreateDomainIds(LoaderInputData inputData) + public DomainIdRegistry getOrCreateDomainIds(ProcessHeartbeatImpl heartbeat, LoaderInputData inputData) throws IOException, SQLException { Set domainNamesAll = new HashSet<>(100_000); DomainIdRegistry ret = new DomainIdRegistry(); try (var conn = dataSource.getConnection(); - var selectStmt = conn.prepareStatement(""" - SELECT ID FROM EC_DOMAIN WHERE DOMAIN_NAME=? - """) - ) { + var taskHeartbeat = heartbeat.createProcessTaskHeartbeat(Steps.class, "DOMAIN_IDS")) + { + taskHeartbeat.progress(Steps.PREP_DATA); - try (var inserter = new DomainInserter(conn, nodeId)) { - for (var domainWithIp : readBasicDomainInformation(inputData)) { - inserter.accept(new EdgeDomain(domainWithIp.domain)); - domainNamesAll.add(domainWithIp.domain); - } - } - try (var inserter = new DomainInserter(conn, -1)) { - for (var domain : readReferencedDomainNames(inputData)) { - inserter.accept(new EdgeDomain(domain)); - domainNamesAll.add(domain); + Collection> domainPageRefs = inputData.listDomainPages(); + Collection> domainLinkPageRefs = inputData.listDomainLinkPages(); + + // Ensure that the domains we've just crawled are in the domain database to this node + try (var inserter = new DomainInserter(conn, nodeId); + var processHeartbeat = heartbeat.createAdHocTaskHeartbeat("INSERT_CRAWLED_DOMAINS")) { + // Add domain names from this data set with the current node affinity + int pageIdx = 0; + + for (SlopTable.Ref page : inputData.listDomainPages()) { + processHeartbeat.progress("INSERT", pageIdx++, domainPageRefs.size()); + + try (var reader = new SlopDomainRecord.DomainNameReader(page)) { + while (reader.hasMore()) { + String domainName = reader.next(); + if (domainNamesAll.add(domainName)) { + inserter.accept(new EdgeDomain(domainName)); + } + } + } } } - try (var updater = new DomainAffinityAndIpUpdater(conn, nodeId)) { - for (var domainWithIp : readBasicDomainInformation(inputData)) { - updater.accept(new EdgeDomain(domainWithIp.domain), domainWithIp.ip); + // Add domains that are linked to from the domains we've just crawled, but with -1 affinity meaning they + // can be grabbed by any index node + try (var inserter = new DomainInserter(conn, -1); + var processHeartbeat = heartbeat.createAdHocTaskHeartbeat("INSERT_LINKED_DOMAINS")) { + // Add linked domains, but with -1 affinity meaning they can be grabbed by any index node + int pageIdx = 0; + + for (SlopTable.Ref page : inputData.listDomainLinkPages()) { + processHeartbeat.progress("INSERT", pageIdx++, domainLinkPageRefs.size()); + + try (var reader = new SlopDomainLinkRecord.Reader(page)) { + while (reader.hasMore()) { + SlopDomainLinkRecord record = reader.next(); + String domainName = record.dest(); + if (domainNamesAll.add(domainName)) { + inserter.accept(new EdgeDomain(domainName)); + } + } + } } } - selectStmt.setFetchSize(1000); - for (var domain : domainNamesAll) { - selectStmt.setString(1, domain); + taskHeartbeat.progress(Steps.UPDATE_AFFINITY_AND_IP); + + // Update the node affinity and IP address for each domain we have information about + try (var processHeartbeat = heartbeat.createAdHocTaskHeartbeat("UPDATE_AFFINITY_AND_IP")) { + // Update the node affinity and IP address for each domain + int pageIdx = 0; + + for (SlopTable.Ref page : inputData.listDomainPages()) { + processHeartbeat.progress("UPDATE", pageIdx++, domainPageRefs.size()); + + try (var updater = new DomainAffinityAndIpUpdater(conn, nodeId); + var reader = new SlopDomainRecord.DomainWithIpReader(page) + 
) { + while (reader.hasMore()) { + var domainWithIp = reader.next(); + updater.accept(new EdgeDomain(domainWithIp.domain()), domainWithIp.ip()); + } + } + } + } + + taskHeartbeat.progress(Steps.FETCH_ALL); + + // Fetch the ID for all domains that we have information about + try (var selectStmt = conn.prepareStatement("SELECT ID, LOWER(DOMAIN_NAME) FROM EC_DOMAIN")) { + + selectStmt.setFetchSize(1000); + var rs = selectStmt.executeQuery(); - if (rs.next()) { - ret.add(domain, rs.getInt(1)); - } - else { - logger.error("Unknown domain {}", domain); + while (rs.next()) { + String domain = rs.getString(2); + + if (domainNamesAll.contains(domain)) { + ret.add(domain, rs.getInt(1)); + } } } + + taskHeartbeat.progress(Steps.DONE); } return ret; } - Collection readBasicDomainInformation(LoaderInputData inputData) throws IOException { - final Set domainsAll = new HashSet<>(100_000); - - var domainFiles = inputData.listDomainFiles(); - for (var file : domainFiles) { - domainsAll.addAll(DomainRecordParquetFileReader.getBasicDomainInformation(file)); - } - - return domainsAll; - } - - Collection readReferencedDomainNames(LoaderInputData inputData) throws IOException { - final Set domainNamesAll = new HashSet<>(100_000); - - var linkFiles = inputData.listDomainLinkFiles(); - for (var file : linkFiles) { - domainNamesAll.addAll(DomainLinkRecordParquetFileReader.getDestDomainNames(file)); - } - - return domainNamesAll; - } - - public boolean loadDomainMetadata(DomainIdRegistry domainIdRegistry, ProcessHeartbeatImpl heartbeat, LoaderInputData inputData) { - - var files = inputData.listDomainFiles(); + public boolean loadDomainMetadata(DomainIdRegistry domainIdRegistry, ProcessHeartbeat heartbeat, LoaderInputData inputData) { try (var taskHeartbeat = heartbeat.createAdHocTaskHeartbeat("UPDATE-META")) { int processed = 0; - for (var file : files) { - taskHeartbeat.progress("UPDATE-META", processed++, files.size()); + Collection> pages = inputData.listDomainPages(); + for (var page : pages) { + taskHeartbeat.progress("UPDATE-META", processed++, pages.size()); - try (var stream = DomainRecordParquetFileReader.stream(file); - var updater = new DomainMetadataUpdater(dataSource, domainIdRegistry) - ) { - stream.forEach(updater::accept); + try (var reader = new SlopDomainRecord.Reader(page); + var updater = new DomainMetadataUpdater(dataSource, domainIdRegistry)) + { + reader.forEach(updater::accept); } } - taskHeartbeat.progress("UPDATE-META", processed, files.size()); + taskHeartbeat.progress("UPDATE-META", processed, pages.size()); } catch (Exception ex) { logger.error("Failed inserting metadata!", ex); @@ -222,12 +261,12 @@ public class DomainLoaderService { """); } - public void accept(DomainRecord domainRecord) { + public void accept(SlopDomainRecord domainRecord) { try { - updateStatement.setInt(1, idRegistry.getDomainId(domainRecord.domain)); - updateStatement.setInt(2, domainRecord.visitedUrls); - updateStatement.setInt(3, domainRecord.goodUrls); - updateStatement.setInt(4, domainRecord.knownUrls); + updateStatement.setInt(1, idRegistry.getDomainId(domainRecord.domain())); + updateStatement.setInt(2, domainRecord.visitedUrls()); + updateStatement.setInt(3, domainRecord.goodUrls()); + updateStatement.setInt(4, domainRecord.knownUrls()); updateStatement.addBatch(); if (++i > 1000) { diff --git a/code/processes/loading-process/java/nu/marginalia/loading/links/DomainLinksLoaderService.java b/code/processes/loading-process/java/nu/marginalia/loading/links/DomainLinksLoaderService.java index 06bf4c95..bc4479d6 
100644 --- a/code/processes/loading-process/java/nu/marginalia/loading/links/DomainLinksLoaderService.java +++ b/code/processes/loading-process/java/nu/marginalia/loading/links/DomainLinksLoaderService.java @@ -3,17 +3,17 @@ package nu.marginalia.loading.links; import com.google.inject.Inject; import com.google.inject.Singleton; import lombok.SneakyThrows; -import nu.marginalia.io.processed.DomainLinkRecordParquetFileReader; import nu.marginalia.linkgraph.io.DomainLinksWriter; import nu.marginalia.loading.LoaderInputData; import nu.marginalia.loading.domains.DomainIdRegistry; -import nu.marginalia.model.processed.DomainLinkRecord; +import nu.marginalia.model.processed.SlopDomainLinkRecord; import nu.marginalia.process.control.ProcessHeartbeat; +import nu.marginalia.slop.SlopTable; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; -import java.nio.file.Path; +import java.util.Collection; @Singleton public class DomainLinksLoaderService { @@ -31,34 +31,35 @@ public class DomainLinksLoaderService { ProcessHeartbeat heartbeat, LoaderInputData inputData) throws IOException { - try (var task = heartbeat.createAdHocTaskHeartbeat("LINKS")) { - var linkFiles = inputData.listDomainLinkFiles(); + try (var task = heartbeat.createAdHocTaskHeartbeat("LINKS"); + var linkLoader = new LinkLoader(domainIdRegistry)) + { + Collection> pageRefs = inputData.listDomainLinkPages(); int processed = 0; - for (var file : linkFiles) { - task.progress("LOAD", processed++, linkFiles.size()); + for (var pageRef : pageRefs) { + task.progress("LOAD", processed++, pageRefs.size()); - loadLinksFromFile(domainIdRegistry, file); + try (var domainLinkReader = new SlopDomainLinkRecord.Reader(pageRef)) + { + domainLinkReader.forEach(linkLoader::accept); + } } - task.progress("LOAD", processed, linkFiles.size()); + task.progress("LOAD", processed, pageRefs.size()); + } + catch (IOException e) { + logger.error("Failed to load links", e); + throw e; } logger.info("Finished"); return true; } - private void loadLinksFromFile(DomainIdRegistry domainIdRegistry, Path file) throws IOException { - try (var domainStream = DomainLinkRecordParquetFileReader.stream(file); - var linkLoader = new LinkLoader(domainIdRegistry)) - { - logger.info("Loading links from {}", file); - domainStream.forEach(linkLoader::accept); - } - } - class LinkLoader implements AutoCloseable { + private class LinkLoader implements AutoCloseable { private final DomainIdRegistry domainIdRegistry; public LinkLoader(DomainIdRegistry domainIdRegistry) { @@ -66,10 +67,10 @@ public class DomainLinksLoaderService { } @SneakyThrows - void accept(DomainLinkRecord record) { + void accept(String source, String dest) { domainLinkDbWriter.write( - domainIdRegistry.getDomainId(record.source), - domainIdRegistry.getDomainId(record.dest) + domainIdRegistry.getDomainId(source), + domainIdRegistry.getDomainId(dest) ); } diff --git a/code/processes/loading-process/test/nu/marginalia/loading/domains/DomainLoaderServiceTest.java b/code/processes/loading-process/test/nu/marginalia/loading/domains/DomainLoaderServiceTest.java deleted file mode 100644 index fda0e9b6..00000000 --- a/code/processes/loading-process/test/nu/marginalia/loading/domains/DomainLoaderServiceTest.java +++ /dev/null @@ -1,102 +0,0 @@ -package nu.marginalia.loading.domains; - -import com.google.common.collect.Lists; -import com.google.common.collect.Sets; -import nu.marginalia.ProcessConfiguration; -import nu.marginalia.io.processed.DomainLinkRecordParquetFileWriter; -import 
nu.marginalia.io.processed.DomainRecordParquetFileWriter; -import nu.marginalia.io.processed.ProcessedDataFileNames; -import nu.marginalia.loading.LoaderInputData; -import nu.marginalia.model.processed.DomainLinkRecord; -import nu.marginalia.model.processed.DomainRecord; -import nu.marginalia.process.control.ProcessAdHocTaskHeartbeat; -import nu.marginalia.process.control.ProcessHeartbeat; -import org.junit.jupiter.api.*; -import org.mockito.Mockito; -import org.testcontainers.junit.jupiter.Testcontainers; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.*; -import java.util.stream.Collectors; - -import static org.junit.jupiter.api.Assertions.*; - -@Tag("slow") -@Testcontainers -class DomainLoaderServiceTest { - List toDelete = new ArrayList<>(); - ProcessHeartbeat heartbeat; - - @BeforeEach - public void setUp() { - heartbeat = Mockito.mock(ProcessHeartbeat.class); - - Mockito.when(heartbeat.createAdHocTaskHeartbeat(Mockito.anyString())).thenReturn( - Mockito.mock(ProcessAdHocTaskHeartbeat.class) - ); - } - - @AfterEach - public void tearDown() throws IOException { - for (var path : Lists.reverse(toDelete)) { - Files.deleteIfExists(path); - } - - toDelete.clear(); - } - - @Test - void readDomainNames() throws IOException { - Path workDir = Files.createTempDirectory(getClass().getSimpleName()); - Path parquetFile1 = ProcessedDataFileNames.domainFileName(workDir, 0); - Path parquetFile2 = ProcessedDataFileNames.domainFileName(workDir, 1); - Path parquetFile3 = ProcessedDataFileNames.domainLinkFileName(workDir, 0); - - toDelete.add(workDir); - toDelete.add(parquetFile1); - toDelete.add(parquetFile2); - toDelete.add(parquetFile3); - - // Prep by creating two parquet files with domains - // and one with domain links - - List domains1 = List.of("www.marginalia.nu", "memex.marginalia.nu", "search.marginalia.nu"); - List domains2 = List.of("wiby.me", "www.mojeek.com", "www.altavista.com"); - List linkDomains = List.of("maya.land", "xkcd.com", "aaronsw.com"); - - try (var pw = new DomainRecordParquetFileWriter(parquetFile1)) { - for (var domain : domains1) { - pw.write(dr(domain)); - } - } - try (var pw = new DomainRecordParquetFileWriter(parquetFile2)) { - for (var domain : domains2) { - pw.write(dr(domain)); - } - } - try (var pw = new DomainLinkRecordParquetFileWriter(parquetFile3)) { - for (var domain : linkDomains) { - pw.write(dl(domain)); - } - } - // Read them - var domainService = new DomainLoaderService(null, new ProcessConfiguration("test", 1, UUID.randomUUID())); - - // Verify - Set expectedDomains1 = Sets.union(new HashSet<>(domains1), new HashSet<>(domains2)); - assertEquals(expectedDomains1, domainService.readBasicDomainInformation(new LoaderInputData(workDir, 2)).stream().map(d -> d.domain).collect(Collectors.toSet())); - - Set expectedDomains2 = new HashSet<>(linkDomains); - assertEquals(expectedDomains2, domainService.readReferencedDomainNames(new LoaderInputData(workDir, 2))); - } - - private DomainRecord dr(String domainName) { - return new DomainRecord(domainName, 0, 0, 0, null, null, null, null); - } - - private DomainLinkRecord dl(String destDomainName) { - return new DomainLinkRecord("www.marginalia.nu", destDomainName); - } -} \ No newline at end of file diff --git a/code/processes/loading-process/test/nu/marginalia/loading/loader/LoaderIndexJournalWriterTest.java b/code/processes/loading-process/test/nu/marginalia/loading/loader/LoaderIndexJournalWriterTest.java deleted file mode 100644 index 568981c6..00000000 --- 
a/code/processes/loading-process/test/nu/marginalia/loading/loader/LoaderIndexJournalWriterTest.java +++ /dev/null @@ -1,82 +0,0 @@ -package nu.marginalia.loading.loader; - -import nu.marginalia.storage.FileStorageService; -import nu.marginalia.storage.model.FileStorageBase; -import nu.marginalia.storage.model.FileStorageBaseType; -import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile; -import nu.marginalia.keyword.model.DocumentKeywords; -import nu.marginalia.loading.LoaderIndexJournalWriter; -import nu.marginalia.model.idx.DocumentMetadata; -import nu.marginalia.index.journal.IndexJournalFileNames; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; -import org.mockito.Mockito; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.sql.SQLException; -import java.util.ArrayList; -import java.util.List; -import java.util.stream.LongStream; - -import static org.junit.jupiter.api.Assertions.*; - -class LoaderIndexJournalWriterTest { - - Path tempDir; - LoaderIndexJournalWriter writer; - @BeforeEach - public void setUp() throws IOException, SQLException { - tempDir = Files.createTempDirectory(getClass().getSimpleName()); - FileStorageService storageService = Mockito.mock(FileStorageService.class); - - Mockito.when(storageService.getStorageBase(FileStorageBaseType.CURRENT)).thenReturn(new FileStorageBase(null, null, 1,null, tempDir.toString())); - - writer = new LoaderIndexJournalWriter(storageService); - } - - @AfterEach - public void tearDown() throws Exception { - writer.close(); - List junk = Files.list(tempDir.resolve("iw")).toList(); - for (var item : junk) - Files.delete(item); - Files.delete(tempDir.resolve("iw")); - Files.delete(tempDir); - } - - @Test - public void testBreakup() throws Exception { - String[] keywords = new String[2000]; - long[] metadata = new long[2000]; - for (int i = 0; i < 2000; i++) { - keywords[i] = Integer.toString(i); - metadata[i] = i+1; - } - DocumentKeywords words = new DocumentKeywords(keywords, metadata); - writer.putWords(1, 0, new DocumentMetadata(0), - words); - - writer.close(); - - List journalFiles = IndexJournalFileNames.findJournalFiles(tempDir.resolve("iw")); - assertEquals(1, journalFiles.size()); - - var reader = new IndexJournalReaderSingleFile(journalFiles.get(0)); - List docIds = new ArrayList<>(); - reader.forEachDocId(docIds::add); - assertEquals(List.of(1L, 1L), docIds); - - List metas = new ArrayList(); - var ptr = reader.newPointer(); - while (ptr.nextDocument()) { - while (ptr.nextRecord()) { - metas.add(ptr.wordMeta()); - } - } - - assertEquals(LongStream.of(metadata).boxed().toList(), metas); - } -} \ No newline at end of file diff --git a/code/process-mqapi/build.gradle b/code/processes/process-mq-api/build.gradle similarity index 91% rename from code/process-mqapi/build.gradle rename to code/processes/process-mq-api/build.gradle index 339c52c8..b6881432 100644 --- a/code/process-mqapi/build.gradle +++ b/code/processes/process-mq-api/build.gradle @@ -11,6 +11,8 @@ java { } } +jar.archiveBaseName = 'process-mqapi' + apply from: "$rootProject.projectDir/srcsets.gradle" dependencies { diff --git a/code/process-mqapi/java/nu/marginalia/mqapi/ProcessInboxNames.java b/code/processes/process-mq-api/java/nu/marginalia/mqapi/ProcessInboxNames.java similarity index 100% rename from code/process-mqapi/java/nu/marginalia/mqapi/ProcessInboxNames.java rename to 
code/processes/process-mq-api/java/nu/marginalia/mqapi/ProcessInboxNames.java diff --git a/code/process-mqapi/java/nu/marginalia/mqapi/converting/ConvertAction.java b/code/processes/process-mq-api/java/nu/marginalia/mqapi/converting/ConvertAction.java similarity index 100% rename from code/process-mqapi/java/nu/marginalia/mqapi/converting/ConvertAction.java rename to code/processes/process-mq-api/java/nu/marginalia/mqapi/converting/ConvertAction.java diff --git a/code/process-mqapi/java/nu/marginalia/mqapi/converting/ConvertRequest.java b/code/processes/process-mq-api/java/nu/marginalia/mqapi/converting/ConvertRequest.java similarity index 100% rename from code/process-mqapi/java/nu/marginalia/mqapi/converting/ConvertRequest.java rename to code/processes/process-mq-api/java/nu/marginalia/mqapi/converting/ConvertRequest.java diff --git a/code/process-mqapi/java/nu/marginalia/mqapi/crawling/CrawlRequest.java b/code/processes/process-mq-api/java/nu/marginalia/mqapi/crawling/CrawlRequest.java similarity index 100% rename from code/process-mqapi/java/nu/marginalia/mqapi/crawling/CrawlRequest.java rename to code/processes/process-mq-api/java/nu/marginalia/mqapi/crawling/CrawlRequest.java diff --git a/code/process-mqapi/java/nu/marginalia/mqapi/index/CreateIndexRequest.java b/code/processes/process-mq-api/java/nu/marginalia/mqapi/index/CreateIndexRequest.java similarity index 100% rename from code/process-mqapi/java/nu/marginalia/mqapi/index/CreateIndexRequest.java rename to code/processes/process-mq-api/java/nu/marginalia/mqapi/index/CreateIndexRequest.java diff --git a/code/process-mqapi/java/nu/marginalia/mqapi/index/IndexName.java b/code/processes/process-mq-api/java/nu/marginalia/mqapi/index/IndexName.java similarity index 100% rename from code/process-mqapi/java/nu/marginalia/mqapi/index/IndexName.java rename to code/processes/process-mq-api/java/nu/marginalia/mqapi/index/IndexName.java diff --git a/code/process-mqapi/java/nu/marginalia/mqapi/loading/LoadRequest.java b/code/processes/process-mq-api/java/nu/marginalia/mqapi/loading/LoadRequest.java similarity index 100% rename from code/process-mqapi/java/nu/marginalia/mqapi/loading/LoadRequest.java rename to code/processes/process-mq-api/java/nu/marginalia/mqapi/loading/LoadRequest.java diff --git a/code/processes/website-adjacencies-calculator/build.gradle b/code/processes/website-adjacencies-calculator/build.gradle index d983cf2d..37787b6a 100644 --- a/code/processes/website-adjacencies-calculator/build.gradle +++ b/code/processes/website-adjacencies-calculator/build.gradle @@ -42,6 +42,7 @@ dependencies { testImplementation libs.mockito testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4') + testImplementation libs.commons.codec testImplementation 'org.testcontainers:mariadb:1.17.4' testImplementation 'org.testcontainers:junit-jupiter:1.17.4' testImplementation project(':code:libraries:test-helpers') diff --git a/code/readme.md b/code/readme.md index f89805c5..ac65deb6 100644 --- a/code/readme.md +++ b/code/readme.md @@ -71,8 +71,6 @@ Features are relatively stand-alone components that serve some part of the domai but isolated. 
* [features-search](features-search) -* [features-crawl](features-crawl) -* [features-convert](features-convert) ### Libraries and primitives diff --git a/code/services-application/api-service/build.gradle b/code/services-application/api-service/build.gradle index 848091f3..0680f59e 100644 --- a/code/services-application/api-service/build.gradle +++ b/code/services-application/api-service/build.gradle @@ -3,7 +3,7 @@ plugins { id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.2' + id 'com.google.cloud.tools.jib' version '3.4.3' } java { @@ -52,6 +52,7 @@ dependencies { testImplementation libs.bundles.junit testImplementation libs.mockito testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4') + testImplementation libs.commons.codec testImplementation 'org.testcontainers:mariadb:1.17.4' testImplementation 'org.testcontainers:junit-jupiter:1.17.4' testImplementation project(':code:libraries:test-helpers') diff --git a/code/services-application/api-service/java/nu/marginalia/api/ApiSearchOperator.java b/code/services-application/api-service/java/nu/marginalia/api/ApiSearchOperator.java index e979b86f..95145de3 100644 --- a/code/services-application/api-service/java/nu/marginalia/api/ApiSearchOperator.java +++ b/code/services-application/api-service/java/nu/marginalia/api/ApiSearchOperator.java @@ -8,9 +8,9 @@ import nu.marginalia.api.model.ApiSearchResults; import nu.marginalia.api.searchquery.QueryClient; import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.query.SearchSetIdentifier; -import nu.marginalia.api.searchquery.model.results.*; +import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; import nu.marginalia.index.query.limit.QueryLimits; -import nu.marginalia.model.idx.WordMetadata; +import nu.marginalia.model.idx.WordFlags; import java.util.ArrayList; import java.util.Comparator; @@ -77,14 +77,8 @@ public class ApiSearchOperator { if (url.rawIndexResult != null) { List lst = new ArrayList<>(); for (var entry : url.rawIndexResult.keywordScores) { - var metadata = new WordMetadata(entry.encodedWordMetadata()); - - // Skip terms that don't appear anywhere - if (metadata.isEmpty()) - continue; - - Set flags = metadata.flagSet().stream().map(Object::toString).collect(Collectors.toSet()); - lst.add(new ApiSearchResultQueryDetails(entry.keyword, Long.bitCount(metadata.positions()), flags)); + Set flags = WordFlags.decode(entry.flags).stream().map(Object::toString).collect(Collectors.toSet()); + lst.add(new ApiSearchResultQueryDetails(entry.keyword, entry.positionCount, flags)); } details.add(lst); diff --git a/code/services-application/dating-service/build.gradle b/code/services-application/dating-service/build.gradle index 7ada938f..e8da6e4e 100644 --- a/code/services-application/dating-service/build.gradle +++ b/code/services-application/dating-service/build.gradle @@ -3,7 +3,7 @@ plugins { id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.2' + id 'com.google.cloud.tools.jib' version '3.4.3' } application { diff --git a/code/services-application/explorer-service/build.gradle b/code/services-application/explorer-service/build.gradle index d2d75348..d9442ebd 100644 --- a/code/services-application/explorer-service/build.gradle +++ b/code/services-application/explorer-service/build.gradle @@ -3,7 +3,7 @@ plugins { id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.2' + id 'com.google.cloud.tools.jib' 
version '3.4.3' } application { diff --git a/code/services-application/explorer-service/readme.md b/code/services-application/explorer-service/readme.md index cf258eb8..567886d6 100644 --- a/code/services-application/explorer-service/readme.md +++ b/code/services-application/explorer-service/readme.md @@ -8,4 +8,4 @@ Externally the service is available at [https://explore2.marginalia.nu/](https:/ * [features-search/screenshots](../../features-search/screenshots) * [features-search/random-websites](../../features-search/random-websites) -* [tools/website-adjacencies-calculator](../../tools/website-adjacencies-calculator) +* [processes/website-adjacencies-calculator](../../processes/website-adjacencies-calculator) diff --git a/code/services-application/search-service/build.gradle b/code/services-application/search-service/build.gradle index 1e9f527d..998b7f26 100644 --- a/code/services-application/search-service/build.gradle +++ b/code/services-application/search-service/build.gradle @@ -5,7 +5,7 @@ plugins { id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.2' + id 'com.google.cloud.tools.jib' version '3.4.3' } application { @@ -80,6 +80,7 @@ dependencies { testImplementation libs.mockito testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4') + testImplementation libs.commons.codec testImplementation 'org.testcontainers:mariadb:1.17.4' testImplementation 'org.testcontainers:junit-jupiter:1.17.4' testImplementation project(':code:libraries:test-helpers') diff --git a/code/services-application/search-service/java/nu/marginalia/search/SearchMain.java b/code/services-application/search-service/java/nu/marginalia/search/SearchMain.java index fe283471..37b9893d 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/SearchMain.java +++ b/code/services-application/search-service/java/nu/marginalia/search/SearchMain.java @@ -33,6 +33,7 @@ public class SearchMain extends MainClass { new ServiceDiscoveryModule(), new DatabaseModule(false) ); + // Orchestrate the boot order for the services var registry = injector.getInstance(ServiceRegistryIf.class); diff --git a/code/services-application/search-service/java/nu/marginalia/search/SearchOperator.java b/code/services-application/search-service/java/nu/marginalia/search/SearchOperator.java index a7a4a76b..80c6aa9f 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/SearchOperator.java +++ b/code/services-application/search-service/java/nu/marginalia/search/SearchOperator.java @@ -7,14 +7,20 @@ import nu.marginalia.WebsiteUrl; import nu.marginalia.api.math.MathClient; import nu.marginalia.api.searchquery.QueryClient; import nu.marginalia.api.searchquery.model.query.QueryResponse; +import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; +import nu.marginalia.bbpc.BrailleBlockPunchCards; import nu.marginalia.db.DbDomainQueries; +import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.model.EdgeDomain; +import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawl.DomainIndexingState; import nu.marginalia.search.command.SearchParameters; import nu.marginalia.search.model.ClusteredUrlDetails; import nu.marginalia.search.model.DecoratedSearchResults; import nu.marginalia.search.model.SearchFilters; import nu.marginalia.search.model.UrlDetails; -import nu.marginalia.search.svc.SearchQueryIndexService; +import nu.marginalia.search.results.UrlDeduplicator; +import 
nu.marginalia.search.svc.SearchQueryCountService; import nu.marginalia.search.svc.SearchUnitConversionService; import org.apache.logging.log4j.util.Strings; import org.slf4j.Logger; @@ -23,9 +29,11 @@ import org.slf4j.Marker; import org.slf4j.MarkerFactory; import javax.annotation.Nullable; -import java.lang.ref.WeakReference; import java.time.Duration; -import java.util.*; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Set; import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; @@ -41,30 +49,30 @@ public class SearchOperator { private final MathClient mathClient; private final DbDomainQueries domainQueries; private final QueryClient queryClient; - private final SearchQueryIndexService searchQueryService; private final SearchQueryParamFactory paramFactory; private final WebsiteUrl websiteUrl; private final SearchUnitConversionService searchUnitConversionService; + private final SearchQueryCountService searchVisitorCount; @Inject public SearchOperator(MathClient mathClient, DbDomainQueries domainQueries, QueryClient queryClient, - SearchQueryIndexService searchQueryService, SearchQueryParamFactory paramFactory, WebsiteUrl websiteUrl, - SearchUnitConversionService searchUnitConversionService) + SearchUnitConversionService searchUnitConversionService, + SearchQueryCountService searchVisitorCount + ) { this.mathClient = mathClient; this.domainQueries = domainQueries; this.queryClient = queryClient; - - this.searchQueryService = searchQueryService; this.paramFactory = paramFactory; this.websiteUrl = websiteUrl; this.searchUnitConversionService = searchUnitConversionService; + this.searchVisitorCount = searchVisitorCount; } public List doSiteSearch(String domain, @@ -74,7 +82,7 @@ public class SearchOperator { var queryParams = paramFactory.forSiteSearch(domain, domainId, count); var queryResponse = queryClient.search(queryParams); - return searchQueryService.getResultsFromQuery(queryResponse); + return getResultsFromQuery(queryResponse); } public List doBacklinkSearch(String domain) { @@ -82,64 +90,49 @@ public class SearchOperator { var queryParams = paramFactory.forBacklinkSearch(domain); var queryResponse = queryClient.search(queryParams); - return searchQueryService.getResultsFromQuery(queryResponse); + return getResultsFromQuery(queryResponse); } public List doLinkSearch(String source, String dest) { var queryParams = paramFactory.forLinkSearch(source, dest); var queryResponse = queryClient.search(queryParams); - return searchQueryService.getResultsFromQuery(queryResponse); + return getResultsFromQuery(queryResponse); } - private volatile WeakReference> oldResults = new WeakReference<>(Collections.emptyList()); - public DecoratedSearchResults doSearch(SearchParameters userParams) { + // The full user-facing search query does additional work to try to evaluate the query + // e.g. as a unit conversion query. This is done in parallel with the regular search. Future eval = searchUnitConversionService.tryEval(userParams.query()); - List clusteredResults; - QueryResponse queryResponse; - List problems; - String evalResult; - String focusDomain; + // Perform the regular search - if (userParams.poisonResults() && Math.random() > 0.1) { + var queryParams = paramFactory.forRegularSearch(userParams); + QueryResponse queryResponse = queryClient.search(queryParams); + var queryResults = getResultsFromQuery(queryResponse); - // For botnet users, we return random old query results. 
This is to make - // it harder for them to figure out if they are being rate limited. + // Cluster the results based on the query response + List clusteredResults = SearchResultClusterer + .selectStrategy(queryResponse) + .clusterResults(queryResults, 25); - clusteredResults = new ArrayList<>(Objects.requireNonNullElse(oldResults.get(), List.of())); + // Log the query and results - // Shuffle the results to make it harder to distinguish - Collections.shuffle(clusteredResults); + logger.info(queryMarker, "Human terms: {}", Strings.join(queryResponse.searchTermsHuman(), ',')); + logger.info(queryMarker, "Search Result Count: {}", queryResults.size()); - problems = List.of(); - evalResult = ""; - focusDomain = ""; - } else { - var queryParams = paramFactory.forRegularSearch(userParams); - queryResponse = queryClient.search(queryParams); - var queryResults = searchQueryService.getResultsFromQuery(queryResponse); + // Get the evaluation result and other data to return to the user + String evalResult = getFutureOrDefault(eval, ""); - logger.info(queryMarker, "Human terms: {}", Strings.join(queryResponse.searchTermsHuman(), ',')); - logger.info(queryMarker, "Search Result Count: {}", queryResults.size()); + String focusDomain = queryResponse.domain(); + int focusDomainId = focusDomain == null + ? -1 + : domainQueries.tryGetDomainId(new EdgeDomain(focusDomain)).orElse(-1); - evalResult = getFutureOrDefault(eval, ""); - - clusteredResults = SearchResultClusterer - .selectStrategy(queryResponse) - .clusterResults(queryResults, 25); - - focusDomain = queryResponse.domain(); - problems = getProblems(evalResult, queryResults, queryResponse); - - if (userParams.poisonResults()) { - // Save the results to feed to the botnet - oldResults = new WeakReference<>(clusteredResults); - } - } + List problems = getProblems(evalResult, queryResults, queryResponse); + // Return the results to the user return DecoratedSearchResults.builder() .params(userParams) .problems(problems) @@ -147,16 +140,114 @@ public class SearchOperator { .results(clusteredResults) .filters(new SearchFilters(websiteUrl, userParams)) .focusDomain(focusDomain) - .focusDomainId(getDomainId(focusDomain)) + .focusDomainId(focusDomainId) .build(); } + + public List getResultsFromQuery(QueryResponse queryResponse) { + final QueryLimits limits = queryResponse.specs().queryLimits; + final UrlDeduplicator deduplicator = new UrlDeduplicator(limits.resultsByDomain()); + + // Update the query count (this is what you see on the front page) + searchVisitorCount.registerQuery(); + + return queryResponse.results().stream() + .filter(deduplicator::shouldRetain) + .limit(limits.resultsTotal()) + .map(SearchOperator::createDetails) + .toList(); + } + + private static UrlDetails createDetails(DecoratedSearchResultItem item) { + return new UrlDetails( + item.documentId(), + item.domainId(), + cleanUrl(item.url), + item.title, + item.description, + item.format, + item.features, + DomainIndexingState.ACTIVE, + item.rankingScore, // termScore + item.resultsFromDomain, + BrailleBlockPunchCards.printBits(item.bestPositions, 64), + Long.bitCount(item.bestPositions), + item.rawIndexResult, + item.rawIndexResult.keywordScores + ); + } + + /** Replace nuisance domains with replacements where available */ + private static EdgeUrl cleanUrl(EdgeUrl url) { + String topdomain = url.domain.topDomain; + String subdomain = url.domain.subDomain; + String path = url.path; + + if (topdomain.equals("fandom.com")) { + int wikiIndex = path.indexOf("/wiki/"); + if (wikiIndex >= 0) { 
+ return new EdgeUrl("https", new EdgeDomain("breezewiki.com"), null, "/" + subdomain + path.substring(wikiIndex), null); + } + } + else if (topdomain.equals("medium.com")) { + if (!subdomain.isBlank()) { + return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, path, null); + } + else { + String article = path.substring(path.indexOf("/", 1)); + return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, article, null); + } + + } + return url; + } + + @SneakyThrows + private List getProblems(String evalResult, List queryResults, QueryResponse response) { + + // We don't debug the query if it's a site search + if (response.domain() == null) + return List.of(); + + final List problems = new ArrayList<>(response.problems()); + + if (queryResults.size() <= 5 && null == evalResult) { + problems.add("Try rephrasing the query, changing the word order or using synonyms to get different results."); + + // Try to spell check the search terms + var suggestions = getFutureOrDefault( + mathClient.spellCheck(response.searchTermsHuman()), + Map.of() + ); + + suggestions.forEach((term, suggestion) -> { + if (suggestion.size() > 1) { + String suggestionsStr = "\"%s\" could be spelled %s".formatted(term, suggestion.stream().map(s -> "\"" + s + "\"").collect(Collectors.joining(", "))); + problems.add(suggestionsStr); + } + }); + } + + Set representativeKeywords = response.getAllKeywords(); + if (representativeKeywords.size() > 1 && (representativeKeywords.contains("definition") || representativeKeywords.contains("define") || representativeKeywords.contains("meaning"))) + { + problems.add("Tip: Try using a query that looks like define:word if you want a dictionary definition"); + } + + return problems; + } + private T getFutureOrDefault(@Nullable Future fut, T defaultValue) { + return getFutureOrDefault(fut, Duration.ofMillis(50), defaultValue); + } + + private T getFutureOrDefault(@Nullable Future fut, Duration timeout, T defaultValue) { if (fut == null || fut.isCancelled()) { return defaultValue; } try { - return fut.get(50, TimeUnit.MILLISECONDS); + return fut.get(timeout.toMillis(), TimeUnit.MILLISECONDS); } catch (Exception ex) { logger.warn("Error fetching eval result", ex); @@ -164,56 +255,4 @@ public class SearchOperator { } } - private int getDomainId(String domain) { - if (domain == null) { - return -1; - } - - return domainQueries.tryGetDomainId(new EdgeDomain(domain)).orElse(-1); - } - - private List getProblems(String evalResult, List queryResults, QueryResponse response) { - final List problems = new ArrayList<>(response.problems()); - boolean siteSearch = response.domain() != null; - - if (!siteSearch) { - if (queryResults.size() <= 5 && null == evalResult) { - spellCheckTerms(response); - } - - if (queryResults.size() <= 5) { - problems.add("Try rephrasing the query, changing the word order or using synonyms to get different results. 
Tips."); - } - - Set representativeKeywords = response.getAllKeywords(); - if (representativeKeywords.size()>1 && (representativeKeywords.contains("definition") || representativeKeywords.contains("define") || representativeKeywords.contains("meaning"))) - { - problems.add("Tip: Try using a query that looks like define:word if you want a dictionary definition"); - } - } - - return problems; - } - - - @SneakyThrows - private void spellCheckTerms(QueryResponse response) { - var suggestions = mathClient - .spellCheck(response.searchTermsHuman(), Duration.ofMillis(20)); - - suggestions.entrySet() - .stream() - .filter(e -> e.getValue().size() > 1) - .map(e -> searchTermToProblemDescription(e.getKey(), e.getValue())) - .forEach(response.problems()::add); - } - - private String searchTermToProblemDescription(String term, List suggestions) { - String suggestionsStr = suggestions.stream().map(s -> STR."\"\{s}\"").collect(Collectors.joining(", ")); - - return STR."\"\{term}\" could be spelled \{suggestionsStr}"; - } - - - } diff --git a/code/services-application/search-service/java/nu/marginalia/search/SearchQueryParamFactory.java b/code/services-application/search-service/java/nu/marginalia/search/SearchQueryParamFactory.java index 410a4c07..9fd94e63 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/SearchQueryParamFactory.java +++ b/code/services-application/search-service/java/nu/marginalia/search/SearchQueryParamFactory.java @@ -21,7 +21,6 @@ public class SearchQueryParamFactory { userParams.js().addTacitTerms(prototype); userParams.adtech().addTacitTerms(prototype); - return new QueryParams( userParams.query(), null, @@ -81,7 +80,7 @@ public class SearchQueryParamFactory { } public QueryParams forLinkSearch(String sourceDomain, String destDomain) { - return new QueryParams(STR."site:\{sourceDomain} links:\{destDomain}", + return new QueryParams("site:" + sourceDomain + " links:" + destDomain, null, List.of(), List.of(), diff --git a/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java b/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java index faba9eb7..be1f4c2a 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java +++ b/code/services-application/search-service/java/nu/marginalia/search/model/ClusteredUrlDetails.java @@ -38,9 +38,6 @@ public class ClusteredUrlDetails implements Comparable { for (var keywordScore : urlDetail.resultItem.keywordScores) { if (keywordScore.isKeywordSpecial()) continue; - if (keywordScore.positions() == 0) - continue; - if (keywordScore.hasTermFlag(WordFlags.Title)) return false; if (keywordScore.hasTermFlag(WordFlags.ExternalLink)) diff --git a/code/services-application/search-service/java/nu/marginalia/search/model/UrlDetails.java b/code/services-application/search-service/java/nu/marginalia/search/model/UrlDetails.java index 41e152e6..e38d5692 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/model/UrlDetails.java +++ b/code/services-application/search-service/java/nu/marginalia/search/model/UrlDetails.java @@ -9,7 +9,6 @@ import nu.marginalia.model.crawl.HtmlFeature; import java.util.ArrayList; import java.util.List; -import java.util.StringJoiner; /** A class to hold details about a single search result. 
*/ @AllArgsConstructor @NoArgsConstructor @With @Getter @ToString diff --git a/code/services-application/search-service/java/nu/marginalia/search/results/UrlDeduplicator.java b/code/services-application/search-service/java/nu/marginalia/search/results/UrlDeduplicator.java index ccddb8d9..046b779e 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/results/UrlDeduplicator.java +++ b/code/services-application/search-service/java/nu/marginalia/search/results/UrlDeduplicator.java @@ -24,7 +24,7 @@ public class UrlDeduplicator { this.resultsPerKey = resultsPerKey; } - public synchronized boolean shouldRemove(DecoratedSearchResultItem details) { + public boolean shouldRemove(DecoratedSearchResultItem details) { if (!deduplicateOnSuperficialHash(details)) return true; if (!deduplicateOnLSH(details)) @@ -35,6 +35,10 @@ public class UrlDeduplicator { return false; } + public boolean shouldRetain(DecoratedSearchResultItem details) { + return !shouldRemove(details); + } + private boolean deduplicateOnSuperficialHash(DecoratedSearchResultItem details) { return seenSuperficialhashes.add(Objects.hash(details.url.path, details.title)); } diff --git a/code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java b/code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java index 6c829d67..e69de29b 100644 --- a/code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java +++ b/code/services-application/search-service/java/nu/marginalia/search/svc/SearchQueryIndexService.java @@ -1,131 +0,0 @@ -package nu.marginalia.search.svc; - -import com.google.inject.Inject; -import com.google.inject.Singleton; -import lombok.SneakyThrows; -import nu.marginalia.api.searchquery.model.query.QueryResponse; -import nu.marginalia.api.searchquery.model.query.SearchSpecification; -import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem; -import nu.marginalia.bbpc.BrailleBlockPunchCards; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; -import nu.marginalia.model.crawl.DomainIndexingState; -import nu.marginalia.search.model.UrlDetails; -import nu.marginalia.search.results.UrlDeduplicator; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.slf4j.Marker; -import org.slf4j.MarkerFactory; - -import java.util.ArrayList; -import java.util.Comparator; -import java.util.List; - -@Singleton -public class SearchQueryIndexService { - private final SearchQueryCountService searchVisitorCount; - private final Marker queryMarker = MarkerFactory.getMarker("QUERY"); - private final Logger logger = LoggerFactory.getLogger(getClass()); - - @Inject - public SearchQueryIndexService(SearchQueryCountService searchVisitorCount) { - this.searchVisitorCount = searchVisitorCount; - } - - public List getResultsFromQuery(QueryResponse queryResponse) { - // Remove duplicates and other chaff - final var results = limitAndDeduplicateResults(queryResponse.specs(), queryResponse.results()); - - // Update the query count (this is what you see on the front page) - searchVisitorCount.registerQuery(); - - // Decorate and sort the results - List urlDetails = getAllUrlDetails(results); - - urlDetails.sort(Comparator.naturalOrder()); - - return urlDetails; - } - - private List limitAndDeduplicateResults(SearchSpecification specs, List decoratedResults) { - var limits = specs.queryLimits; - - UrlDeduplicator deduplicator = new 
UrlDeduplicator(limits.resultsByDomain()); - List retList = new ArrayList<>(limits.resultsTotal()); - - int dedupCount = 0; - for (var item : decoratedResults) { - if (retList.size() >= limits.resultsTotal()) - break; - - if (!deduplicator.shouldRemove(item)) { - retList.add(item); - } - else { - dedupCount ++; - } - } - - if (dedupCount > 0) { - logger.info(queryMarker, "Deduplicator ate {} results", dedupCount); - } - - return retList; - } - - - @SneakyThrows - public List getAllUrlDetails(List resultSet) { - List ret = new ArrayList<>(resultSet.size()); - - for (var detail : resultSet) { - ret.add(new UrlDetails( - detail.documentId(), - detail.domainId(), - cleanUrl(detail.url), - detail.title, - detail.description, - detail.format, - detail.features, - DomainIndexingState.ACTIVE, - detail.rankingScore, // termScore - detail.resultsFromDomain(), - getPositionsString(detail), - Long.bitCount(detail.bestPositions), - detail.rawIndexResult, - detail.rawIndexResult.keywordScores - )); - } - - return ret; - } - - private EdgeUrl cleanUrl(EdgeUrl url) { - String topdomain = url.domain.topDomain; - String subdomain = url.domain.subDomain; - String path = url.path; - - if (topdomain.equals("fandom.com")) { - int wikiIndex = path.indexOf("/wiki/"); - if (wikiIndex >= 0) { - return new EdgeUrl("https", new EdgeDomain("breezewiki.com"), null, "/" + subdomain + path.substring(wikiIndex), null); - } - } - else if (topdomain.equals("medium.com")) { - if (!subdomain.isBlank()) { - return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, path, null); - } - else { - String article = path.substring(path.indexOf("/", 1)); - return new EdgeUrl("https", new EdgeDomain("scribe.rip"), null, article, null); - } - - } - return url; - } - - private String getPositionsString(DecoratedSearchResultItem resultItem) { - return BrailleBlockPunchCards.printBits(resultItem.bestPositions, 56); - - } -} diff --git a/code/services-application/search-service/test/nu/marginalia/search/paperdoll/SearchServicePaperDoll.java b/code/services-application/search-service/test/nu/marginalia/search/paperdoll/SearchServicePaperDoll.java index 2a2cc003..8ccc5826 100644 --- a/code/services-application/search-service/test/nu/marginalia/search/paperdoll/SearchServicePaperDoll.java +++ b/code/services-application/search-service/test/nu/marginalia/search/paperdoll/SearchServicePaperDoll.java @@ -91,7 +91,7 @@ public class SearchServicePaperDoll extends AbstractModule { long positions) { results.add(new DecoratedSearchResultItem( - new SearchResultItem(url.hashCode(), 2, 3, false), + new SearchResultItem(url.hashCode(), 2, 3, score, 0), new EdgeUrl(url), title, description, @@ -103,6 +103,7 @@ public class SearchServicePaperDoll extends AbstractModule { 400, positions, score, + 4, null) ); } diff --git a/code/services-core/assistant-service/build.gradle b/code/services-core/assistant-service/build.gradle index a892d0aa..1dd2cfd6 100644 --- a/code/services-core/assistant-service/build.gradle +++ b/code/services-core/assistant-service/build.gradle @@ -3,7 +3,7 @@ plugins { id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.2' + id 'com.google.cloud.tools.jib' version '3.4.3' } application { diff --git a/code/services-core/control-service/build.gradle b/code/services-core/control-service/build.gradle index 3edbf641..20db2bed 100644 --- a/code/services-core/control-service/build.gradle +++ b/code/services-core/control-service/build.gradle @@ -2,7 +2,7 @@ plugins { id 'java' id 'application' id 
'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.2' + id 'com.google.cloud.tools.jib' version '3.4.3' } java { @@ -35,12 +35,12 @@ dependencies { implementation project(':code:functions:search-query:api') implementation project(':code:execution:api') implementation project(':code:index:api') - implementation project(':code:process-mqapi') + implementation project(':code:processes:process-mq-api') implementation project(':code:features-search:screenshots') implementation project(':code:index:index-journal') implementation project(':code:index:query') - implementation project(':code:process-models:crawl-spec') + implementation project(':code:processes:crawling-process:model') implementation libs.bundles.slf4j @@ -69,6 +69,7 @@ dependencies { testImplementation libs.bundles.junit testImplementation libs.mockito testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4') + testImplementation libs.commons.codec testImplementation 'org.testcontainers:mariadb:1.17.4' testImplementation 'org.testcontainers:junit-jupiter:1.17.4' testImplementation project(':code:libraries:test-helpers') diff --git a/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeService.java b/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeService.java index 5427abf9..32267155 100644 --- a/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeService.java +++ b/code/services-core/control-service/java/nu/marginalia/control/node/svc/ControlNodeService.java @@ -1,10 +1,8 @@ package nu.marginalia.control.node.svc; -import com.google.common.base.Strings; import com.google.inject.Inject; import com.zaxxer.hikari.HikariDataSource; import lombok.SneakyThrows; -import nu.marginalia.service.ServiceMonitors; import nu.marginalia.control.ControlRendererFactory; import nu.marginalia.control.RedirectControl; import nu.marginalia.control.Redirects; @@ -12,11 +10,12 @@ import nu.marginalia.control.node.model.*; import nu.marginalia.control.sys.model.EventLogEntry; import nu.marginalia.control.sys.svc.EventLogService; import nu.marginalia.control.sys.svc.HeartbeatService; +import nu.marginalia.executor.client.ExecutorClient; import nu.marginalia.nodecfg.NodeConfigurationService; import nu.marginalia.nodecfg.model.NodeConfiguration; -import nu.marginalia.storage.FileStorageService; -import nu.marginalia.executor.client.ExecutorClient; import nu.marginalia.service.ServiceId; +import nu.marginalia.service.ServiceMonitors; +import nu.marginalia.storage.FileStorageService; import nu.marginalia.storage.model.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -24,16 +23,10 @@ import spark.Request; import spark.Response; import spark.Spark; -import javax.annotation.Nullable; -import java.io.BufferedReader; import java.io.IOException; -import java.io.InputStreamReader; import java.nio.file.Path; -import java.sql.DriverManager; -import java.sql.ResultSet; import java.sql.SQLException; import java.util.*; -import java.util.stream.Stream; public class ControlNodeService { private final FileStorageService fileStorageService; @@ -403,7 +396,7 @@ public class ControlNodeService { // Sort by timestamp, then by relPath // this ensures that the newest file is listed last items.sort(Comparator - .comparing(FileStorageWithActions::getTimestamp) + .comparing(FileStorageWithActions::getTimestampFull) .thenComparing(FileStorageWithActions::getRelPath) ); diff --git a/code/services-core/executor-service/build.gradle 
b/code/services-core/executor-service/build.gradle index 7693083b..2e7934bc 100644 --- a/code/services-core/executor-service/build.gradle +++ b/code/services-core/executor-service/build.gradle @@ -3,7 +3,7 @@ plugins { id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.2' + id 'com.google.cloud.tools.jib' version '3.4.3' } application { @@ -45,15 +45,13 @@ dependencies { implementation project(':code:functions:link-graph:api') - implementation project(':code:process-models:crawl-spec') - implementation project(':code:process-models:crawling-model') - implementation project(':code:features-crawl:link-parser') - implementation project(':code:features-convert:data-extractors') - implementation project(':code:features-convert:stackexchange-xml') - implementation project(':code:features-convert:reddit-json') + implementation project(':code:processes:crawling-process:model') + implementation project(':code:processes:crawling-process:model') + implementation project(':code:processes:crawling-process:ft-link-parser') + implementation project(':code:execution:data-extractors') implementation project(':code:index:index-journal') implementation project(':code:index:api') - implementation project(':code:process-mqapi') + implementation project(':code:processes:process-mq-api') implementation project(':code:execution') implementation project(':code:execution:api') @@ -94,6 +92,7 @@ dependencies { testImplementation libs.mockito testImplementation platform('org.testcontainers:testcontainers-bom:1.17.4') + testImplementation libs.commons.codec testImplementation 'org.testcontainers:mariadb:1.17.4' testImplementation 'org.testcontainers:junit-jupiter:1.17.4' testImplementation project(':code:libraries:test-helpers') diff --git a/code/services-core/index-service/build.gradle b/code/services-core/index-service/build.gradle index 7c7b1e0a..df3773f9 100644 --- a/code/services-core/index-service/build.gradle +++ b/code/services-core/index-service/build.gradle @@ -3,7 +3,7 @@ plugins { id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.2' + id 'com.google.cloud.tools.jib' version '3.4.3' } application { diff --git a/code/services-core/query-service/build.gradle b/code/services-core/query-service/build.gradle index 70a3738e..70c71826 100644 --- a/code/services-core/query-service/build.gradle +++ b/code/services-core/query-service/build.gradle @@ -3,7 +3,7 @@ plugins { id 'application' id 'jvm-test-suite' - id 'com.google.cloud.tools.jib' version '3.4.2' + id 'com.google.cloud.tools.jib' version '3.4.3' } application { diff --git a/code/services-core/query-service/java/nu/marginalia/query/QueryBasicInterface.java b/code/services-core/query-service/java/nu/marginalia/query/QueryBasicInterface.java index 152f6a78..937b80d7 100644 --- a/code/services-core/query-service/java/nu/marginalia/query/QueryBasicInterface.java +++ b/code/services-core/query-service/java/nu/marginalia/query/QueryBasicInterface.java @@ -3,12 +3,12 @@ package nu.marginalia.query; import com.google.common.base.Strings; import com.google.gson.Gson; import com.google.inject.Inject; +import nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.api.searchquery.model.results.Bm25Parameters; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.functions.searchquery.QueryGRPCService; import nu.marginalia.index.query.limit.QueryLimits; import nu.marginalia.model.gson.GsonFactory; -import 
nu.marginalia.api.searchquery.model.query.QueryParams; import nu.marginalia.renderer.MustacheRenderer; import nu.marginalia.renderer.RendererFactory; import spark.Request; @@ -48,10 +48,9 @@ public class QueryBasicInterface { domainCount, count, 250, 8192 ), set); - var detailedDirectResult = queryGRPCService.executeDirect(queryParams, - params, - ResultRankingParameters.sensibleDefaults(), - count); + var detailedDirectResult = queryGRPCService.executeDirect( + queryParams, params, ResultRankingParameters.sensibleDefaults() + ); var results = detailedDirectResult.result(); @@ -83,12 +82,11 @@ public class QueryBasicInterface { domainCount, count, 250, 8192 ), set); - var rankingParams = rankingParamsFromRequest(request); + var rankingParams = debugRankingParamsFromRequest(request); - var detailedDirectResult = queryGRPCService.executeDirect(queryString, - queryParams, - rankingParams, - count); + var detailedDirectResult = queryGRPCService.executeDirect( + queryString, queryParams, rankingParams + ); var results = detailedDirectResult.result(); @@ -100,7 +98,7 @@ public class QueryBasicInterface { ); } - private ResultRankingParameters rankingParamsFromRequest(Request request) { + private ResultRankingParameters debugRankingParamsFromRequest(Request request) { var sensibleDefaults = ResultRankingParameters.sensibleDefaults(); return ResultRankingParameters.builder() @@ -108,23 +106,18 @@ public class QueryBasicInterface { .qualityPenalty(doubleFromRequest(request, "qualityPenalty", sensibleDefaults.qualityPenalty)) .shortDocumentThreshold(intFromRequest(request, "shortDocumentThreshold", sensibleDefaults.shortDocumentThreshold)) .shortDocumentPenalty(doubleFromRequest(request, "shortDocumentPenalty", sensibleDefaults.shortDocumentPenalty)) - .tcfJaccardWeight(doubleFromRequest(request, "tcfJaccardWeight", sensibleDefaults.tcfJaccardWeight)) - .tcfOverlapWeight(doubleFromRequest(request, "tcfOverlapWeight", sensibleDefaults.tcfOverlapWeight)) - .fullParams(new Bm25Parameters( - doubleFromRequest(request, "fullParams.k1", sensibleDefaults.fullParams.k()), - doubleFromRequest(request, "fullParams.b", sensibleDefaults.fullParams.b()) - )) - .prioParams(new Bm25Parameters( - doubleFromRequest(request, "prioParams.k1", sensibleDefaults.prioParams.k()), - doubleFromRequest(request, "prioParams.b", sensibleDefaults.prioParams.b()) + .tcfFirstPosition(doubleFromRequest(request, "tcfFirstPosition", sensibleDefaults.tcfFirstPosition)) + .tcfVerbatim(doubleFromRequest(request, "tcfVerbatim", sensibleDefaults.tcfVerbatim)) + .tcfProximity(doubleFromRequest(request, "tcfProximity", sensibleDefaults.tcfProximity)) + .bm25Params(new Bm25Parameters( + doubleFromRequest(request, "bm25.k1", sensibleDefaults.bm25Params.k()), + doubleFromRequest(request, "bm25.b", sensibleDefaults.bm25Params.b()) )) .temporalBias(ResultRankingParameters.TemporalBias.valueOf(stringFromRequest(request, "temporalBias", sensibleDefaults.temporalBias.toString()))) .temporalBiasWeight(doubleFromRequest(request, "temporalBiasWeight", sensibleDefaults.temporalBiasWeight)) .shortSentenceThreshold(intFromRequest(request, "shortSentenceThreshold", sensibleDefaults.shortSentenceThreshold)) .shortSentencePenalty(doubleFromRequest(request, "shortSentencePenalty", sensibleDefaults.shortSentencePenalty)) - .bm25FullWeight(doubleFromRequest(request, "bm25FullWeight", sensibleDefaults.bm25FullWeight)) - .bm25NgramWeight(doubleFromRequest(request, "bm25NgramWeight", sensibleDefaults.bm25NgramWeight)) - .bm25PrioWeight(doubleFromRequest(request, 
"bm25PrioWeight", sensibleDefaults.bm25PrioWeight)) + .bm25Weight(doubleFromRequest(request, "bm25Weight", sensibleDefaults.bm25Weight)) .exportDebugData(true) .build(); } diff --git a/code/services-core/query-service/resources/templates/qdebug.hdb b/code/services-core/query-service/resources/templates/qdebug.hdb index 4081317f..5e71d13b 100644 --- a/code/services-core/query-service/resources/templates/qdebug.hdb +++ b/code/services-core/query-service/resources/templates/qdebug.hdb @@ -31,22 +31,20 @@
-
-
-
-
+
+
-
-
-
-
+
+
+
+
-
-
-
-
+
+
+
+
@@ -67,12 +65,8 @@
-
-
-
-
-
-
+
+
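For orientation, a minimal sketch of the parameter surface the rewritten form posts back to QueryBasicInterface. The class is hypothetical and simply echoes the defaults; the imports, builder calls and field accesses are the ones visible in the QueryBasicInterface hunk above.

    import nu.marginalia.api.searchquery.model.results.Bm25Parameters;
    import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;

    // Hypothetical helper, not part of the patch.
    class DebugFormParamsSketch {
        static ResultRankingParameters defaultFormParams() {
            var defaults = ResultRankingParameters.sensibleDefaults();
            return ResultRankingParameters.builder()
                    // three explicit term-coherence factors replace the old
                    // tcfJaccardWeight/tcfOverlapWeight pair
                    .tcfFirstPosition(defaults.tcfFirstPosition)
                    .tcfVerbatim(defaults.tcfVerbatim)
                    .tcfProximity(defaults.tcfProximity)
                    // a single BM25 parameter set replaces fullParams/prioParams
                    .bm25Params(new Bm25Parameters(defaults.bm25Params.k(), defaults.bm25Params.b()))
                    // a single weight replaces bm25FullWeight/bm25NgramWeight/bm25PrioWeight
                    .bm25Weight(defaults.bm25Weight)
                    .exportDebugData(true)
                    .build();
        }
    }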
{{/with}} @@ -89,9 +83,9 @@ Search Terms Exclude{{#each specs.query.searchTermsExclude}} {{.}} {{/each}} Search Terms Advice{{#each specs.query.searchTermsAdvice}} {{.}} {{/each}} Search Terms Priority{{#each specs.query.searchTermsPriority}} {{.}} {{/each}} -{{#each specs.query.searchTermCoherences}} +{{#each specs.query.phraseConstraints}} - Coherence Requirement + Phrase Constraints {{#each .}} {{.}} @@ -112,29 +106,26 @@

{{description}}

dataHash: {{dataHash}} wordsTotal: {{wordsTotal}} bestPositions: {{bestPositions}} rankingScore: {{rankingScore}} urlQuality: {{urlQuality}}
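The hunk body below swaps the flat rankingDetails.inputs/outputs listing for grouped factor lists. As a sketch of the shapes the new template logic appears to assume (the record names and field types are hypothetical, inferred only from the fields the template dereferences):

    import java.util.List;

    // Hypothetical shapes, inferred from the template below; the real classes
    // live in the ranking-details model and may differ in names and types.
    record RankingFactor(String factor, String value) {}
    record DocFactorGroup(String name, List<RankingFactor> factors) {}
    record TermFactorGroup(long termId, String term, List<DocFactorGroup> factorList) {}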
-{{#with rankingDetails.inputs}}
-  Rank: {{rank}}
-  ASL: {{asl}}
-  Quality: {{quality}}
-  Size: {{size}}
-  Topology: {{topology}}
-  Year: {{year}}
-  Flags: {{#each flags}} {{.}} {{/each}}
+{{#with rankingDetails.docFactorGroups}}
+  {{#each .}}
+    {{name}}
+    {{#each factors}}
+      {{factor}}: {{value}}
+    {{/each}}
+  {{/each}}
 {{/with}}
-{{#with rankingDetails.outputs}}
-  Average Sentence Length Penalty: {{averageSentenceLengthPenalty}}
-  Quality Penalty: {{qualityPenalty}}
-  Ranking Bonus: {{rankingBonus}}
-  Topology Bonus: {{topologyBonus}}
-  Document Length Penalty: {{documentLengthPenalty}}
-  Temporal Bias: {{temporalBias}}
-  Flags Penalty: {{flagsPenalty}}
-  Overall Part: {{overallPart}}
-  TCF Overlap: {{tcfOverlap}}
-  TCF Jaccard: {{tcfJaccard}}
-  BM25 Full: {{bM25F}}
-  BM25 Ngram: {{bM25N}}
-  BM25 Prio: {{bM25P}}
+{{#with rankingDetails.termFactorGroups}}
+  {{#each .}}
+    {{termId}}:{{term}}
+    {{#each factorList}}
+      {{name}}
+      {{#each factors}}
+        {{factor}}: {{value}}
+ {{/each}} + + {{/each}} + {{/each}} {{/with}} diff --git a/code/tools/experiment-runner/build.gradle b/code/tools/experiment-runner/build.gradle index 17be5cb4..d011a973 100644 --- a/code/tools/experiment-runner/build.gradle +++ b/code/tools/experiment-runner/build.gradle @@ -32,14 +32,12 @@ dependencies { implementation project(':code:libraries:language-processing') implementation project(':code:libraries:term-frequency-dict') implementation project(':code:processes:converting-process') - implementation project(':code:process-models:crawling-model') + implementation project(':code:processes:crawling-process:model') implementation project(':third-party:commons-codec') - implementation project(':code:features-crawl:link-parser') - implementation project(':code:features-convert:adblock') - implementation project(':code:features-convert:anchor-keywords') - implementation project(':code:features-convert:topic-detection') - implementation project(':code:features-convert:keyword-extraction') + implementation project(':code:processes:crawling-process:ft-link-parser') + implementation project(':code:processes:converting-process:ft-anchor-keywords') + implementation project(':code:processes:converting-process:ft-keyword-extraction') implementation libs.bundles.slf4j implementation libs.notnull diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/Experiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/Experiment.java index b5f9ff40..1797c1d6 100644 --- a/code/tools/experiment-runner/java/nu/marginalia/tools/Experiment.java +++ b/code/tools/experiment-runner/java/nu/marginalia/tools/Experiment.java @@ -1,6 +1,6 @@ package nu.marginalia.tools; -import nu.marginalia.crawling.io.SerializableCrawlDataStream; +import nu.marginalia.io.crawldata.SerializableCrawlDataStream; import java.io.IOException; import java.util.HashSet; diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/ExperimentRunnerMain.java b/code/tools/experiment-runner/java/nu/marginalia/tools/ExperimentRunnerMain.java index 668a25a9..08d2a662 100644 --- a/code/tools/experiment-runner/java/nu/marginalia/tools/ExperimentRunnerMain.java +++ b/code/tools/experiment-runner/java/nu/marginalia/tools/ExperimentRunnerMain.java @@ -3,14 +3,15 @@ package nu.marginalia.tools; import com.google.inject.Guice; import com.google.inject.Injector; import nu.marginalia.converting.ConverterModule; -import nu.marginalia.crawling.io.CrawledDomainReader; +import nu.marginalia.io.crawldata.CrawledDomainReader; import nu.marginalia.process.log.WorkLog; import nu.marginalia.service.module.DatabaseModule; import nu.marginalia.tools.experiments.*; import java.io.IOException; import java.nio.file.Path; -import java.util.*; +import java.util.Arrays; +import java.util.Map; public class ExperimentRunnerMain { @@ -18,7 +19,6 @@ public class ExperimentRunnerMain { "test", TestExperiment.class, "adblock", AdblockExperiment.class, "topic", TopicExperiment.class, - "atags", AtagsExperiment.class, "sentence-statistics", SentenceStatisticsExperiment.class, "site-statistics", SiteStatisticsExperiment.class, "export-atags", ExportExternalLinksExperiment.class, @@ -27,7 +27,7 @@ public class ExperimentRunnerMain { public static void main(String... 
args) throws IOException { if (args.length < 2) { - System.err.println("Expected arguments: plan.yaml experiment-name [experiment-args]"); + System.err.println("Expected arguments: crawl-data-path experiment-name [experiment-args]"); return; } diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/LegacyExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/LegacyExperiment.java index 5d7d8d11..effb216f 100644 --- a/code/tools/experiment-runner/java/nu/marginalia/tools/LegacyExperiment.java +++ b/code/tools/experiment-runner/java/nu/marginalia/tools/LegacyExperiment.java @@ -1,8 +1,8 @@ package nu.marginalia.tools; -import nu.marginalia.crawling.io.SerializableCrawlDataStream; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.io.crawldata.SerializableCrawlDataStream; +import nu.marginalia.model.crawldata.CrawledDocument; +import nu.marginalia.model.crawldata.CrawledDomain; import java.io.IOException; import java.util.ArrayList; diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/AdblockExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/AdblockExperiment.java index 70856439..dc46f3bd 100644 --- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/AdblockExperiment.java +++ b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/AdblockExperiment.java @@ -1,10 +1,10 @@ package nu.marginalia.tools.experiments; import com.google.inject.Inject; -import nu.marginalia.adblock.AdblockSimulator; import nu.marginalia.converting.processor.DocumentProcessor; -import nu.marginalia.crawling.model.CrawledDocument; -import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.converting.processor.classifier.adblock.AdblockSimulator; +import nu.marginalia.model.crawldata.CrawledDocument; +import nu.marginalia.model.crawldata.CrawledDomain; import nu.marginalia.tools.LegacyExperiment; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/AtagsExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/AtagsExperiment.java deleted file mode 100644 index d08ec90f..00000000 --- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/AtagsExperiment.java +++ /dev/null @@ -1,52 +0,0 @@ -package nu.marginalia.tools.experiments; - -import com.google.inject.Inject; -import com.zaxxer.hikari.HikariDataSource; -import lombok.SneakyThrows; -import nu.marginalia.ProcessConfiguration; -import nu.marginalia.atags.AnchorTextKeywords; -import nu.marginalia.atags.source.AnchorTagsSource; -import nu.marginalia.atags.source.AnchorTagsSourceFactory; -import nu.marginalia.crawling.model.CrawledDomain; -import nu.marginalia.model.EdgeDomain; -import nu.marginalia.model.EdgeUrl; -import nu.marginalia.tools.LegacyExperiment; - -import java.sql.SQLException; - -public class AtagsExperiment extends LegacyExperiment { - - - private final AnchorTextKeywords keywords; - private final AnchorTagsSource source; - - @Inject - public AtagsExperiment(AnchorTextKeywords keywords, HikariDataSource dataSource) throws SQLException { - this.keywords = keywords; - this.source = new AnchorTagsSourceFactory(dataSource, new ProcessConfiguration(null, 1, null)) - .create(); - - } - - @Override - @SneakyThrows - public boolean process(CrawledDomain domain) { - var atags = source.getAnchorTags(new EdgeDomain(domain.domain)); - for (var 
doc : domain.doc) { - if (doc.documentBody == null) - continue; - - var newKeywords = keywords.getAnchorTextKeywords(atags, new EdgeUrl(doc.url)); - if (!newKeywords.isEmpty()) { - System.out.println(newKeywords + " " + doc.url); - } - } - return true; - } - - @Override - @SneakyThrows - public void onFinish() { - source.close(); - } -} diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java index 8290a658..4a34a31c 100644 --- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java +++ b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/DebugConverterExperiment.java @@ -3,7 +3,7 @@ package nu.marginalia.tools.experiments; import com.google.inject.Inject; import nu.marginalia.converting.processor.DomainProcessor; import nu.marginalia.converting.processor.plugin.specialization.BlogSpecialization; -import nu.marginalia.crawling.model.CrawledDomain; +import nu.marginalia.model.crawldata.CrawledDomain; import nu.marginalia.tools.LegacyExperiment; import org.jsoup.Jsoup; diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/ExportExternalLinksExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/ExportExternalLinksExperiment.java index f602a837..1d49536f 100644 --- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/ExportExternalLinksExperiment.java +++ b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/ExportExternalLinksExperiment.java @@ -3,11 +3,11 @@ package nu.marginalia.tools.experiments; import com.google.inject.Inject; import gnu.trove.set.hash.TLongHashSet; import lombok.SneakyThrows; -import nu.marginalia.crawling.io.SerializableCrawlDataStream; -import nu.marginalia.crawling.model.CrawledDocument; import nu.marginalia.hash.MurmurHash3_128; +import nu.marginalia.io.crawldata.SerializableCrawlDataStream; import nu.marginalia.link_parser.LinkParser; import nu.marginalia.model.EdgeUrl; +import nu.marginalia.model.crawldata.CrawledDocument; import nu.marginalia.tools.Experiment; import org.jsoup.Jsoup; diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java index dde7a106..030024bd 100644 --- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java +++ b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SentenceStatisticsExperiment.java @@ -4,11 +4,11 @@ import com.google.inject.Inject; import lombok.SneakyThrows; import nu.marginalia.WmsaHome; import nu.marginalia.converting.processor.logic.dom.DomPruningFilter; -import nu.marginalia.crawling.model.CrawledDomain; import nu.marginalia.keyword.DocumentKeywordExtractor; +import nu.marginalia.keyword.LinkTexts; import nu.marginalia.language.sentence.SentenceExtractor; import nu.marginalia.model.EdgeUrl; -import nu.marginalia.segmentation.NgramLexicon; +import nu.marginalia.model.crawldata.CrawledDomain; import nu.marginalia.term_frequency_dict.TermFrequencyDict; import nu.marginalia.tools.LegacyExperiment; import org.jsoup.Jsoup; @@ -17,15 +17,14 @@ import java.io.BufferedOutputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.PrintWriter; +import java.nio.ByteBuffer; import java.nio.file.Files; 
 import java.nio.file.Path;
 
 public class SentenceStatisticsExperiment extends LegacyExperiment {
-    NgramLexicon lexicon = new NgramLexicon(WmsaHome.getLanguageModels());
     SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
-    DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(
-            new TermFrequencyDict(WmsaHome.getLanguageModels()), lexicon);
+    DocumentKeywordExtractor documentKeywordExtractor = new DocumentKeywordExtractor(new TermFrequencyDict(WmsaHome.getLanguageModels()));
 
     Path filename;
     PrintWriter writer;
@@ -47,6 +46,7 @@ public class SentenceStatisticsExperiment extends LegacyExperiment {
 
         logLine("Processing: " + domain.domain);
 
+        ByteBuffer workArea = ByteBuffer.allocate(8192);
         for (var doc : domain.doc) {
             if (doc.documentBody == null)
                 continue;
@@ -55,9 +55,9 @@ public class SentenceStatisticsExperiment extends LegacyExperiment {
             parsed.body().filter(new DomPruningFilter(0.5));
 
             var dld = se.extractSentences(parsed);
-            var keywords = documentKeywordExtractor.extractKeywords(dld, new EdgeUrl(doc.url));
+            var keywords = documentKeywordExtractor.extractKeywords(dld, new LinkTexts(), new EdgeUrl(doc.url));
 
-            keywords.build();
+            keywords.build(workArea);
         }
 
         return true;
diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java
index 0afb290f..d69b1bda 100644
--- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java
+++ b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/SiteStatisticsExperiment.java
@@ -3,7 +3,7 @@ package nu.marginalia.tools.experiments;
 import com.google.inject.Inject;
 import nu.marginalia.converting.model.ProcessedDocument;
 import nu.marginalia.converting.processor.DomainProcessor;
-import nu.marginalia.crawling.io.SerializableCrawlDataStream;
+import nu.marginalia.io.crawldata.SerializableCrawlDataStream;
 import nu.marginalia.model.EdgeUrl;
 import nu.marginalia.tools.Experiment;
 
diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TestExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TestExperiment.java
index 521b36e8..436b227d 100644
--- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TestExperiment.java
+++ b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TestExperiment.java
@@ -1,6 +1,6 @@
 package nu.marginalia.tools.experiments;
 
-import nu.marginalia.crawling.model.CrawledDomain;
+import nu.marginalia.model.crawldata.CrawledDomain;
 import nu.marginalia.tools.LegacyExperiment;
 
 public class TestExperiment extends LegacyExperiment {
diff --git a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TopicExperiment.java b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TopicExperiment.java
index 0d99356a..00ed63ac 100644
--- a/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TopicExperiment.java
+++ b/code/tools/experiment-runner/java/nu/marginalia/tools/experiments/TopicExperiment.java
@@ -1,25 +1,30 @@
 package nu.marginalia.tools.experiments;
 
 import com.google.inject.Inject;
+import lombok.SneakyThrows;
 import nu.marginalia.WmsaHome;
-import nu.marginalia.adblock.GoogleAnwersSpamDetector;
+import nu.marginalia.converting.processor.classifier.topic.AdHocDetector;
 import nu.marginalia.converting.processor.logic.dom.DomPruningFilter;
-import nu.marginalia.crawling.model.CrawledDomain;
 import nu.marginalia.language.sentence.SentenceExtractor;
+import nu.marginalia.model.crawldata.CrawledDomain;
 import nu.marginalia.tools.LegacyExperiment;
-import nu.marginalia.topic.RecipeDetector;
-import nu.marginalia.topic.TextileCraftDetector;
-import nu.marginalia.topic.WoodworkingDetector;
 import org.jsoup.Jsoup;
 
+import java.nio.file.Files;
+import java.nio.file.Path;
+
 public class TopicExperiment extends LegacyExperiment {
 
-    RecipeDetector recipeDetector = new RecipeDetector();
-    WoodworkingDetector woodworkingDetector = new WoodworkingDetector();
-    TextileCraftDetector textileCraftDetector = new TextileCraftDetector();
-    GoogleAnwersSpamDetector spamDetector = new GoogleAnwersSpamDetector();
+    AdHocDetector detector;
 
     SentenceExtractor se = new SentenceExtractor(WmsaHome.getLanguageModels());
+    Path filename = null;
+
+    @SneakyThrows
+    public void args(String... args) {
+        filename = Path.of(args[0]);
+        detector = new AdHocDetector(Files.readAllLines(filename));
+    }
 
     @Inject
     public TopicExperiment() {
@@ -38,20 +43,11 @@ public class TopicExperiment extends LegacyExperiment {
         parsed.body().filter(new DomPruningFilter(0.5));
 
         var dld = se.extractSentences(parsed);
 
-        if (dld.totalNumWords() < 250)
+        if (dld.totalNumWords() < 50)
             continue;
 
-        if (textileCraftDetector.testP(dld) > 0.3) {
-            System.out.println("textilecraft\t" + doc.url);
-        }
-        if (woodworkingDetector.testP(dld) > 0.1) {
-            System.out.println("woodworking\t" + doc.url);
-        }
-        if (recipeDetector.testP(dld) > 0.5) {
-            System.out.println("recipe\t" + doc.url);
-        }
-        if (spamDetector.testP(parsed) > 0.5) {
-            System.out.println("GA spam\t" + doc.url);
+        if (detector.testP(dld) > 0.5) {
+            System.out.println("match\t" + doc.url);
         }
     }
 
diff --git a/code/tools/integration-test/build.gradle b/code/tools/integration-test/build.gradle
new file mode 100644
index 00000000..81e3cde9
--- /dev/null
+++ b/code/tools/integration-test/build.gradle
@@ -0,0 +1,52 @@
+plugins {
+    id 'java'
+
+
+    id 'jvm-test-suite'
+}
+
+java {
+    toolchain {
+        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
+    }
+}
+
+apply from: "$rootProject.projectDir/srcsets.gradle"
+
+dependencies {
+    implementation project(':code:processes:crawling-process')
+    implementation project(':code:processes:converting-process')
+    implementation project(':code:processes:loading-process')
+    implementation project(':code:processes:crawling-process:model')
+    implementation project(':code:processes:converting-process:model')
+    implementation project(':code:processes:index-constructor-process')
+    implementation project(':code:index')
+    implementation project(':code:libraries:array')
+    implementation project(':code:libraries:btree')
+    implementation project(':code:functions:search-query:api')
+    implementation project(':code:index:index-reverse')
+    implementation project(':code:index:index-forward')
+    implementation project(':code:index:query')
+    implementation project(':code:index:index-journal')
+    implementation project(':code:functions:link-graph:partition')
+    implementation project(':code:functions:search-query')
+    implementation project(':code:libraries:array')
+    implementation project(':code:common:db')
+    implementation project(':code:common:config')
+    implementation project(':code:common:linkdb')
+    implementation project(':code:common:process')
+    implementation project(':code:common:service')
+    implementation project(':code:common:model')
+
+    implementation libs.bundles.slf4j
+    implementation libs.bundles.grpc
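+    // Test doubles and utility libraries; Mockito backs the service mocks set up in IntegrationTestModule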
+    implementation libs.mockito
+    implementation libs.notnull
+    implementation libs.guice
+    implementation libs.fastutil
+    implementation libs.trove
+
+    testImplementation libs.bundles.junit
+    testImplementation project(':code:libraries:test-helpers')
+}
+
diff --git a/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java b/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java
new file mode 100644
index 00000000..7fbcdefc
--- /dev/null
+++ b/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java
@@ -0,0 +1,290 @@
+package nu.marginalia;
+
+import com.google.inject.Guice;
+import com.google.inject.Inject;
+import nu.marginalia.api.searchquery.QueryProtobufCodec;
+import nu.marginalia.api.searchquery.RpcQsQuery;
+import nu.marginalia.api.searchquery.RpcQueryLimits;
+import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
+import nu.marginalia.converting.processor.DomainProcessor;
+import nu.marginalia.converting.writer.ConverterBatchWriter;
+import nu.marginalia.crawl.retreival.DomainProber;
+import nu.marginalia.crawl.retreival.fetcher.ContentTags;
+import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
+import nu.marginalia.functions.searchquery.QueryFactory;
+import nu.marginalia.index.IndexGrpcService;
+import nu.marginalia.index.ReverseIndexFullFileNames;
+import nu.marginalia.index.ReverseIndexPrioFileNames;
+import nu.marginalia.index.construction.full.FullIndexConstructor;
+import nu.marginalia.index.construction.prio.PrioIndexConstructor;
+import nu.marginalia.index.domainrankings.DomainRankings;
+import nu.marginalia.index.forward.ForwardIndexFileNames;
+import nu.marginalia.index.forward.construction.ForwardIndexConverter;
+import nu.marginalia.index.index.StatefulIndex;
+import nu.marginalia.index.journal.IndexJournal;
+import nu.marginalia.index.model.SearchParameters;
+import nu.marginalia.index.searchset.SearchSetAny;
+import nu.marginalia.io.crawldata.CrawledDomainReader;
+import nu.marginalia.linkdb.docs.DocumentDbReader;
+import nu.marginalia.linkdb.docs.DocumentDbWriter;
+import nu.marginalia.loading.LoaderIndexJournalWriter;
+import nu.marginalia.loading.LoaderInputData;
+import nu.marginalia.loading.documents.DocumentLoaderService;
+import nu.marginalia.loading.documents.KeywordLoaderService;
+import nu.marginalia.loading.domains.DomainIdRegistry;
+import nu.marginalia.loading.links.DomainLinksLoaderService;
+import nu.marginalia.model.EdgeDomain;
+import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.id.UrlIdCodec;
+import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
+import nu.marginalia.process.control.FakeProcessHeartbeat;
+import nu.marginalia.storage.FileStorageService;
+import nu.marginalia.test.IntegrationTestModule;
+import nu.marginalia.test.TestUtil;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.mockito.Mockito;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+import java.util.List;
+
+import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME;
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.Mockito.when;
+
+public class IntegrationTest {
+    IntegrationTestModule testModule;
+    @Inject
+    DomainProcessor domainProcessor;
+
+    @Inject
+    DomainLinksLoaderService linksService;
+    @Inject
+    KeywordLoaderService keywordLoaderService;
+    @Inject
+    DocumentLoaderService documentLoaderService;
+
+    @Inject
+    FileStorageService fileStorageService;
+
+    @Inject
+    DomainRankings domainRankings;
+
+    @Inject
+    DocumentDbWriter documentDbWriter;
+    @Inject
+    LoaderIndexJournalWriter journalWriter;
+
+    Path warcData = null;
+    Path crawlDataParquet = null;
+    Path processedDataDir = null;
+
+    @Inject
+    StatefulIndex statefulIndex;
+    @Inject
+    IndexGrpcService indexGrpcService;
+    @Inject
+    DocumentDbReader documentDbReader;
+
+    @Inject
+    QueryFactory queryFactory;
+
+    @BeforeEach
+    public void setupTest() throws IOException {
+        testModule = new IntegrationTestModule();
+
+        Guice.createInjector(testModule).injectMembers(this);
+
+        warcData = Files.createTempFile("warc", ".warc.gz");
+        crawlDataParquet = Files.createTempFile("crawl", ".parquet");
+        processedDataDir = Files.createTempDirectory("processed");
+    }
+
+    @AfterEach
+    public void tearDownTest() throws IOException {
+        Files.deleteIfExists(warcData);
+        Files.deleteIfExists(crawlDataParquet);
+        TestUtil.clearTempDir(processedDataDir);
+
+        testModule.cleanUp();
+    }
+
+
+    @Test
+    public void run() throws Exception {
+
+        /** CREATE WARC */
+        try (WarcRecorder warcRecorder = new WarcRecorder(warcData)) {
+            warcRecorder.writeWarcinfoHeader("127.0.0.1", new EdgeDomain("www.example.com"),
+                    new DomainProber.ProbeResultOk(new EdgeUrl("https://www.example.com/")));
+
+            warcRecorder.writeReferenceCopy(new EdgeUrl("https://www.example.com/"),
+                    "text/html", 200,
+                    """
+                    <html>
+                    <head>
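+                    <!-- Fixture page: the query at the end of the test matches the phrase "This is how thinking works" in the body text below -->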
+                    <title>Hello World</title>
+                    </head>
+                    <body>
+                    <p>The best description of my problem solving process is the Feynman algorithm, which is sometimes presented as a joke where the hidden subtext is “be smart”, but I disagree. The “algorithm” is a surprisingly lucid description of how thinking works in the context of hard problems where the answer can’t simply be looked up or trivially broken down, iterated upon in a bottom-up fashion, or approached with similar methods.
+                    The trick is that there is no trick. This is how thinking works. It appears that when you feed your brain related information, without further active involvement, it starts to digest the information you’ve fed it.</p>
+                    </body>
+                    </html>
+                    """,
+                    ContentTags.empty()
+            );
+        }
+
+        /** CONVERT WARC */
+        CrawledDocumentParquetRecordFileWriter.convertWarc(
+                "www.example.com",
+                new UserAgent("search.marginalia.nu",
+                        "search.marginalia.nu"),
+                warcData,
+                crawlDataParquet);
+
+        /** PROCESS CRAWL DATA */
+
+        var processedDomain = domainProcessor.fullProcessing(CrawledDomainReader.createDataStream(crawlDataParquet));
+
+        System.out.println(processedDomain);
+
+        /** WRITE PROCESSED DATA */
+
+        try (ConverterBatchWriter cbw = new ConverterBatchWriter(processedDataDir, 0)) {
+            cbw.writeProcessedDomain(processedDomain);
+
+        }
+
+        // Write a single batch-switch marker in the process log so that the loader will read the data
+        Files.writeString(processedDataDir.resolve("processor.log"), "F\n", StandardOpenOption.CREATE_NEW);
+
+        /** LOAD PROCESSED DATA */
+
+        LoaderInputData inputData = new LoaderInputData(List.of(processedDataDir));
+
+        DomainIdRegistry domainIdRegistry = Mockito.mock(DomainIdRegistry.class);
+        when(domainIdRegistry.getDomainId(any())).thenReturn(1);
+
+        linksService.loadLinks(domainIdRegistry, new FakeProcessHeartbeat(), inputData);
+        keywordLoaderService.loadKeywords(domainIdRegistry, new FakeProcessHeartbeat(), inputData);
+        documentLoaderService.loadDocuments(domainIdRegistry, new FakeProcessHeartbeat(), inputData);
+
+        // These must be closed to finalize the associated files
+        documentDbWriter.close();
+        keywordLoaderService.close();
+
+        /** CONSTRUCT INDEX */
+
+        createForwardIndex();
+        createFullReverseIndex();
+        createPrioReverseIndex();
+
+        /** SWITCH INDEX */
+
+        statefulIndex.switchIndex();
+
+        // Move the docdb file to the live location
+        Files.move(
+                IndexLocations.getLinkdbWritePath(fileStorageService).resolve(DOCDB_FILE_NAME),
+                IndexLocations.getLinkdbLivePath(fileStorageService).resolve(DOCDB_FILE_NAME)
+        );
+        // Reconnect the document reader to the new docdb file
+        documentDbReader.reconnect();
+
+        /** QUERY */
+
+        var request = RpcQsQuery.newBuilder()
+                .setQueryLimits(RpcQueryLimits.newBuilder()
+                        .setTimeoutMs(1000)
+                        .setResultsTotal(100)
+                        .setResultsByDomain(10)
+                        .setFetchSize(1000)
+                        .build())
+                .setQueryStrategy("AUTO")
+                .setHumanQuery("\"This is how thinking works\"")
+                .build();
+
+        var params = QueryProtobufCodec.convertRequest(request);
+
+        var p = ResultRankingParameters.sensibleDefaults();
+        p.exportDebugData = true;
+        var query = queryFactory.createQuery(params, p);
+
+
+        var indexRequest = QueryProtobufCodec.convertQuery(request, query);
+
+        System.out.println(indexRequest);
+
+        var rs = indexGrpcService.executeSearch(new SearchParameters(indexRequest, new SearchSetAny()));
+
+        System.out.println(rs);
+    }
+
+
+    private void createFullReverseIndex() throws IOException {
+
+        Path outputFileDocs = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.NEXT);
+        Path outputFileWords = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.NEXT);
+        Path outputFilePositions = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.NEXT);
+
+        Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
+        Path tmpDir = workDir.resolve("tmp");
+
+        if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
+
+        var constructor = new FullIndexConstructor(
+                outputFileDocs,
+                outputFileWords,
+                outputFilePositions,
+                this::addRankToIdEncoding,
+                tmpDir);
+
+        constructor.createReverseIndex(new FakeProcessHeartbeat(), "createReverseIndexFull", workDir);
+
+    }
+
+    private void createPrioReverseIndex() throws IOException {
+
+        Path outputFileDocs = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.NEXT);
+        Path outputFileWords = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.NEXT);
+
+        Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
+        Path tmpDir = workDir.resolve("tmp");
+
+        var constructor = new PrioIndexConstructor(
+                outputFileDocs,
+                outputFileWords,
+                this::addRankToIdEncoding,
+                tmpDir);
+
+        constructor.createReverseIndex(new FakeProcessHeartbeat(), "createReverseIndexPrio", workDir);
+    }
+
+    private void createForwardIndex() throws IOException {
+
+        Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
+        Path outputFileDocsId = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT);
+        Path outputFileSpansData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.NEXT);
+        Path outputFileDocsData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT);
+
+        ForwardIndexConverter converter = new ForwardIndexConverter(new FakeProcessHeartbeat(),
+                outputFileDocsId,
+                outputFileDocsData,
+                outputFileSpansData,
+                IndexJournal.findJournal(workDir).orElseThrow(),
+                domainRankings
+        );
+
+        converter.convert();
+    }
+
+    private long addRankToIdEncoding(long docId) {
+        return UrlIdCodec.addRank(
+                255,
+                docId);
+    }
+
+}
diff --git a/code/tools/integration-test/test/nu/marginalia/test/IntegrationTestModule.java b/code/tools/integration-test/test/nu/marginalia/test/IntegrationTestModule.java
new file mode 100644
index 00000000..83f79fbf
--- /dev/null
+++ b/code/tools/integration-test/test/nu/marginalia/test/IntegrationTestModule.java
@@ -0,0 +1,165 @@
+package nu.marginalia.test;
+
+import com.google.inject.AbstractModule;
+import com.google.inject.Inject;
+import com.google.inject.Provides;
+import com.google.inject.Singleton;
+import com.google.inject.name.Names;
+import gnu.trove.list.array.TIntArrayList;
+import nu.marginalia.IndexLocations;
+import nu.marginalia.LanguageModels;
+import nu.marginalia.ProcessConfiguration;
+import nu.marginalia.WmsaHome;
+import nu.marginalia.db.DomainTypes;
+import nu.marginalia.index.domainrankings.DomainRankings;
+import nu.marginalia.index.journal.IndexJournalSlopWriter;
+import nu.marginalia.index.searchset.SearchSetAny;
+import nu.marginalia.index.searchset.SearchSetsService;
+import nu.marginalia.linkdb.docs.DocumentDbReader;
+import nu.marginalia.linkdb.docs.DocumentDbWriter;
+import nu.marginalia.linkgraph.io.DomainLinksWriter;
+import nu.marginalia.process.control.FakeProcessHeartbeat;
+import nu.marginalia.process.control.ProcessHeartbeat;
+import nu.marginalia.service.ServiceId;
+import nu.marginalia.service.control.FakeServiceHeartbeat;
+import nu.marginalia.service.control.ServiceEventLog;
+import nu.marginalia.service.control.ServiceHeartbeat;
+import nu.marginalia.service.module.ServiceConfiguration;
+import nu.marginalia.storage.FileStorageService;
+import nu.marginalia.storage.model.FileStorageBase;
+import nu.marginalia.storage.model.FileStorageBaseType;
+import org.mockito.Mockito;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Random;
+import java.util.UUID;
+
+import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME;
+import static nu.marginalia.linkdb.LinkdbFileNames.DOMAIN_LINKS_FILE_NAME;
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.Mockito.when;
+
+public class IntegrationTestModule extends AbstractModule {
+    Path workDir;
+    Path slowDir;
+    Path fastDir;
+    Path indexDir;
+
+    Random random = new Random();
+
+    public IntegrationTestModule() throws IOException {
+        workDir = Files.createTempDirectory("IntegrationTest");
+        slowDir = workDir.resolve("slow");
+        fastDir = workDir.resolve("fast");
+        indexDir = workDir.resolve("index");
+
+        Files.createDirectory(slowDir);
+        Files.createDirectory(fastDir);
+    }
+
+    public void cleanUp() {
+        TestUtil.clearTempDir(workDir);
+    }
+
+    @Override
+    protected void configure() {
+
+        try {
+            var fileStorageServiceMock = Mockito.mock(FileStorageService.class);
+            Mockito.when(fileStorageServiceMock.getStorageBase(FileStorageBaseType.WORK))
+                    .thenReturn(new FileStorageBase(null, null, 0, null, slowDir.toString()));
+            Mockito.when(fileStorageServiceMock.getStorageBase(FileStorageBaseType.CURRENT))
+                    .thenReturn(new FileStorageBase(null, null, 0, null, fastDir.toString()));
+            Mockito.when(fileStorageServiceMock.getStorageBase(FileStorageBaseType.STORAGE))
+                    .thenReturn(new FileStorageBase(null, null, 0, null, fastDir.toString()));
+
+            bind(DocumentDbReader.class).toInstance(new DocumentDbReader(
+                    IndexLocations.getLinkdbLivePath(fileStorageServiceMock)
+                            .resolve(DOCDB_FILE_NAME)
+            ));
+
+            bind(FileStorageService.class).toInstance(fileStorageServiceMock);
+            bind(ServiceHeartbeat.class).toInstance(new FakeServiceHeartbeat());
+            bind(ProcessHeartbeat.class).toInstance(new FakeProcessHeartbeat());
+
+            SearchSetsService setsServiceMock = Mockito.mock(SearchSetsService.class);
+            when(setsServiceMock.getSearchSetByName("NONE")).thenReturn(new SearchSetAny());
+            when(setsServiceMock.getDomainRankings()).thenReturn(new DomainRankings());
+            bind(SearchSetsService.class).toInstance(setsServiceMock);
+
+            DomainTypes domainTypes = Mockito.mock(DomainTypes.class);
+            when(domainTypes.getAllDomainsByType(any())).thenReturn(new ArrayList<>());
+            when(domainTypes.getKnownDomainsByType(any())).thenReturn(new TIntArrayList());
+            when(domainTypes.downloadList(any())).thenReturn(new ArrayList<>());
+            bind(DomainTypes.class).toInstance(domainTypes);
+
+            bind(ServiceEventLog.class).toInstance(Mockito.mock(ServiceEventLog.class));
+
+            bind(IndexJournalSlopWriter.class).toInstance(new IndexJournalSlopWriter(
+                    IndexLocations.getIndexConstructionArea(fileStorageServiceMock),
+                    0
+            ));
+
+            bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration(
+                    ServiceId.Index,
+                    0,
+                    "127.0.0.1",
+                    "127.0.0.1",
+                    randomPort(),
+                    UUID.randomUUID()
+            ));
+
+            bind(ProcessConfiguration.class).toInstance(new ProcessConfiguration(
+                    "TEST",
+                    0,
+                    UUID.randomUUID()));
+
+            bind(Double.class).annotatedWith(Names.named("min-document-quality")).toInstance(-15.);
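+            // Converter document thresholds, supplied to the processing pipeline via @Named bindings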
+            bind(Integer.class).annotatedWith(Names.named("min-document-length")).toInstance(32);
+            bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128);
+            bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255);
+
+            bind(Path.class).annotatedWith(Names.named("local-index-path")).toInstance(indexDir);
+
+            bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());
+
+        } catch (IOException | SQLException e) {
+            throw new RuntimeException(e);
+        }
+
+
+    }
+
+
+    @Inject
+    @Provides
+    @Singleton
+    private DocumentDbWriter createLinkdbWriter(FileStorageService service) throws SQLException, IOException {
+        // Migrate
+        Path dbPath = IndexLocations.getLinkdbWritePath(service).resolve(DOCDB_FILE_NAME);
+
+        if (Files.exists(dbPath)) {
+            Files.delete(dbPath);
+        }
+        return new DocumentDbWriter(dbPath);
+    }
+
+    @Inject @Provides @Singleton
+    private DomainLinksWriter createDomainLinkdbWriter(FileStorageService service) throws SQLException, IOException {
+
+        Path dbPath = IndexLocations.getLinkdbWritePath(service).resolve(DOMAIN_LINKS_FILE_NAME);
+
+        if (Files.exists(dbPath)) {
+            Files.delete(dbPath);
+        }
+
+        return new DomainLinksWriter(dbPath);
+    }
+
+    private int randomPort() {
+        return random.nextInt(10000, 30000);
+    }
+}
diff --git a/code/tools/screenshot-capture-tool/build.gradle b/code/tools/screenshot-capture-tool/build.gradle
index a022c803..dd8f99c0 100644
--- a/code/tools/screenshot-capture-tool/build.gradle
+++ b/code/tools/screenshot-capture-tool/build.gradle
@@ -3,7 +3,7 @@ plugins {
     id 'application'
     id 'jvm-test-suite'
 
-    id 'com.google.cloud.tools.jib' version '3.4.2'
+    id 'com.google.cloud.tools.jib' version '3.4.3'
 }
 
 java {
diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties
index 48c0a02c..0d184210 100644
--- a/gradle/wrapper/gradle-wrapper.properties
+++ b/gradle/wrapper/gradle-wrapper.properties
@@ -1,5 +1,5 @@
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-8.7-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-8.8-bin.zip
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
diff --git a/settings.gradle b/settings.gradle
index a67a0310..9d4810e5 100644
--- a/settings.gradle
+++ b/settings.gradle
@@ -37,12 +37,12 @@ include 'code:index:index-reverse'
 
 include 'code:libraries:array'
 include 'code:libraries:array:cpp'
+include 'code:libraries:coded-sequence'
 include 'code:libraries:geo-ip'
 include 'code:libraries:btree'
 include 'code:libraries:easy-lsh'
 include 'code:libraries:guarded-regex'
 include 'code:libraries:random-write-funnel'
-include 'code:libraries:next-prime'
 include 'code:libraries:blocking-thread-pool'
 include 'code:libraries:braille-block-punch-cards'
 include 'code:libraries:language-processing'
@@ -55,21 +55,14 @@ include 'code:features-search:screenshots'
 include 'code:features-search:random-websites'
 include 'code:features-search:feedlot-client'
 
-include 'code:features-convert:adblock'
-include 'code:features-convert:anchor-keywords'
-include 'code:features-convert:data-extractors'
-include 'code:features-convert:stackexchange-xml'
-include 'code:features-convert:reddit-json'
-include 'code:features-convert:pubdate'
-include 'code:features-convert:summary-extraction'
-include 'code:features-convert:keyword-extraction'
-include 'code:features-convert:topic-detection'
+include 'code:processes:converting-process:ft-anchor-keywords'
+include 'code:execution:data-extractors'
 
-include 'code:features-crawl:crawl-blocklist'
-include 'code:features-crawl:link-parser'
-include 'code:features-crawl:content-type'
+include 'code:processes:crawling-process:ft-crawl-blocklist'
+include 'code:processes:crawling-process:ft-link-parser'
+include 'code:processes:crawling-process:ft-content-type'
 
-include 'code:process-mqapi'
+include 'code:processes:process-mq-api'
 
 include 'code:common:db'
 include 'code:common:linkdb'
@@ -80,20 +73,21 @@ include 'code:common:renderer'
 include 'code:common:process'
 
 include 'code:processes:converting-process'
+include 'code:processes:converting-process:model'
+include 'code:processes:converting-process:ft-keyword-extraction'
+
 include 'code:processes:crawling-process'
+include 'code:processes:crawling-process:model'
+
 include 'code:processes:loading-process'
 include 'code:processes:index-constructor-process'
 include 'code:processes:test-data'
 include 'code:processes:website-adjacencies-calculator'
 
-include 'code:process-models:crawling-model'
-include 'code:process-models:work-log'
-include 'code:process-models:crawl-spec'
-include 'code:process-models:processed-data'
-
 include 'code:tools:experiment-runner'
 include 'code:tools:screenshot-capture-tool'
 include 'code:tools:load-test'
+include 'code:tools:integration-test'
 
 include 'third-party:porterstemmer'
 include 'third-party:symspell'
@@ -112,6 +106,8 @@ dependencyResolutionManagement {
         maven { url "https://repo1.maven.org/maven2/" }
         maven { url "https://www2.ph.ed.ac.uk/maven2/" }
         maven { url "https://jitpack.io/" }
+        maven { url "https://artifacts.marginalia.nu/snapshots" }
+
         exclusiveContent {
             forRepository {
                 maven {
@@ -123,6 +119,18 @@
                 includeModule("com.github.Marcono1234", "gson-record-type-adapter-factory")
             }
         }
+
+        exclusiveContent {
+            forRepository {
+                maven {
+                    url = uri("https://artifacts.marginalia.nu/snapshots")
+                }
+            }
+            filter {
+                // Only use the Marginalia snapshot repository for the `slop` library
+                includeModule("nu.marginalia", "slop")
+            }
+        }
     }
 
     versionCatalogs {
@@ -218,6 +226,8 @@
             library('jetty-util','org.eclipse.jetty','jetty-util').version('9.4.54.v20240208')
             library('jetty-servlet','org.eclipse.jetty','jetty-servlet').version('9.4.54.v20240208')
 
+            library('slop', 'nu.marginalia', 'slop').version('0.0.8-SNAPSHOT')
+
             bundle('jetty', ['jetty-server', 'jetty-util', 'jetty-servlet'])
 
             bundle('slf4j', ['slf4j.api', 'log4j.api', 'log4j.core', 'log4j.slf4j'])
diff --git a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/BinarySerializable.java b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/BinarySerializable.java
new file mode 100644
index 00000000..0040b57c
--- /dev/null
+++ b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/BinarySerializable.java
@@ -0,0 +1,5 @@
+package blue.strategic.parquet;
+
+public interface BinarySerializable {
+    byte[] bytes();
+}
diff --git a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java
index 6d9b5734..53de4682 100644
--- a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java
+++ b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ParquetWriter.java
@@ -138,6 +138,15 @@ public final class ParquetWriter implements Closeable {
             SimpleWriteSupport.this.writeList(name, value);
         }
 
+        @Override
+        public void writeBinarySerializableList(String name, List<? extends BinarySerializable> value) {
+            if (value.isEmpty()) {
+                return;
+            }
+
+            SimpleWriteSupport.this.writeBinarySerializableList(name, value);
+        }
+
         @Override
         public void writeList(String name, TIntList value) {
             if (value.isEmpty()) {
@@ -209,6 +218,18 @@ public final class ParquetWriter implements Closeable {
         recordConsumer.endField(name, fieldIndex);
     }
 
+    private void writeBinarySerializableList(String name, List<? extends BinarySerializable> values) {
+        int fieldIndex = schema.getFieldIndex(name);
+        PrimitiveType type = schema.getType(fieldIndex).asPrimitiveType();
+        recordConsumer.startField(name, fieldIndex);
+
+        for (var value : values) {
+            writeValue(type, value.bytes());
+        }
+
+        recordConsumer.endField(name, fieldIndex);
+    }
+
     private void writeList(String name, TIntList values) {
         int fieldIndex = schema.getFieldIndex(name);
         PrimitiveType type = schema.getType(fieldIndex).asPrimitiveType();
diff --git a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ValueWriter.java b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ValueWriter.java
index 962f3b50..aa07ba71 100644
--- a/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ValueWriter.java
+++ b/third-party/parquet-floor/src/main/java/blue/strategic/parquet/ValueWriter.java
@@ -9,5 +9,6 @@ public interface ValueWriter {
     void write(String name, Object value);
     void writeList(String name, List<?> value);
     void writeList(String name, TLongList value);
+    void writeBinarySerializableList(String name, List<? extends BinarySerializable> value);
     void writeList(String name, TIntList value);
 }