diff --git a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSubquery.java b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSubquery.java
index 3c2c4bbc..1cc1edd8 100644
--- a/code/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSubquery.java
+++ b/code/api/index-api/src/main/java/nu/marginalia/index/client/model/query/SearchSubquery.java
@@ -2,12 +2,14 @@ package nu.marginalia.index.client.model.query;
 
 import lombok.AllArgsConstructor;
 import lombok.Getter;
+import lombok.With;
 
 import java.util.List;
 import java.util.stream.Collectors;
 
 @Getter
 @AllArgsConstructor
+@With
 public class SearchSubquery {
 
     /** These terms must be present in the document and are used in ranking */
@@ -27,6 +29,14 @@ public class SearchSubquery {
 
     private double value = 0;
 
+    public SearchSubquery() {
+        this.searchTermsInclude = List.of();
+        this.searchTermsExclude = List.of();
+        this.searchTermsAdvice = List.of();
+        this.searchTermsPriority = List.of();
+        this.searchTermCoherences = List.of();
+    }
+
     public SearchSubquery(List<String> searchTermsInclude,
                           List<String> searchTermsExclude,
                           List<String> searchTermsAdvice,
diff --git a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindex.java b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindex.java
index 284f7df7..52ba642b 100644
--- a/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindex.java
+++ b/code/features-index/index-reverse/src/main/java/nu/marginalia/index/construction/ReversePreindex.java
@@ -185,6 +185,8 @@ public class ReversePreindex {
         Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat");
         Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat");
 
+        // We need the total size of the merged segment up front in order to request
+        // a direct LongArray range.  This seems slower, but is in fact faster;
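+        // (presumably because a destination of known size can be allocated once
+        // and written into directly, rather than grown and copied as the merge
+        // proceeds)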
+        // ... see LongArray.directRangeIfPossible(long start, long end)
 
         long segmentsSize = countDistinctElements(left.wordIds, right.wordIds,
                 0, left.wordIds.size(),
                 0, right.wordIds.size());
diff --git a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java
index 9c6ca197..1ff444ad 100644
--- a/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java
+++ b/code/services-core/index-service/src/main/java/nu/marginalia/index/results/IndexMetadataService.java
@@ -139,7 +139,8 @@ public class IndexMetadataService {
         for (var coherenceSet : coherences.words()) {
             long overlap = 0xFF_FFFF_FFFF_FFFFL;
             for (var word : coherenceSet) {
-                overlap &= WordMetadata.decodePositions(getTermMetadata(word, docId));
+                long positions = WordMetadata.decodePositions(getTermMetadata(word, docId));
+                overlap &= positions;
             }
             if (overlap == 0L) {
                 return false;
diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationSmokeTest.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationSmokeTest.java
new file mode 100644
index 00000000..560150f6
--- /dev/null
+++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationSmokeTest.java
@@ -0,0 +1,299 @@
+package nu.marginalia.index.svc;
+
+import com.google.inject.Guice;
+import com.google.inject.Inject;
+import nu.marginalia.db.storage.FileStorageService;
+import nu.marginalia.db.storage.model.FileStorage;
+import nu.marginalia.db.storage.model.FileStorageType;
+import nu.marginalia.hash.MurmurHash3_128;
+import nu.marginalia.index.ReverseIndexFullFileNames;
+import nu.marginalia.index.ReverseIndexPrioFileNames;
+import nu.marginalia.index.client.model.query.SearchSpecification;
+import nu.marginalia.index.client.model.query.SearchSubquery;
+import nu.marginalia.index.client.model.query.SearchSetIdentifier;
+import nu.marginalia.index.client.model.results.ResultRankingParameters;
+import nu.marginalia.index.client.model.results.SearchResultItem;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.ReverseIndexConstructor;
+import nu.marginalia.index.forward.ForwardIndexConverter;
+import nu.marginalia.index.forward.ForwardIndexFileNames;
+import nu.marginalia.index.index.SearchIndex;
+import nu.marginalia.index.journal.model.IndexJournalEntryData;
+import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
+import nu.marginalia.index.journal.reader.IndexJournalReader;
+import nu.marginalia.index.journal.writer.IndexJournalWriter;
+import nu.marginalia.index.query.limit.QueryLimits;
+import nu.marginalia.index.query.limit.QueryStrategy;
+import nu.marginalia.index.query.limit.SpecificationLimit;
+import nu.marginalia.model.id.UrlIdCodec;
+import nu.marginalia.model.idx.DocumentFlags;
+import nu.marginalia.model.idx.WordFlags;
+import nu.marginalia.model.idx.DocumentMetadata;
+import nu.marginalia.model.idx.WordMetadata;
+import nu.marginalia.process.control.FakeProcessHeartbeat;
+import nu.marginalia.process.control.ProcessHeartbeat;
+import nu.marginalia.ranking.DomainRankings;
+import nu.marginalia.service.control.ServiceHeartbeat;
+import nu.marginalia.service.server.Initialization;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.parallel.Execution;
+import spark.Spark;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.SQLException;
+import java.util.*;
+import java.util.function.Function;
+import java.util.stream.IntStream;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD;
+
+@Execution(SAME_THREAD)
+public class IndexQueryServiceIntegrationSmokeTest {
+
+    @Inject
+    Initialization initialization;
+
+    IndexQueryServiceIntegrationTestModule testModule;
+
+    @Inject
+    IndexQueryService queryService;
+    @Inject
+    SearchIndex searchIndex;
+
+    @Inject
+    ServiceHeartbeat heartbeat;
+
+    @Inject
+    IndexJournalWriter indexJournalWriter;
+
+    @Inject
+    FileStorageService fileStorageService;
+
+    @Inject
+    DomainRankings domainRankings;
+
+    @Inject
+    ProcessHeartbeat processHeartbeat;
+
+    @BeforeEach
+    public void setUp() throws IOException {
+
+        testModule = new IndexQueryServiceIntegrationTestModule();
+        Guice.createInjector(testModule).injectMembers(this);
+
+        initialization.setReady();
+    }
+
+    @AfterEach
+    public void tearDown() throws IOException {
+        testModule.cleanUp();
+
+        Spark.stop();
+    }
+
+    @Test
+    public void willItBlend() throws Exception {
+        for (int i = 1; i < 512; i++) {
+            loadData(i);
+        }
+
+        indexJournalWriter.close();
+        constructIndex();
+        searchIndex.switchIndex();
+
+        var rsp = queryService.justQuery(
+                SearchSpecification.builder()
+                        .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
+                        .queryStrategy(QueryStrategy.SENTENCE)
+                        .year(SpecificationLimit.none())
+                        .quality(SpecificationLimit.none())
+                        .size(SpecificationLimit.none())
+                        .rank(SpecificationLimit.none())
+                        .rankingParams(ResultRankingParameters.sensibleDefaults())
+                        .domains(new ArrayList<>())
+                        .searchSetIdentifier(SearchSetIdentifier.NONE)
+                        .subqueries(List.of(new SearchSubquery(
+                                List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
+                                Collections.emptyList()))).build());
+
+        int[] idxes = new int[] { 30, 510, 90, 150, 210, 270, 330, 390, 450 };
+        long[] ids = IntStream.of(idxes).mapToLong(this::fullId).toArray();
+        long[] actual = rsp.results
+                .stream()
+                .mapToLong(SearchResultItem::getDocumentId)
+                .toArray();
+
+        Assertions.assertArrayEquals(ids, actual);
+    }
+
+    @Test
+    public void testDomainQuery() throws Exception {
+        for (int i = 1; i < 512; i++) {
+            loadDataWithDomain(i/100, i);
+        }
+
+        indexJournalWriter.close();
+        constructIndex();
+        searchIndex.switchIndex();
+
+        var rsp = queryService.justQuery(
+                SearchSpecification.builder()
+                        .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
+                        .year(SpecificationLimit.none())
+                        .quality(SpecificationLimit.none())
+                        .size(SpecificationLimit.none())
+                        .rank(SpecificationLimit.none())
+                        .rankingParams(ResultRankingParameters.sensibleDefaults())
+                        .queryStrategy(QueryStrategy.SENTENCE)
+                        .domains(List.of(2))
+                        .subqueries(List.of(new SearchSubquery(
+                                List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
+                                Collections.emptyList()))).build());
+
+        int[] idxes = new int[] { 210, 270 };
+        long[] ids = IntStream.of(idxes).mapToLong(id -> UrlIdCodec.encodeId(id/100, id)).toArray();
+        long[] actual = rsp.results.stream().mapToLong(SearchResultItem::getDocumentId).toArray();
+
+        Assertions.assertArrayEquals(ids, actual);
+    }
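+
+    // Note on the test data: loadData(id) (see below) indexes document id under
+    // each of its divisors as a keyword.  The include set {"3", "5", "2"} with
+    // exclude {"4"} above thus matches the multiples of 30 that are not multiples
+    // of 4, and the query on "4" below matches the multiples of 4.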
+    @Test
+    public void testYearQuery() throws Exception {
+        for (int i = 1; i < 512; i++) {
+            loadData(i);
+        }
+
+        indexJournalWriter.close();
+        constructIndex();
+        searchIndex.switchIndex();
+
+        var rsp = queryService.justQuery(
+                SearchSpecification.builder()
+                        .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
+                        .quality(SpecificationLimit.none())
+                        .year(SpecificationLimit.equals(1998))
+                        .size(SpecificationLimit.none())
+                        .rank(SpecificationLimit.none())
+                        .queryStrategy(QueryStrategy.SENTENCE)
+                        .searchSetIdentifier(SearchSetIdentifier.NONE)
+                        .rankingParams(ResultRankingParameters.sensibleDefaults())
+                        .subqueries(List.of(new SearchSubquery(
+                                List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(),
+                                Collections.emptyList()))
+                        ).build());
+
+        Set<Integer> years = new HashSet<>();
+
+        for (var res : rsp.results) {
+            for (var score : res.getKeywordScores()) {
+                years.add(DocumentMetadata.decodeYear(score.encodedDocMetadata()));
+            }
+        }
+
+        assertEquals(Set.of(1998), years);
+        assertEquals(10, rsp.results.size());
+    }
+
+    private void constructIndex() throws SQLException, IOException {
+        createForwardIndex();
+        createFullReverseIndex();
+        createPrioReverseIndex();
+    }
+
+    private void createFullReverseIndex() throws SQLException, IOException {
+
+        FileStorage indexLive = fileStorageService.getStorageByType(FileStorageType.INDEX_LIVE);
+        FileStorage indexStaging = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING);
+
+        Path outputFileDocs = ReverseIndexFullFileNames.resolve(indexLive.asPath(), ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.NEXT);
+        Path outputFileWords = ReverseIndexFullFileNames.resolve(indexLive.asPath(), ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.NEXT);
+
+        Path tmpDir = indexStaging.asPath().resolve("tmp");
+        if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
+
+        ReverseIndexConstructor.
+                createReverseIndex(new FakeProcessHeartbeat(), IndexJournalReader::singleFile, indexStaging.asPath(), DocIdRewriter.identity(), tmpDir, outputFileDocs, outputFileWords);
+    }
+
+    private void createPrioReverseIndex() throws SQLException, IOException {
+
+        FileStorage indexLive = fileStorageService.getStorageByType(FileStorageType.INDEX_LIVE);
+        FileStorage indexStaging = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING);
+
+        Path outputFileDocs = ReverseIndexPrioFileNames.resolve(indexLive.asPath(), ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.NEXT);
+        Path outputFileWords = ReverseIndexPrioFileNames.resolve(indexLive.asPath(), ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.NEXT);
+
+        Path tmpDir = indexStaging.asPath().resolve("tmp");
+        if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
+
+        ReverseIndexConstructor.
+                createReverseIndex(new FakeProcessHeartbeat(), IndexJournalReader::singleFile, indexStaging.asPath(), DocIdRewriter.identity(), tmpDir, outputFileDocs, outputFileWords);
+    }
+
+    private void createForwardIndex() throws SQLException, IOException {
+
+        FileStorage indexLive = fileStorageService.getStorageByType(FileStorageType.INDEX_LIVE);
+        FileStorage indexStaging = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING);
+
+        Path outputFileDocsId = ForwardIndexFileNames.resolve(indexLive.asPath(), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT);
+        Path outputFileDocsData = ForwardIndexFileNames.resolve(indexLive.asPath(), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT);
+
+        ForwardIndexConverter converter = new ForwardIndexConverter(processHeartbeat,
+                IndexJournalReader.paging(indexStaging.asPath()),
+                outputFileDocsId,
+                outputFileDocsData,
+                domainRankings
+        );
+
+        converter.convert();
+    }
+
+    private long fullId(int id) {
+        return UrlIdCodec.encodeId((32 - (id % 32)), id);
+    }
+
+    MurmurHash3_128 hasher = new MurmurHash3_128();
+
+    public void loadData(int id) {
+        int[] factors = IntStream
+                .rangeClosed(1, id)
+                .filter(v -> (id % v) == 0)
+                .toArray();
+
+        long fullId = fullId(id);
+
+        var header = new IndexJournalEntryHeader(factors.length, 0, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
+
+        long[] data = new long[factors.length*2];
+        for (int i = 0; i < factors.length; i++) {
+            data[2*i] = hasher.hashNearlyASCII(Integer.toString(factors[i]));
+            data[2*i + 1] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
+        }
+
+        indexJournalWriter.put(header, new IndexJournalEntryData(data));
+    }
+
+    public void loadDataWithDomain(int domain, int id) {
+        int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
+        var header = new IndexJournalEntryHeader(factors.length, 0, UrlIdCodec.encodeId(domain, id), DocumentMetadata.defaultValue());
+
+        long[] data = new long[factors.length*2];
+        for (int i = 0; i < factors.length; i++) {
+            data[2*i] = hasher.hashNearlyASCII(Integer.toString(factors[i]));
+            data[2*i + 1] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
+        }
+
+        indexJournalWriter.put(header, new IndexJournalEntryData(data));
+    }
+
+}
diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java
index 7efa08a2..a97d6952 100644
--- a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java
+++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTest.java
@@ -8,11 +8,10 @@ import nu.marginalia.db.storage.model.FileStorageType;
 import nu.marginalia.hash.MurmurHash3_128;
 import nu.marginalia.index.ReverseIndexFullFileNames;
 import nu.marginalia.index.ReverseIndexPrioFileNames;
+import nu.marginalia.index.client.model.query.SearchSetIdentifier;
 import nu.marginalia.index.client.model.query.SearchSpecification;
 import nu.marginalia.index.client.model.query.SearchSubquery;
-import nu.marginalia.index.client.model.query.SearchSetIdentifier;
 import nu.marginalia.index.client.model.results.ResultRankingParameters;
-import nu.marginalia.index.client.model.results.SearchResultItem;
 import nu.marginalia.index.construction.DocIdRewriter;
 import nu.marginalia.index.construction.ReverseIndexConstructor;
 import nu.marginalia.index.forward.ForwardIndexConverter;
@@ -25,9 +24,11 @@ import nu.marginalia.index.journal.writer.IndexJournalWriter;
 import nu.marginalia.index.query.limit.QueryLimits;
 import nu.marginalia.index.query.limit.QueryStrategy;
 import nu.marginalia.index.query.limit.SpecificationLimit;
+import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.model.id.UrlIdCodec;
-import nu.marginalia.model.idx.WordFlags;
+import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
+import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.model.idx.WordMetadata;
 import nu.marginalia.process.control.FakeProcessHeartbeat;
 import nu.marginalia.process.control.ProcessHeartbeat;
@@ -35,18 +36,18 @@ import nu.marginalia.ranking.DomainRankings;
 import nu.marginalia.service.control.ServiceHeartbeat;
 import nu.marginalia.service.server.Initialization;
 import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.parallel.Execution;
 import spark.Spark;
 
+import javax.annotation.CheckReturnValue;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
 import java.util.*;
-import java.util.stream.IntStream;
+import java.util.function.Function;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD;
@@ -96,38 +97,379 @@ public class IndexQueryServiceIntegrationTest {
     }
 
     @Test
-    public void willItBlend() throws Exception {
-        for (int i = 1; i < 512; i++) {
-            loadData(i);
+    public void testNoPositionsOnlyFlags() throws Exception {
+        // Test the case where positions are absent but flags are present
+
+        new MockData().add( // should be included despite no position
+                d(1, 1),
+                new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))),
+                w("hello", WordFlags.Title),
+                w("world", WordFlags.Title)
+        ).load();
+
+        var query = basicQuery(builder -> builder.subqueries(justInclude("hello", "world")));
+
+        executeSearch(query)
+                .expectDocumentsInOrder(d(1,1));
+    }
+
+    @Test
+    public void testMissingKeywords() throws Exception {
+        // Test cases where the user enters search terms that are missing from the lexicon
+
+        new MockData().add(
+                d(1, 1),
+                new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))),
+                w("hello", WordFlags.Title),
+                w("world", WordFlags.Title)
+        ).load();
+
+        var queryMissingExclude = basicQuery(builder ->
+                builder.subqueries(includeAndExclude("hello", "missing")));
+
+        executeSearch(queryMissingExclude)
+                .expectDocumentsInOrder(d(1,1));
+
+        var queryMissingInclude = basicQuery(builder ->
+                builder.subqueries(justInclude("missing")));
+
+        executeSearch(queryMissingInclude)
+                .expectCount(0);
+
+        var queryMissingPriority = basicQuery(builder ->
+                builder.subqueries(
+                        List.of(
+                                new SearchSubquery(
+                                        List.of("hello"),
+                                        List.of(),
+                                        List.of(),
+                                        List.of("missing"),
+                                        List.of()
+                                )
+                        )));
+
+        executeSearch(queryMissingPriority)
+                .expectCount(1);
+
+        var queryMissingAdvice = basicQuery(builder ->
+                builder.subqueries(
+                        List.of(
+                                new SearchSubquery(
+                                        List.of("hello"),
+                                        List.of(),
+                                        List.of("missing"),
+                                        List.of(),
+                                        List.of()
+                                )
+                        )));
+
+        executeSearch(queryMissingAdvice)
+                .expectCount(0);
+
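+        // A missing advice term yields zero results: advice terms, like includes,
+        // must be present in the document (they just aren't used in ranking),
+        // whereas priority terms only affect ranking and may be absent.
+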
+        var queryMissingCoherence = basicQuery(builder ->
+                builder.subqueries(
+                        List.of(
+                                new SearchSubquery(
+                                        List.of("hello"),
+                                        List.of(),
+                                        List.of(),
+                                        List.of(),
+                                        List.of(List.of("missing", "hello"))
+                                )
+                        )));
+
+        executeSearch(queryMissingCoherence)
+                .expectCount(0);
+    }
+
+    @Test
+    public void testPositions() throws Exception {
+
+        // Test position rules
+        new MockData()
+                .add( // Case 1: Both words have a position set, should be considered
+                        d(1, 1),
+                        new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))),
+                        w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
+                        w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
+                ).add( // Case 2: Only one of the words has a position set, should not be considered
+                        d(2, 2),
+                        new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))),
+                        w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
+                        w("world", new WordMetadata(0L, EnumSet.noneOf(WordFlags.class)).encode())
+                ).load();
+
+        var query = basicQuery(builder -> builder.subqueries(justInclude("hello", "world")));
+
+        executeSearch(query)
+                .expectDocumentsInOrder(d(1,1));
+    }
+
+    @Test
+    public void testYear() throws Exception {
+
+        // Test year rules
+        new MockData()
+                .add( // Case 1: Document is dated 1999
+                        d(1, 1),
+                        new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(1999), 14, EnumSet.noneOf(DocumentFlags.class))),
+                        w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
+                        w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
+                ).add( // Case 2: Document is dated 2000
+                        d(2, 2),
+                        new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(2000), 14, EnumSet.noneOf(DocumentFlags.class))),
+                        w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
+                        w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
+                )
+                .add( // Case 3: Document is dated 2001
+                        d(3, 3),
+                        new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(2001), 14, EnumSet.noneOf(DocumentFlags.class))),
+                        w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
+                        w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
+                )
+                .load();
+
+        var beforeY2K = basicQuery(builder ->
+                builder.subqueries(justInclude("hello", "world"))
+                        .year(SpecificationLimit.lessThan(2000))
+        );
+        var atY2K = basicQuery(builder ->
+                builder.subqueries(justInclude("hello", "world"))
+                        .year(SpecificationLimit.equals(2000))
+        );
+        var afterY2K = basicQuery(builder ->
+                builder.subqueries(justInclude("hello", "world"))
+                        .year(SpecificationLimit.greaterThan(2000))
+        );
+
+        executeSearch(beforeY2K)
+                .expectDocumentsInOrder(
+                        d(1,1),
+                        d(2,2)
+                );
+        executeSearch(atY2K)
+                .expectDocumentsInOrder(
+                        d(2,2)
+                );
+        executeSearch(afterY2K)
+                .expectDocumentsInOrder(
+                        d(2,2),
+                        d(3,3)
+                );
+    }
+
+    @Test
+    public void testDomain() throws Exception {
+
+        // Test domain filtering
+        new MockData()
+                // docs from domain 1
+                .add(
+                        d(1, 1),
+                        new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(1999), 14, EnumSet.noneOf(DocumentFlags.class))),
+                        w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
+                        w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
+                ).add(
+                        d(1, 2),
+                        new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(2000), 14, EnumSet.noneOf(DocumentFlags.class))),
+                        w("hello", new WordMetadata(1L,
+                                EnumSet.noneOf(WordFlags.class)).encode()),
+                        w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
+                )
+                // docs from domain 2
+                .add(
+                        d(2, 1),
+                        new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(2001), 14, EnumSet.noneOf(DocumentFlags.class))),
+                        w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
+                        w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
+                )
+                .add(
+                        d(2, 2),
+                        new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(2001), 14, EnumSet.noneOf(DocumentFlags.class))),
+                        w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
+                        w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
+                )
+                .load();
+
+        var domain1 = basicQuery(builder ->
+                builder.subqueries(justInclude("hello", "world"))
+                        .domains(List.of(1))
+        );
+        var domain2 = basicQuery(builder ->
+                builder.subqueries(justInclude("hello", "world"))
+                        .domains(List.of(2))
+        );
+
+        executeSearch(domain1)
+                .expectDocumentsInOrder(
+                        d(1,1),
+                        d(1,2)
+                );
+        executeSearch(domain2)
+                .expectDocumentsInOrder(
+                        d(2,1),
+                        d(2,2)
+                );
+    }
+
+    @Test
+    public void testExclude() throws Exception {
+
+        // Test exclude rules
+        new MockData()
+                .add( // Case 1: The required include is present, the excluded term is absent; should be a result
+                        d(1, 1),
+                        new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))),
+                        w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
+                        w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
+                ).add( // Case 2: The required include is present, but so is the excluded term; should not be a result
+                        d(2, 2),
+                        new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))),
+                        w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
+                        w("my_darling", new WordMetadata(0L, EnumSet.noneOf(WordFlags.class)).encode())
+                ).load();
+
+        var query = basicQuery(builder ->
+                builder.subqueries(includeAndExclude("hello", "my_darling"))
+        );
+
+        executeSearch(query)
+                .expectDocumentsInOrder(d(1,1));
+    }
+
+    static class ResultWrapper {
+        private final List<MockDataDocument> actual;
+
+        ResultWrapper(List<MockDataDocument> actual) {
+            this.actual = actual;
         }
 
-        indexJournalWriter.close();
-        constructIndex();
-        searchIndex.switchIndex();
+        public ResultWrapper expectDocumentsInOrder(MockDataDocument... expectedDocs) {
+            assertEquals(List.of(expectedDocs), actual);
+
+            return this;
+        }
+        public ResultWrapper expectDocumentInAnyOrder(MockDataDocument... expectedDocs) {
+            assertEquals(Set.of(expectedDocs), new HashSet<>(actual));
+
+            return this;
+        }
+        public ResultWrapper expectCount(int count) {
+            assertEquals(count, actual.size());
+
+            return this;
+        }
+    }
+
+    @CheckReturnValue
+    ResultWrapper executeSearch(SearchSpecification searchSpecification) {
+        var rsp = queryService.justQuery(searchSpecification);
+
+        List<MockDataDocument> actual = new ArrayList<>();
+
+        System.out.println(rsp);
+
+        for (var result : rsp.results) {
+            long docId = result.getDocumentId();
+            actual.add(new MockDataDocument(UrlIdCodec.getDomainId(docId), UrlIdCodec.getDocumentOrdinal(docId)));
+        }
+
+        return new ResultWrapper(actual);
+    }
+
+    @Test
+    public void testCoherenceRequirement() throws Exception {
+
+        // Test coherence requirement.
+        // Two terms are considered coherent when they appear in the same position.
+        new MockData()
+                .add( // Case 1: Both positions overlap; should be included
+                        d(1, 1),
+                        new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))),
+                        w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
+                        w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
+                )
+                .add( // Case 2: Positions do not overlap, do not include
+                        d(2, 2),
+                        new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))),
+                        w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
+                        w("world", new WordMetadata(2L, EnumSet.noneOf(WordFlags.class)).encode())
+                )
+                .load();
 
         var rsp = queryService.justQuery(
-                SearchSpecification.builder()
-                        .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
-                        .queryStrategy(QueryStrategy.SENTENCE)
-                        .year(SpecificationLimit.none())
-                        .quality(SpecificationLimit.none())
-                        .size(SpecificationLimit.none())
-                        .rank(SpecificationLimit.none())
-                        .rankingParams(ResultRankingParameters.sensibleDefaults())
-                        .domains(new ArrayList<>())
-                        .searchSetIdentifier(SearchSetIdentifier.NONE)
-                        .subqueries(List.of(new SearchSubquery(
-                                List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
-                                Collections.emptyList()))).build());
+                basicQuery(builder -> builder.subqueries(
+                        // note coherence requirement
+                        includeAndCohere("hello", "world")
+                )));
 
-        int[] idxes = new int[] { 30, 510, 90, 150, 210, 270, 330, 390, 450 };
-        long[] ids = IntStream.of(idxes).mapToLong(this::fullId).toArray();
-        long[] actual = rsp.results
-                .stream()
-                .mapToLong(SearchResultItem::getDocumentId)
-                .toArray();
+        assertEquals(1, rsp.results.size());
+        assertEquals(d(1,1).docId(),
+                rsp.results.get(0).getDocumentId());
 
-        Assertions.assertArrayEquals(ids, actual);
+    }
 
+    SearchSpecification basicQuery(Function<SearchSpecification.SearchSpecificationBuilder, SearchSpecification.SearchSpecificationBuilder> mutator)
+    {
+        var builder = SearchSpecification.builder()
+                .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
+                .queryStrategy(QueryStrategy.SENTENCE)
+                .year(SpecificationLimit.none())
+                .quality(SpecificationLimit.none())
+                .size(SpecificationLimit.none())
+                .rank(SpecificationLimit.none())
+                .rankingParams(ResultRankingParameters.sensibleDefaults())
+                .domains(new ArrayList<>())
+                .searchSetIdentifier(SearchSetIdentifier.NONE)
+                .subqueries(List.of());
+
+        return mutator.apply(builder).build();
+    }
+
+    List<SearchSubquery> justInclude(String... includes) {
+        return List.of(new SearchSubquery(
+                List.of(includes),
+                List.of(),
+                List.of(),
+                List.of(),
+                List.of()
+        ));
+    }
+
+    List<SearchSubquery> includeAndExclude(List<String> includes, List<String> excludes) {
+        return List.of(new SearchSubquery(
+                includes,
+                excludes,
+                List.of(),
+                List.of(),
+                List.of()
+        ));
+    }
+
+    List<SearchSubquery> includeAndExclude(String include, String exclude) {
+        return List.of(new SearchSubquery(
+                List.of(include),
+                List.of(exclude),
+                List.of(),
+                List.of(),
+                List.of()
+        ));
+    }
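+
+    // SearchSubquery's five positional lists are, in order:
+    // include, exclude, advice, priority, and coherence sets.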
+    List<SearchSubquery> includeAndCohere(String... includes) {
+        return List.of(new SearchSubquery(
+                List.of(includes),
+                List.of(),
+                List.of(),
+                List.of(),
+                List.of(List.of(includes))
+        ));
+    }
+
+    private MockDataDocument d(int domainId, int ordinal) {
+        return new MockDataDocument(domainId, ordinal);
     }
 
     private void constructIndex() throws SQLException, IOException {
@@ -186,110 +528,64 @@ public class IndexQueryServiceIntegrationTest {
         converter.convert();
     }
 
-    @Test
-    public void testDomainQuery() throws Exception {
-        for (int i = 1; i < 512; i++) {
-            loadDataWithDomain(i/100, i);
-        }
-
-        indexJournalWriter.close();
-        constructIndex();
-        searchIndex.switchIndex();
-
-        var rsp = queryService.justQuery(
-                SearchSpecification.builder()
-                        .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
-                        .year(SpecificationLimit.none())
-                        .quality(SpecificationLimit.none())
-                        .size(SpecificationLimit.none())
-                        .rank(SpecificationLimit.none())
-                        .rankingParams(ResultRankingParameters.sensibleDefaults())
-                        .queryStrategy(QueryStrategy.SENTENCE)
-                        .domains(List.of(2))
-                        .subqueries(List.of(new SearchSubquery(
-                                List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
-                                Collections.emptyList()))).build());
-        int[] idxes = new int[] { 210, 270 };
-        long[] ids = IntStream.of(idxes).mapToLong(id -> UrlIdCodec.encodeId(id/100, id)).toArray();
-        long[] actual = rsp.results.stream().mapToLong(SearchResultItem::getDocumentId).toArray();
-
-        Assertions.assertArrayEquals(ids, actual);
-    }
-
-    @Test
-    public void testYearQuery() throws Exception {
-        for (int i = 1; i < 512; i++) {
-            loadData(i);
-        }
-
-        indexJournalWriter.close();
-        constructIndex();
-        searchIndex.switchIndex();
-
-        var rsp = queryService.justQuery(
-                SearchSpecification.builder()
-                        .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
-                        .quality(SpecificationLimit.none())
-                        .year(SpecificationLimit.equals(1998))
-                        .size(SpecificationLimit.none())
-                        .rank(SpecificationLimit.none())
-                        .queryStrategy(QueryStrategy.SENTENCE)
-                        .searchSetIdentifier(SearchSetIdentifier.NONE)
-                        .rankingParams(ResultRankingParameters.sensibleDefaults())
-                        .subqueries(List.of(new SearchSubquery(
-                                List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(),
-                                Collections.emptyList()))
-                        ).build());
-
-        Set<Integer> years = new HashSet<>();
-
-        for (var res : rsp.results) {
-            for (var score : res.getKeywordScores()) {
-                years.add(DocumentMetadata.decodeYear(score.encodedDocMetadata()));
-            }
-        }
-
-        assertEquals(Set.of(1998), years);
-        assertEquals(rsp.results.size(), 10);
-    }
-
-    private long fullId(int id) {
-        return UrlIdCodec.encodeId((32 - (id % 32)), id);
-    }
-
     MurmurHash3_128 hasher = new MurmurHash3_128();
-    public void loadData(int id) {
-        int[] factors = IntStream
-                .rangeClosed(1, id)
-                .filter(v -> (id % v) == 0)
-                .toArray();
-        long fullId = fullId(id);
+
+    class MockData {
+        private final Map<Long, List<MockDataKeyword>> allData = new HashMap<>();
+        private final Map<Long, MockDocumentMeta> metaByDoc = new HashMap<>();
 
-        var header = new IndexJournalEntryHeader(factors.length, 0, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
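+
+        // MockData accumulates documents and their keywords; load() then writes
+        // everything through the index journal and constructs a fresh index.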
+        public MockData add(MockDataDocument document,
+                            MockDocumentMeta meta,
+                            MockDataKeyword... words)
+        {
+            long id = UrlIdCodec.encodeId(document.domainId, document.ordinal);
 
-        long[] data = new long[factors.length*2];
-        for (int i = 0; i < factors.length; i++) {
-            data[2*i] = hasher.hashNearlyASCII(Integer.toString(factors[i]));
-            data[2*i + 1] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
+            allData.computeIfAbsent(id, l -> new ArrayList<>()).addAll(List.of(words));
+            metaByDoc.put(id, meta);
+
+            return this;
         }
 
-        indexJournalWriter.put(header, new IndexJournalEntryData(data));
+        void load() throws IOException, SQLException {
+            allData.forEach((doc, words) -> {
+
+                var meta = metaByDoc.get(doc);
+
+                var header = new IndexJournalEntryHeader(
+                        doc,
+                        meta.features,
+                        meta.documentMetadata.encode()
+                );
+
+                long[] dataArray = new long[words.size() * 2];
+                for (int i = 0; i < words.size(); i++) {
+                    dataArray[2*i] = hasher.hashNearlyASCII(words.get(i).keyword);
+                    dataArray[2*i+1] = words.get(i).termMetadata;
+                }
+                var entry = new IndexJournalEntryData(dataArray);
+                indexJournalWriter.put(header, entry);
+            });
+
+            indexJournalWriter.close();
+            constructIndex();
+            searchIndex.switchIndex();
+        }
     }
 
-    public void loadDataWithDomain(int domain, int id) {
-        int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
-        var header = new IndexJournalEntryHeader(factors.length, 0, UrlIdCodec.encodeId(domain, id), DocumentMetadata.defaultValue());
-
-        long[] data = new long[factors.length*2];
-        for (int i = 0; i < factors.length; i++) {
-            data[2*i] = hasher.hashNearlyASCII(Integer.toString(factors[i]));
-            data[2*i + 1] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
+    record MockDataDocument(int domainId, int ordinal) {
+        public long docId() {
+            return UrlIdCodec.encodeId(domainId, ordinal);
         }
-
-        indexJournalWriter.put(header, new IndexJournalEntryData(data));
     }
+    record MockDocumentMeta(int features, DocumentMetadata documentMetadata) {
+        public MockDocumentMeta(int features, long encoded) {
+            this(features, new DocumentMetadata(encoded));
+        }
+    }
+    record MockDataKeyword(String keyword, long termMetadata) {}
+
+    public MockDataKeyword w(String keyword, long termMetadata) { return new MockDataKeyword(keyword, termMetadata); }
+    public MockDataKeyword w(String keyword) { return new MockDataKeyword(keyword, 0L); }
+    public MockDataKeyword w(String keyword, WordFlags flags) { return new MockDataKeyword(keyword, new WordMetadata(0L, EnumSet.of(flags)).encode()); }
 }
diff --git a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java
index 24fbff96..8dabef87 100644
--- a/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java
+++ b/code/services-core/index-service/src/test/java/nu/marginalia/index/svc/IndexQueryServiceIntegrationTestModule.java
@@ -4,7 +4,6 @@ import com.google.inject.AbstractModule;
 import nu.marginalia.db.storage.FileStorageService;
 import nu.marginalia.db.storage.model.FileStorage;
 import nu.marginalia.db.storage.model.FileStorageType;
-import nu.marginalia.index.IndexServicesFactory;
 import nu.marginalia.index.journal.writer.IndexJournalWriter;
 import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl;
 import nu.marginalia.process.control.FakeProcessHeartbeat;
@@ -35,7 +34,7 @@ public class IndexQueryServiceIntegrationTestModule extends AbstractModule {
     Random random = new Random();
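 
+    // This module is shared by the integration tests; the temp directory name
+    // below is just a label.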
     public IndexQueryServiceIntegrationTestModule() throws IOException {
-        workDir = Files.createTempDirectory(IndexQueryServiceIntegrationTest.class.getSimpleName());
+        workDir = Files.createTempDirectory(IndexQueryServiceIntegrationSmokeTest.class.getSimpleName());
 
         slowDir = workDir.resolve("slow");
         fastDir = workDir.resolve("fast");
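
Note on the new Lombok @With on SearchSubquery: it generates one immutable "wither"
per field (withSearchTermsInclude, withSearchTermsExclude, and so on), each returning
a modified copy built through the @AllArgsConstructor. Combined with the new no-args
constructor, callers can assemble subquery variants incrementally. A minimal sketch
(not part of this diff; the method names follow Lombok's with-prefix convention):

    SearchSubquery base = new SearchSubquery();   // all five term lists start empty
    SearchSubquery query = base
            .withSearchTermsInclude(List.of("hello", "world"))
            .withSearchTermsExclude(List.of("my_darling"));
    // base is unchanged; each with-call returns a fresh copy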