(index) Add more comprehensive integration tests for the index service.

Viktor Lofgren 2023-08-30 10:37:24 +02:00
parent 048f685073
commit 764e7d1315
6 changed files with 736 additions and 129 deletions

View File: SearchSubquery.java

@ -2,12 +2,14 @@ package nu.marginalia.index.client.model.query;
import lombok.AllArgsConstructor;
import lombok.Getter;
import lombok.With;
import java.util.List;
import java.util.stream.Collectors;
@Getter
@AllArgsConstructor
@With
public class SearchSubquery {
/** These terms must be present in the document and are used in ranking */
@ -27,6 +29,14 @@ public class SearchSubquery {
private double value = 0;
public SearchSubquery() {
this.searchTermsInclude = List.of();
this.searchTermsExclude = List.of();
this.searchTermsAdvice = List.of();
this.searchTermsPriority = List.of();
this.searchTermCoherences = List.of();
}
public SearchSubquery(List<String> searchTermsInclude,
List<String> searchTermsExclude,
List<String> searchTermsAdvice,
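For orientation, the five constructor parameters mirror the fields in declaration order: the include, exclude, advice and priority term lists, plus the coherence sets. A hypothetical construction, purely illustrative:

// a subquery requiring "hello" and "world", rejecting "spam";
// advice, priority and coherence lists left empty
var subquery = new SearchSubquery(
        List.of("hello", "world"), // searchTermsInclude
        List.of("spam"),           // searchTermsExclude
        List.of(),                 // searchTermsAdvice
        List.of(),                 // searchTermsPriority
        List.of());                // searchTermCoherences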

View File: ReversePreindex.java

@ -185,6 +185,8 @@ public class ReversePreindex {
Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat");
Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat");
// We need the total size up front to request a direct LongArray range.
// Counting first seems slower, but is faster in practice.
// ... see LongArray.directRangeIfPossible(long start, long end)
long segmentsSize = countDistinctElements(left.wordIds, right.wordIds,
0, left.wordIds.size(),
0, right.wordIds.size());
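For context, countDistinctElements merges the two sorted word-id ranges and counts distinct values so the merged segment arrays can be sized in a single allocation. A minimal sketch of that counting pass over plain arrays (illustrative only, assuming sorted inputs; the real implementation operates on LongArray ranges):

// count distinct values in the merge of two sorted long arrays
static long countDistinctSorted(long[] left, long[] right) {
    long count = 0;
    long prev = 0;
    int i = 0, j = 0;
    while (i < left.length || j < right.length) {
        long next = (j >= right.length || (i < left.length && left[i] <= right[j]))
                ? left[i++] : right[j++];
        if (count == 0 || next != prev) {
            count++;
            prev = next;
        }
    }
    return count;
}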

View File: IndexMetadataService.java

@ -139,7 +139,8 @@ public class IndexMetadataService {
for (var coherenceSet : coherences.words()) {
long overlap = 0xFF_FFFF_FFFF_FFFFL;
for (var word : coherenceSet) {
long positions = WordMetadata.decodePositions(getTermMetadata(word, docId));
overlap &= positions;
}
if (overlap == 0L) {
return false;
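To make the rule concrete: decodePositions yields a bitmask of the positions a word occurs at, and the accumulator starts as the 56-bit all-ones constant 0xFF_FFFF_FFFF_FFFFL, so a coherence set survives only if every word shares at least one position bit. A worked example with made-up masks:

long hello = 0b0101L; // occurs at positions 0 and 2
long world = 0b0110L; // occurs at positions 1 and 2
long overlap = 0xFF_FFFF_FFFF_FFFFL;
overlap &= hello; // 0b0101
overlap &= world; // 0b0100, non-zero: the set is coherent
// had world been 0b0010 (position 1 only), overlap would be 0
// and the document would be rejected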

View File: IndexQueryServiceIntegrationSmokeTest.java

@ -0,0 +1,299 @@
package nu.marginalia.index.svc;
import com.google.inject.Guice;
import com.google.inject.Inject;
import nu.marginalia.db.storage.FileStorageService;
import nu.marginalia.db.storage.model.FileStorage;
import nu.marginalia.db.storage.model.FileStorageType;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.ReverseIndexFullFileNames;
import nu.marginalia.index.ReverseIndexPrioFileNames;
import nu.marginalia.index.client.model.query.SearchSpecification;
import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
import nu.marginalia.index.client.model.results.ResultRankingParameters;
import nu.marginalia.index.client.model.results.SearchResultItem;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.ReverseIndexConstructor;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.index.SearchIndex;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.ranking.DomainRankings;
import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.service.server.Initialization;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.parallel.Execution;
import spark.Spark;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.*;
import java.util.function.Function;
import java.util.stream.IntStream;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD;
@Execution(SAME_THREAD)
public class IndexQueryServiceIntegrationSmokeTest {
@Inject
Initialization initialization;
IndexQueryServiceIntegrationTestModule testModule;
@Inject
IndexQueryService queryService;
@Inject
SearchIndex searchIndex;
@Inject
ServiceHeartbeat heartbeat;
@Inject
IndexJournalWriter indexJournalWriter;
@Inject
FileStorageService fileStorageService;
@Inject
DomainRankings domainRankings;
@Inject
ProcessHeartbeat processHeartbeat;
@BeforeEach
public void setUp() throws IOException {
testModule = new IndexQueryServiceIntegrationTestModule();
Guice.createInjector(testModule).injectMembers(this);
initialization.setReady();
}
@AfterEach
public void tearDown() throws IOException {
testModule.cleanUp();
Spark.stop();
}
@Test
public void willItBlend() throws Exception {
for (int i = 1; i < 512; i++) {
loadData(i);
}
indexJournalWriter.close();
constructIndex();
searchIndex.switchIndex();
var rsp = queryService.justQuery(
SearchSpecification.builder()
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
.queryStrategy(QueryStrategy.SENTENCE)
.year(SpecificationLimit.none())
.quality(SpecificationLimit.none())
.size(SpecificationLimit.none())
.rank(SpecificationLimit.none())
.rankingParams(ResultRankingParameters.sensibleDefaults())
.domains(new ArrayList<>())
.searchSetIdentifier(SearchSetIdentifier.NONE)
.subqueries(List.of(new SearchSubquery(
List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
Collections.emptyList()))).build());
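// loadData gives each document the factors of its id as keywords,
// so including {3, 5, 2} while excluding {4} should match ids that
// are divisible by 30 but not by 4: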
int[] idxes = new int[] { 30, 510, 90, 150, 210, 270, 330, 390, 450 };
long[] ids = IntStream.of(idxes).mapToLong(this::fullId).toArray();
long[] actual = rsp.results
.stream()
.mapToLong(SearchResultItem::getDocumentId)
.toArray();
Assertions.assertArrayEquals(ids, actual);
}
@Test
public void testDomainQuery() throws Exception {
for (int i = 1; i < 512; i++) {
loadDataWithDomain(i/100, i);
}
indexJournalWriter.close();
constructIndex();
searchIndex.switchIndex();
var rsp = queryService.justQuery(
SearchSpecification.builder()
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
.year(SpecificationLimit.none())
.quality(SpecificationLimit.none())
.size(SpecificationLimit.none())
.rank(SpecificationLimit.none())
.rankingParams(ResultRankingParameters.sensibleDefaults())
.queryStrategy(QueryStrategy.SENTENCE)
.domains(List.of(2))
.subqueries(List.of(new SearchSubquery(
List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
Collections.emptyList()))).build());
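// loadDataWithDomain(i/100, i) places ids 200..299 in domain 2;
// of those, 210 and 270 are the ids divisible by 30 but not by 4: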
int[] idxes = new int[] { 210, 270 };
long[] ids = IntStream.of(idxes).mapToLong(id -> UrlIdCodec.encodeId(id/100, id)).toArray();
long[] actual = rsp.results.stream().mapToLong(SearchResultItem::getDocumentId).toArray();
Assertions.assertArrayEquals(ids, actual);
}
@Test
public void testYearQuery() throws Exception {
for (int i = 1; i < 512; i++) {
loadData(i);
}
indexJournalWriter.close();
constructIndex();
searchIndex.switchIndex();
var rsp = queryService.justQuery(
SearchSpecification.builder()
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
.quality(SpecificationLimit.none())
.year(SpecificationLimit.equals(1998))
.size(SpecificationLimit.none())
.rank(SpecificationLimit.none())
.queryStrategy(QueryStrategy.SENTENCE)
.searchSetIdentifier(SearchSetIdentifier.NONE)
.rankingParams(ResultRankingParameters.sensibleDefaults())
.subqueries(List.of(new SearchSubquery(
List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(),
Collections.emptyList()))
).build());
Set<Integer> years = new HashSet<>();
for (var res : rsp.results) {
for (var score : res.getKeywordScores()) {
years.add(DocumentMetadata.decodeYear(score.encodedDocMetadata()));
}
}
assertEquals(Set.of(1998), years);
assertEquals(10, rsp.results.size());
}
private void constructIndex() throws SQLException, IOException {
createForwardIndex();
createFullReverseIndex();
createPrioReverseIndex();
}
private void createFullReverseIndex() throws SQLException, IOException {
FileStorage indexLive = fileStorageService.getStorageByType(FileStorageType.INDEX_LIVE);
FileStorage indexStaging = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING);
Path outputFileDocs = ReverseIndexFullFileNames.resolve(indexLive.asPath(), ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.NEXT);
Path outputFileWords = ReverseIndexFullFileNames.resolve(indexLive.asPath(), ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.NEXT);
Path tmpDir = indexStaging.asPath().resolve("tmp");
if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
ReverseIndexConstructor.
createReverseIndex(new FakeProcessHeartbeat(), IndexJournalReader::singleFile, indexStaging.asPath(), DocIdRewriter.identity(), tmpDir, outputFileDocs, outputFileWords);
}
private void createPrioReverseIndex() throws SQLException, IOException {
FileStorage indexLive = fileStorageService.getStorageByType(FileStorageType.INDEX_LIVE);
FileStorage indexStaging = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING);
Path outputFileDocs = ReverseIndexPrioFileNames.resolve(indexLive.asPath(), ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.NEXT);
Path outputFileWords = ReverseIndexPrioFileNames.resolve(indexLive.asPath(), ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.NEXT);
Path tmpDir = indexStaging.asPath().resolve("tmp");
if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
ReverseIndexConstructor.
createReverseIndex(new FakeProcessHeartbeat(), IndexJournalReader::singleFile, indexStaging.asPath(), DocIdRewriter.identity(), tmpDir, outputFileDocs, outputFileWords);
}
private void createForwardIndex() throws SQLException, IOException {
FileStorage indexLive = fileStorageService.getStorageByType(FileStorageType.INDEX_LIVE);
FileStorage indexStaging = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING);
Path outputFileDocsId = ForwardIndexFileNames.resolve(indexLive.asPath(), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT);
Path outputFileDocsData = ForwardIndexFileNames.resolve(indexLive.asPath(), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT);
ForwardIndexConverter converter = new ForwardIndexConverter(processHeartbeat,
IndexJournalReader.paging(indexStaging.asPath()),
outputFileDocsId,
outputFileDocsData,
domainRankings
);
converter.convert();
}
private long fullId(int id) {
return UrlIdCodec.encodeId((32 - (id % 32)), id);
}
MurmurHash3_128 hasher = new MurmurHash3_128();
public void loadData(int id) {
int[] factors = IntStream
.rangeClosed(1, id)
.filter(v -> (id % v) == 0)
.toArray();
long fullId = fullId(id);
var header = new IndexJournalEntryHeader(factors.length, 0, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
long[] data = new long[factors.length*2];
for (int i = 0; i < factors.length; i++) {
data[2*i] = hasher.hashNearlyASCII(Integer.toString(factors[i]));
data[2*i + 1] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
}
indexJournalWriter.put(header, new IndexJournalEntryData(data));
}
public void loadDataWithDomain(int domain, int id) {
int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
var header = new IndexJournalEntryHeader(factors.length, 0, UrlIdCodec.encodeId(domain, id), DocumentMetadata.defaultValue());
long[] data = new long[factors.length*2];
for (int i = 0; i < factors.length; i++) {
data[2*i] = hasher.hashNearlyASCII(Integer.toString(factors[i]));
data[2*i + 1] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
}
indexJournalWriter.put(header, new IndexJournalEntryData(data));
}
}

View File: IndexQueryServiceIntegrationTest.java

@ -8,11 +8,10 @@ import nu.marginalia.db.storage.model.FileStorageType;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.ReverseIndexFullFileNames;
import nu.marginalia.index.ReverseIndexPrioFileNames;
import nu.marginalia.index.client.model.query.SearchSetIdentifier;
import nu.marginalia.index.client.model.query.SearchSpecification;
import nu.marginalia.index.client.model.query.SearchSubquery;
import nu.marginalia.index.client.model.results.ResultRankingParameters;
import nu.marginalia.index.client.model.results.SearchResultItem;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.ReverseIndexConstructor;
import nu.marginalia.index.forward.ForwardIndexConverter;
@ -25,9 +24,11 @@ import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeat;
@ -35,18 +36,18 @@ import nu.marginalia.ranking.DomainRankings;
import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.service.server.Initialization;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.parallel.Execution;
import spark.Spark;
import javax.annotation.CheckReturnValue;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.*;
import java.util.stream.IntStream;
import java.util.function.Function;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD;
@ -96,38 +97,379 @@ public class IndexQueryServiceIntegrationTest {
}
@Test
public void testNoPositionsOnlyFlags() throws Exception {
// Test the case where positions are absent but flags are present
new MockData().add( // should be included despite no position
d(1, 1),
new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))),
w("hello", WordFlags.Title),
w("world", WordFlags.Title)
).load();
var query = basicQuery(builder -> builder.subqueries(justInclude("hello", "world")));
executeSearch(query)
.expectDocumentsInOrder(d(1,1));
}
@Test
public void testMissingKeywords() throws Exception {
// Test cases where the user enters search terms that are missing from the lexicon
new MockData().add(
d(1, 1),
new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))),
w("hello", WordFlags.Title),
w("world", WordFlags.Title)
).load();
var queryMissingExclude = basicQuery(builder ->
builder.subqueries(includeAndExclude("hello", "missing")));
executeSearch(queryMissingExclude)
.expectDocumentsInOrder(d(1,1));
var queryMissingInclude = basicQuery(builder ->
builder.subqueries(justInclude("missing")));
executeSearch(queryMissingInclude)
.expectCount(0);
var queryMissingPriority = basicQuery(builder ->
builder.subqueries(
List.of(
new SearchSubquery(
List.of("hello"),
List.of(),
List.of(),
List.of("missing"),
List.of()
)
)));
executeSearch(queryMissingPriority)
.expectCount(1);
var queryMissingAdvice = basicQuery(builder ->
builder.subqueries(
List.of(
new SearchSubquery(
List.of("hello"),
List.of(),
List.of("missing"),
List.of(),
List.of()
)
)));
executeSearch(queryMissingAdvice)
.expectCount(0);
var queryMissingCoherence = basicQuery(builder ->
builder.subqueries(
List.of(
new SearchSubquery(
List.of("hello"),
List.of(),
List.of(),
List.of(),
List.of(List.of("missing", "hello"))
)
)));
executeSearch(queryMissingCoherence)
.expectCount(0);
}
@Test
public void testPositions() throws Exception {
// Test position rules
new MockData()
.add( // Case 1: Both words have a position set, should be considered
d(1, 1),
new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))),
w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
).add( // Case 2: Only one of the words has a position set; should not be considered
d(2, 2),
new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))),
w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
w("world", new WordMetadata(0L, EnumSet.noneOf(WordFlags.class)).encode())
).load();
var query = basicQuery(builder -> builder.subqueries(justInclude("hello", "world")));
executeSearch(query)
.expectDocumentsInOrder(d(1,1));
}
@Test
public void testYear() throws Exception {
// Test year rules
new MockData()
.add( // Case 1: Document is dated 1999
d(1, 1),
new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(1999), 14, EnumSet.noneOf(DocumentFlags.class))),
w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
).add( // Case 2: Document is dated 2000
d(2, 2),
new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(2000), 14, EnumSet.noneOf(DocumentFlags.class))),
w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
)
.add( // Case 3: Document is dated 2001
d(3, 3),
new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(2001), 14, EnumSet.noneOf(DocumentFlags.class))),
w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
)
.load();
var beforeY2K = basicQuery(builder ->
builder.subqueries(justInclude("hello", "world"))
.year(SpecificationLimit.lessThan(2000))
);
var atY2K = basicQuery(builder ->
builder.subqueries(justInclude("hello", "world"))
.year(SpecificationLimit.equals(2000))
);
var afterY2K = basicQuery(builder ->
builder.subqueries(justInclude("hello", "world"))
.year(SpecificationLimit.greaterThan(2000))
);
executeSearch(beforeY2K)
.expectDocumentsInOrder(
d(1,1),
d(2,2)
);
executeSearch(atY2K)
.expectDocumentsInOrder(
d(2,2)
);
executeSearch(afterY2K)
.expectDocumentsInOrder(
d(2,2),
d(3,3)
);
}
@Test
public void testDomain() throws Exception {
// Test domain filtering
new MockData()
// docs from domain 1
.add(
d(1, 1),
new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(1999), 14, EnumSet.noneOf(DocumentFlags.class))),
w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
).add(
d(1, 2),
new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(2000), 14, EnumSet.noneOf(DocumentFlags.class))),
w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
)
// docs from domain 2
.add(
d(2, 1),
new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(2001), 14, EnumSet.noneOf(DocumentFlags.class))),
w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
)
.add(
d(2, 2),
new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(2001), 14, EnumSet.noneOf(DocumentFlags.class))),
w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
)
.load();
var domain1 = basicQuery(builder ->
builder.subqueries(justInclude("hello", "world"))
.domains(List.of(1))
);
var domain2 = basicQuery(builder ->
builder.subqueries(justInclude("hello", "world"))
.domains(List.of(2))
);
executeSearch(domain1)
.expectDocumentsInOrder(
d(1,1),
d(1,2)
);
executeSearch(domain2)
.expectDocumentsInOrder(
d(2,1),
d(2,2)
);
}
@Test
public void testExclude() throws Exception {
// Test exclude rules
new MockData()
.add( // Case 1: The required include is present, the excluded term is absent; should be a result
d(1, 1),
new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))),
w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
).add( // Case 2: The required include is present, but the excluded term is also present; should not be a result
d(2, 2),
new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))),
w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
w("my_darling", new WordMetadata(0L, EnumSet.noneOf(WordFlags.class)).encode())
).load();
var query = basicQuery(builder ->
builder.subqueries(includeAndExclude("hello", "my_darling"))
);
executeSearch(query)
.expectDocumentsInOrder(d(1,1));
}
static class ResultWrapper {
private final List<MockDataDocument> actual;
ResultWrapper(List<MockDataDocument> actual) {
this.actual = actual;
}
public ResultWrapper expectDocumentsInOrder(MockDataDocument... expectedDocs) {
assertEquals(List.of(expectedDocs), actual);
return this;
}
public ResultWrapper expectDocumentInAnyOrder(MockDataDocument... expectedDocs) {
assertEquals(Set.of(expectedDocs), new HashSet<>(actual));
return this;
}
public ResultWrapper expectCount(int count) {
assertEquals(count, actual.size());
return this;
}
}
@CheckReturnValue
ResultWrapper executeSearch(SearchSpecification searchSpecification) {
var rsp = queryService.justQuery(searchSpecification);
List<MockDataDocument> actual = new ArrayList<>();
System.out.println(rsp);
for (var result : rsp.results) {
long docId = result.getDocumentId();
actual.add(new MockDataDocument(UrlIdCodec.getDomainId(docId), UrlIdCodec.getDocumentOrdinal(docId)));
}
return new ResultWrapper(actual);
}
@Test
public void testCoherenceRequirement() throws Exception {
// Test coherence requirement. Two terms are considered coherent when they
// appear in the same position
new MockData()
.add( // Case 1: Both positions overlap; should be included
d(1, 1),
new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))),
w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
)
.add( // Case 2: Positions do not overlap; should not be included
d(2, 2),
new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))),
w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
w("world", new WordMetadata(2L, EnumSet.noneOf(WordFlags.class)).encode())
)
.load();
var rsp = queryService.justQuery(
basicQuery(builder -> builder.subqueries(
// note coherence requirement
includeAndCohere("hello", "world")
)));
assertEquals(1, rsp.results.size());
assertEquals(d(1,1).docId(),
rsp.results.get(0).getDocumentId());
}
SearchSpecification basicQuery(Function<SearchSpecification.SearchSpecificationBuilder, SearchSpecification.SearchSpecificationBuilder> mutator)
{
var builder = SearchSpecification.builder()
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
.queryStrategy(QueryStrategy.SENTENCE)
.year(SpecificationLimit.none())
.quality(SpecificationLimit.none())
.size(SpecificationLimit.none())
.rank(SpecificationLimit.none())
.rankingParams(ResultRankingParameters.sensibleDefaults())
.domains(new ArrayList<>())
.searchSetIdentifier(SearchSetIdentifier.NONE)
.subqueries(List.of());
return mutator.apply(builder).build();
}
List<SearchSubquery> justInclude(String... includes) {
return List.of(new SearchSubquery(
List.of(includes),
List.of(),
List.of(),
List.of(),
List.of()
));
}
List<SearchSubquery> includeAndExclude(List<String> includes, List<String> excludes) {
return List.of(new SearchSubquery(
includes,
excludes,
List.of(),
List.of(),
List.of()
));
}
List<SearchSubquery> includeAndExclude(String include, String exclude) {
return List.of(new SearchSubquery(
List.of(include),
List.of(exclude),
List.of(),
List.of(),
List.of()
));
}
List<SearchSubquery> includeAndCohere(String... includes) {
return List.of(new SearchSubquery(
List.of(includes),
List.of(),
List.of(),
List.of(),
List.of(List.of(includes))
));
}
private MockDataDocument d(int domainId, int ordinal) {
return new MockDataDocument(domainId, ordinal);
}
private void constructIndex() throws SQLException, IOException {
@ -186,110 +528,64 @@ public class IndexQueryServiceIntegrationTest {
converter.convert();
}
MurmurHash3_128 hasher = new MurmurHash3_128();
class MockData {
private final Map<Long, List<MockDataKeyword>> allData = new HashMap<>();
private final Map<Long, MockDocumentMeta> metaByDoc = new HashMap<>();
public MockData add(MockDataDocument document,
MockDocumentMeta meta,
MockDataKeyword... words)
{
long id = UrlIdCodec.encodeId(document.domainId, document.ordinal);
allData.computeIfAbsent(id, l -> new ArrayList<>()).addAll(List.of(words));
metaByDoc.put(id, meta);
return this;
}
void load() throws IOException, SQLException {
allData.forEach((doc, words) -> {
var meta = metaByDoc.get(doc);
var header = new IndexJournalEntryHeader(
doc,
meta.features,
meta.documentMetadata.encode()
);
long[] dataArray = new long[words.size() * 2];
for (int i = 0; i < words.size(); i++) {
dataArray[2*i] = hasher.hashNearlyASCII(words.get(i).keyword);
dataArray[2*i+1] = words.get(i).termMetadata;
}
var entry = new IndexJournalEntryData(dataArray);
indexJournalWriter.put(header, entry);
});
indexJournalWriter.close();
constructIndex();
searchIndex.switchIndex();
}
}
record MockDataDocument(int domainId, int ordinal) {
public long docId() {
return UrlIdCodec.encodeId(domainId, ordinal);
}
}
record MockDocumentMeta(int features, DocumentMetadata documentMetadata) {
public MockDocumentMeta(int features, long encoded) {
this(features, new DocumentMetadata(encoded));
}
}
record MockDataKeyword(String keyword, long termMetadata) {}
public MockDataKeyword w(String keyword, long termMetadata) { return new MockDataKeyword(keyword, termMetadata); }
public MockDataKeyword w(String keyword) { return new MockDataKeyword(keyword, 0L); }
public MockDataKeyword w(String keyword, WordFlags flags) { return new MockDataKeyword(keyword, new WordMetadata(0L, EnumSet.of(flags)).encode()); }
}

View File: IndexQueryServiceIntegrationTestModule.java

@ -4,7 +4,6 @@ import com.google.inject.AbstractModule;
import nu.marginalia.db.storage.FileStorageService;
import nu.marginalia.db.storage.model.FileStorage;
import nu.marginalia.db.storage.model.FileStorageType;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl;
import nu.marginalia.process.control.FakeProcessHeartbeat;
@ -35,7 +34,7 @@ public class IndexQueryServiceIntegrationTestModule extends AbstractModule {
Random random = new Random();
public IndexQueryServiceIntegrationTestModule() throws IOException {
workDir = Files.createTempDirectory(IndexQueryServiceIntegrationSmokeTest.class.getSimpleName());
slowDir = workDir.resolve("slow");
fastDir = workDir.resolve("fast");