Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-24 13:19:02 +00:00
(index) Add more comprehensive integration tests for the index service.
parent 048f685073
commit 764e7d1315
@@ -2,12 +2,14 @@ package nu.marginalia.index.client.model.query;
 
 import lombok.AllArgsConstructor;
 import lombok.Getter;
+import lombok.With;
 
 import java.util.List;
 import java.util.stream.Collectors;
 
 @Getter
 @AllArgsConstructor
+@With
 public class SearchSubquery {
 
     /** These terms must be present in the document and are used in ranking*/
@@ -27,6 +29,14 @@ public class SearchSubquery {
 
     private double value = 0;
 
+    public SearchSubquery() {
+        this.searchTermsInclude = List.of();
+        this.searchTermsExclude = List.of();
+        this.searchTermsAdvice = List.of();
+        this.searchTermsPriority = List.of();
+        this.searchTermCoherences = List.of();
+    }
+
     public SearchSubquery(List<String> searchTermsInclude,
                           List<String> searchTermsExclude,
                           List<String> searchTermsAdvice,
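The two additions above give SearchSubquery a no-argument constructor (all term lists start empty) and Lombok-generated with* copy methods. As a rough sketch of what that enables in test code -- the call site below is illustrative, not part of this commit:

    // Illustrative only: withSearchTermsInclude/withSearchTermsExclude are the
    // immutable copy methods Lombok derives from the new @With annotation.
    SearchSubquery subquery = new SearchSubquery()
            .withSearchTermsInclude(List.of("hello", "world"))
            .withSearchTermsExclude(List.of("goodbye"));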
@@ -185,6 +185,8 @@ public class ReversePreindex {
         Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat");
         Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat");
 
+        // We need total size to request a direct LongArray range. Seems slower, but is faster.
+        // ... see LongArray.directRangeIfPossible(long start, long end)
         long segmentsSize = countDistinctElements(left.wordIds, right.wordIds,
                 0, left.wordIds.size(),
                 0, right.wordIds.size());
@@ -139,7 +139,8 @@ public class IndexMetadataService {
         for (var coherenceSet : coherences.words()) {
             long overlap = 0xFF_FFFF_FFFF_FFFFL;
             for (var word : coherenceSet) {
-                overlap &= WordMetadata.decodePositions(getTermMetadata(word, docId));
+                long positions = WordMetadata.decodePositions(getTermMetadata(word, docId));
+                overlap &= positions;
             }
             if (overlap == 0L) {
                 return false;
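The change above splits the position decode out of the bitwise AND without altering behaviour: a coherence set passes only if the decoded position masks of every word in it share at least one bit. A minimal standalone sketch of that check, using plain longs in place of the project's WordMetadata/getTermMetadata calls:

    long[] positionMasks = { 0b0110L, 0b0100L };   // hypothetical per-word position bitmasks
    long overlap = 0xFF_FFFF_FFFF_FFFFL;
    for (long positions : positionMasks) {
        overlap &= positions;                      // keep only positions present for every word
    }
    boolean coherent = (overlap != 0L);            // true here: both masks contain position 2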
@@ -0,0 +1,299 @@
+package nu.marginalia.index.svc;
+
+import com.google.inject.Guice;
+import com.google.inject.Inject;
+import nu.marginalia.db.storage.FileStorageService;
+import nu.marginalia.db.storage.model.FileStorage;
+import nu.marginalia.db.storage.model.FileStorageType;
+import nu.marginalia.hash.MurmurHash3_128;
+import nu.marginalia.index.ReverseIndexFullFileNames;
+import nu.marginalia.index.ReverseIndexPrioFileNames;
+import nu.marginalia.index.client.model.query.SearchSpecification;
+import nu.marginalia.index.client.model.query.SearchSubquery;
+import nu.marginalia.index.client.model.query.SearchSetIdentifier;
+import nu.marginalia.index.client.model.results.ResultRankingParameters;
+import nu.marginalia.index.client.model.results.SearchResultItem;
+import nu.marginalia.index.construction.DocIdRewriter;
+import nu.marginalia.index.construction.ReverseIndexConstructor;
+import nu.marginalia.index.forward.ForwardIndexConverter;
+import nu.marginalia.index.forward.ForwardIndexFileNames;
+import nu.marginalia.index.index.SearchIndex;
+import nu.marginalia.index.journal.model.IndexJournalEntryData;
+import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
+import nu.marginalia.index.journal.reader.IndexJournalReader;
+import nu.marginalia.index.journal.writer.IndexJournalWriter;
+import nu.marginalia.index.query.limit.QueryLimits;
+import nu.marginalia.index.query.limit.QueryStrategy;
+import nu.marginalia.index.query.limit.SpecificationLimit;
+import nu.marginalia.model.id.UrlIdCodec;
+import nu.marginalia.model.idx.DocumentFlags;
+import nu.marginalia.model.idx.WordFlags;
+import nu.marginalia.model.idx.DocumentMetadata;
+import nu.marginalia.model.idx.WordMetadata;
+import nu.marginalia.process.control.FakeProcessHeartbeat;
+import nu.marginalia.process.control.ProcessHeartbeat;
+import nu.marginalia.ranking.DomainRankings;
+import nu.marginalia.service.control.ServiceHeartbeat;
+import nu.marginalia.service.server.Initialization;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.api.parallel.Execution;
+import spark.Spark;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.sql.SQLException;
+import java.util.*;
+import java.util.function.Function;
+import java.util.stream.IntStream;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD;
+
+@Execution(SAME_THREAD)
+public class IndexQueryServiceIntegrationSmokeTest {
+
+    @Inject
+    Initialization initialization;
+
+    IndexQueryServiceIntegrationTestModule testModule;
+
+    @Inject
+    IndexQueryService queryService;
+    @Inject
+    SearchIndex searchIndex;
+
+    @Inject
+    ServiceHeartbeat heartbeat;
+
+    @Inject
+    IndexJournalWriter indexJournalWriter;
+
+    @Inject
+    FileStorageService fileStorageService;
+
+    @Inject
+    DomainRankings domainRankings;
+
+    @Inject
+    ProcessHeartbeat processHeartbeat;
+
+    @BeforeEach
+    public void setUp() throws IOException {
+
+        testModule = new IndexQueryServiceIntegrationTestModule();
+        Guice.createInjector(testModule).injectMembers(this);
+
+        initialization.setReady();
+    }
+
+    @AfterEach
+    public void tearDown() throws IOException {
+        testModule.cleanUp();
+
+        Spark.stop();
+    }
+
+    @Test
+    public void willItBlend() throws Exception {
+        for (int i = 1; i < 512; i++) {
+            loadData(i);
+        }
+
+        indexJournalWriter.close();
+        constructIndex();
+        searchIndex.switchIndex();
+
+        var rsp = queryService.justQuery(
+                SearchSpecification.builder()
+                        .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
+                        .queryStrategy(QueryStrategy.SENTENCE)
+                        .year(SpecificationLimit.none())
+                        .quality(SpecificationLimit.none())
+                        .size(SpecificationLimit.none())
+                        .rank(SpecificationLimit.none())
+                        .rankingParams(ResultRankingParameters.sensibleDefaults())
+                        .domains(new ArrayList<>())
+                        .searchSetIdentifier(SearchSetIdentifier.NONE)
+                        .subqueries(List.of(new SearchSubquery(
+                                List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
+                                Collections.emptyList()))).build());
+
+        int[] idxes = new int[] { 30, 510, 90, 150, 210, 270, 330, 390, 450 };
+        long[] ids = IntStream.of(idxes).mapToLong(this::fullId).toArray();
+        long[] actual = rsp.results
+                .stream()
+                .mapToLong(SearchResultItem::getDocumentId)
+                .toArray();
+
+        Assertions.assertArrayEquals(ids, actual);
+    }
+
+    @Test
+    public void testDomainQuery() throws Exception {
+        for (int i = 1; i < 512; i++) {
+            loadDataWithDomain(i/100, i);
+        }
+
+        indexJournalWriter.close();
+        constructIndex();
+        searchIndex.switchIndex();
+
+        var rsp = queryService.justQuery(
+                SearchSpecification.builder()
+                        .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
+                        .year(SpecificationLimit.none())
+                        .quality(SpecificationLimit.none())
+                        .size(SpecificationLimit.none())
+                        .rank(SpecificationLimit.none())
+                        .rankingParams(ResultRankingParameters.sensibleDefaults())
+                        .queryStrategy(QueryStrategy.SENTENCE)
+                        .domains(List.of(2))
+                        .subqueries(List.of(new SearchSubquery(
+                                List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
+                                Collections.emptyList()))).build());
+        int[] idxes = new int[] { 210, 270 };
+        long[] ids = IntStream.of(idxes).mapToLong(id -> UrlIdCodec.encodeId(id/100, id)).toArray();
+        long[] actual = rsp.results.stream().mapToLong(SearchResultItem::getDocumentId).toArray();
+
+        Assertions.assertArrayEquals(ids, actual);
+    }
+
+    @Test
+    public void testYearQuery() throws Exception {
+        for (int i = 1; i < 512; i++) {
+            loadData(i);
+        }
+
+        indexJournalWriter.close();
+        constructIndex();
+        searchIndex.switchIndex();
+
+        var rsp = queryService.justQuery(
+                SearchSpecification.builder()
+                        .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
+                        .quality(SpecificationLimit.none())
+                        .year(SpecificationLimit.equals(1998))
+                        .size(SpecificationLimit.none())
+                        .rank(SpecificationLimit.none())
+                        .queryStrategy(QueryStrategy.SENTENCE)
+                        .searchSetIdentifier(SearchSetIdentifier.NONE)
+                        .rankingParams(ResultRankingParameters.sensibleDefaults())
+                        .subqueries(List.of(new SearchSubquery(
+                                List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(),
+                                Collections.emptyList()))
+                        ).build());
+
+        Set<Integer> years = new HashSet<>();
+
+        for (var res : rsp.results) {
+            for (var score : res.getKeywordScores()) {
+                years.add(DocumentMetadata.decodeYear(score.encodedDocMetadata()));
+            }
+        }
+
+        assertEquals(Set.of(1998), years);
+        assertEquals(rsp.results.size(), 10);
+
+    }
+
+    private void constructIndex() throws SQLException, IOException {
+        createForwardIndex();
+        createFullReverseIndex();
+        createPrioReverseIndex();
+    }
+
+    private void createFullReverseIndex() throws SQLException, IOException {
+
+        FileStorage indexLive = fileStorageService.getStorageByType(FileStorageType.INDEX_LIVE);
+        FileStorage indexStaging = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING);
+
+        Path outputFileDocs = ReverseIndexFullFileNames.resolve(indexLive.asPath(), ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.NEXT);
+        Path outputFileWords = ReverseIndexFullFileNames.resolve(indexLive.asPath(), ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.NEXT);
+
+        Path tmpDir = indexStaging.asPath().resolve("tmp");
+        if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
+
+        ReverseIndexConstructor.
+                createReverseIndex(new FakeProcessHeartbeat(), IndexJournalReader::singleFile, indexStaging.asPath(), DocIdRewriter.identity(), tmpDir, outputFileDocs, outputFileWords);
+    }
+
+    private void createPrioReverseIndex() throws SQLException, IOException {
+
+        FileStorage indexLive = fileStorageService.getStorageByType(FileStorageType.INDEX_LIVE);
+        FileStorage indexStaging = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING);
+
+        Path outputFileDocs = ReverseIndexPrioFileNames.resolve(indexLive.asPath(), ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.NEXT);
+        Path outputFileWords = ReverseIndexPrioFileNames.resolve(indexLive.asPath(), ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.NEXT);
+
+        Path tmpDir = indexStaging.asPath().resolve("tmp");
+        if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
+
+        ReverseIndexConstructor.
+                createReverseIndex(new FakeProcessHeartbeat(), IndexJournalReader::singleFile, indexStaging.asPath(), DocIdRewriter.identity(), tmpDir, outputFileDocs, outputFileWords);
+    }
+
+    private void createForwardIndex() throws SQLException, IOException {
+
+        FileStorage indexLive = fileStorageService.getStorageByType(FileStorageType.INDEX_LIVE);
+        FileStorage indexStaging = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING);
+
+        Path outputFileDocsId = ForwardIndexFileNames.resolve(indexLive.asPath(), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT);
+        Path outputFileDocsData = ForwardIndexFileNames.resolve(indexLive.asPath(), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT);
+
+        ForwardIndexConverter converter = new ForwardIndexConverter(processHeartbeat,
+                IndexJournalReader.paging(indexStaging.asPath()),
+                outputFileDocsId,
+                outputFileDocsData,
+                domainRankings
+        );
+
+        converter.convert();
+    }
+
+    private long fullId(int id) {
+        return UrlIdCodec.encodeId((32 - (id % 32)), id);
+    }
+
+    MurmurHash3_128 hasher = new MurmurHash3_128();
+    public void loadData(int id) {
+        int[] factors = IntStream
+                .rangeClosed(1, id)
+                .filter(v -> (id % v) == 0)
+                .toArray();
+
+        long fullId = fullId(id);
+
+        var header = new IndexJournalEntryHeader(factors.length, 0, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
+
+        long[] data = new long[factors.length*2];
+        for (int i = 0; i < factors.length; i++) {
+            data[2*i] = hasher.hashNearlyASCII(Integer.toString(factors[i]));
+            data[2*i + 1] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
+        }
+
+        indexJournalWriter.put(header, new IndexJournalEntryData(data));
+    }
+
+    public void loadDataWithDomain(int domain, int id) {
+        int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
+        var header = new IndexJournalEntryHeader(factors.length, 0, UrlIdCodec.encodeId(domain, id), DocumentMetadata.defaultValue());
+
+        long[] data = new long[factors.length*2];
+        for (int i = 0; i < factors.length; i++) {
+            data[2*i] = hasher.hashNearlyASCII(Integer.toString(factors[i]));
+            data[2*i + 1] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
+        }
+
+        indexJournalWriter.put(header, new IndexJournalEntryData(data));
+    }
+
+}
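Worth spelling out why willItBlend() expects that particular id set: loadData(i) indexes document i with one keyword per factor of i, so a query that includes the terms "3", "5" and "2" and excludes "4" should match exactly the ids divisible by 2, 3 and 5 but not by 4 -- the odd multiples of 30 below 512. An illustrative way to derive the same set (not part of the commit):

    // Illustration only: recomputes the document ids the smoke test expects.
    int[] expected = IntStream.range(1, 512)
            .filter(i -> i % 30 == 0 && i % 4 != 0)
            .toArray();   // 30, 90, 150, 210, 270, 330, 390, 450, 510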
@@ -8,11 +8,10 @@ import nu.marginalia.db.storage.model.FileStorageType;
 import nu.marginalia.hash.MurmurHash3_128;
 import nu.marginalia.index.ReverseIndexFullFileNames;
 import nu.marginalia.index.ReverseIndexPrioFileNames;
+import nu.marginalia.index.client.model.query.SearchSetIdentifier;
 import nu.marginalia.index.client.model.query.SearchSpecification;
 import nu.marginalia.index.client.model.query.SearchSubquery;
-import nu.marginalia.index.client.model.query.SearchSetIdentifier;
 import nu.marginalia.index.client.model.results.ResultRankingParameters;
-import nu.marginalia.index.client.model.results.SearchResultItem;
 import nu.marginalia.index.construction.DocIdRewriter;
 import nu.marginalia.index.construction.ReverseIndexConstructor;
 import nu.marginalia.index.forward.ForwardIndexConverter;
@@ -25,9 +24,11 @@ import nu.marginalia.index.journal.writer.IndexJournalWriter;
 import nu.marginalia.index.query.limit.QueryLimits;
 import nu.marginalia.index.query.limit.QueryStrategy;
 import nu.marginalia.index.query.limit.SpecificationLimit;
+import nu.marginalia.model.crawl.PubDate;
 import nu.marginalia.model.id.UrlIdCodec;
-import nu.marginalia.model.idx.WordFlags;
+import nu.marginalia.model.idx.DocumentFlags;
 import nu.marginalia.model.idx.DocumentMetadata;
+import nu.marginalia.model.idx.WordFlags;
 import nu.marginalia.model.idx.WordMetadata;
 import nu.marginalia.process.control.FakeProcessHeartbeat;
 import nu.marginalia.process.control.ProcessHeartbeat;
@@ -35,18 +36,18 @@ import nu.marginalia.ranking.DomainRankings;
 import nu.marginalia.service.control.ServiceHeartbeat;
 import nu.marginalia.service.server.Initialization;
 import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.Assertions;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.parallel.Execution;
 import spark.Spark;
 
+import javax.annotation.CheckReturnValue;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.sql.SQLException;
 import java.util.*;
-import java.util.stream.IntStream;
+import java.util.function.Function;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD;
@@ -96,38 +97,379 @@ public class IndexQueryServiceIntegrationTest {
     }
 
     @Test
-    public void willItBlend() throws Exception {
-        for (int i = 1; i < 512; i++) {
-            loadData(i);
-        }
-
-        indexJournalWriter.close();
-        constructIndex();
-        searchIndex.switchIndex();
-
-        var rsp = queryService.justQuery(
-                SearchSpecification.builder()
-                        .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
-                        .queryStrategy(QueryStrategy.SENTENCE)
-                        .year(SpecificationLimit.none())
-                        .quality(SpecificationLimit.none())
-                        .size(SpecificationLimit.none())
-                        .rank(SpecificationLimit.none())
-                        .rankingParams(ResultRankingParameters.sensibleDefaults())
-                        .domains(new ArrayList<>())
-                        .searchSetIdentifier(SearchSetIdentifier.NONE)
-                        .subqueries(List.of(new SearchSubquery(
-                                List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
-                                Collections.emptyList()))).build());
-
-        int[] idxes = new int[] { 30, 510, 90, 150, 210, 270, 330, 390, 450 };
-        long[] ids = IntStream.of(idxes).mapToLong(this::fullId).toArray();
-        long[] actual = rsp.results
-                .stream()
-                .mapToLong(SearchResultItem::getDocumentId)
-                .toArray();
-
-        Assertions.assertArrayEquals(ids, actual);
-    }
+    public void testNoPositionsOnlyFlags() throws Exception {
+        // Test the case where positions are absent but flags are present
+
+        new MockData().add( // should be included despite no position
+                d(1, 1),
+                new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))),
+                w("hello", WordFlags.Title),
+                w("world", WordFlags.Title)
+        ).load();
+
+        var query = basicQuery(builder -> builder.subqueries(justInclude("hello", "world")));
+
+        executeSearch(query)
+                .expectDocumentsInOrder(d(1,1));
+    }
+
+    @Test
+    public void testMissingKeywords() throws Exception {
+        // Test cases where the user enters search terms that are missing from the lexicon
+
+        new MockData().add(
+                d(1, 1),
+                new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))),
+                w("hello", WordFlags.Title),
+                w("world", WordFlags.Title)
+        ).load();
+
+        var queryMissingExclude = basicQuery(builder ->
+                builder.subqueries(includeAndExclude("hello", "missing")));
+
+        executeSearch(queryMissingExclude)
+                .expectDocumentsInOrder(d(1,1));
+
+        var queryMissingInclude = basicQuery(builder ->
+                builder.subqueries(justInclude("missing")));
+
+        executeSearch(queryMissingInclude)
+                .expectCount(0);
+
+        var queryMissingPriority = basicQuery(builder ->
+                builder.subqueries(
+                        List.of(
+                                new SearchSubquery(
+                                        List.of("hello"),
+                                        List.of(),
+                                        List.of(),
+                                        List.of("missing"),
+                                        List.of()
+                                )
+                        )));
+
+        executeSearch(queryMissingPriority)
+                .expectCount(1);
+
+        var queryMissingAdvice = basicQuery(builder ->
+                builder.subqueries(
+                        List.of(
+                                new SearchSubquery(
+                                        List.of("hello"),
+                                        List.of(),
+                                        List.of("missing"),
+                                        List.of(),
+                                        List.of()
+                                )
+                        )));
+
+        executeSearch(queryMissingAdvice)
+                .expectCount(0);
+
+        var queryMissingCoherence = basicQuery(builder ->
+                builder.subqueries(
+                        List.of(
+                                new SearchSubquery(
+                                        List.of("hello"),
+                                        List.of(),
+                                        List.of(),
+                                        List.of(),
+                                        List.of(List.of("missing", "hello"))
+                                )
+                        )));
+
+        executeSearch(queryMissingCoherence)
+                .expectCount(0);
+    }
+
+    @Test
+    public void testPositions() throws Exception {
+
+        // Test position rules
+        new MockData()
+                .add( // Case 1: Both words have a position set, should be considered
+                        d(1, 1),
+                        new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))),
+                        w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
+                        w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
+                ).add( // Case 2: Only one of the words have a position set, should not be considered
+                        d(2, 2),
+                        new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))),
+                        w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
+                        w("world", new WordMetadata(0L, EnumSet.noneOf(WordFlags.class)).encode())
+                ).load();
+
+        var query = basicQuery(builder -> builder.subqueries(justInclude("hello", "world")));
+
+        executeSearch(query)
+                .expectDocumentsInOrder(d(1,1));
+    }
+
+    @Test
+    public void testYear() throws Exception {
+
+        // Test year rules
+        new MockData()
+                .add( // Case 1: Document is dated 1999
+                        d(1, 1),
+                        new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(1999), 14, EnumSet.noneOf(DocumentFlags.class))),
+                        w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
+                        w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
+                ).add( // Case 2: Document is dated 2000
+                        d(2, 2),
+                        new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(2000), 14, EnumSet.noneOf(DocumentFlags.class))),
+                        w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
+                        w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
+                )
+                .add( // Case 2: Document is dated 2001
+                        d(3, 3),
+                        new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(2001), 14, EnumSet.noneOf(DocumentFlags.class))),
+                        w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
+                        w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
+                )
+                .load();
+
+        var beforeY2K = basicQuery(builder ->
+                builder.subqueries(justInclude("hello", "world"))
+                        .year(SpecificationLimit.lessThan(2000))
+        );
+        var atY2K = basicQuery(builder ->
+                builder.subqueries(justInclude("hello", "world"))
+                        .year(SpecificationLimit.equals(2000))
+        );
+        var afterY2K = basicQuery(builder ->
+                builder.subqueries(justInclude("hello", "world"))
+                        .year(SpecificationLimit.greaterThan(2000))
+        );
+
+        executeSearch(beforeY2K)
+                .expectDocumentsInOrder(
+                        d(1,1),
+                        d(2,2)
+                );
+        executeSearch(atY2K)
+                .expectDocumentsInOrder(
+                        d(2,2)
+                );
+        executeSearch(afterY2K)
+                .expectDocumentsInOrder(
+                        d(2,2),
+                        d(3,3)
+                );
+    }
+
+    @Test
+    public void testDomain() throws Exception {
+
+        // Test domain filtering
+        new MockData()
+                // docs from domain 1
+                .add(
+                        d(1, 1),
+                        new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(1999), 14, EnumSet.noneOf(DocumentFlags.class))),
+                        w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
+                        w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
+                ).add(
+                        d(1, 2),
+                        new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(2000), 14, EnumSet.noneOf(DocumentFlags.class))),
+                        w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
+                        w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
+                )
+                // docs from domain 2
+                .add(
+                        d(2, 1),
+                        new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(2001), 14, EnumSet.noneOf(DocumentFlags.class))),
+                        w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
+                        w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
+                )
+                .add(
+                        d(2, 2),
+                        new MockDocumentMeta(0, new DocumentMetadata(2, PubDate.toYearByte(2001), 14, EnumSet.noneOf(DocumentFlags.class))),
+                        w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
+                        w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
+                )
+                .load();
+
+        var domain1 = basicQuery(builder ->
+                builder.subqueries(justInclude("hello", "world"))
+                        .domains(List.of(1))
+        );
+        var domain2 = basicQuery(builder ->
+                builder.subqueries(justInclude("hello", "world"))
+                        .domains(List.of(2))
+        );
+
+        executeSearch(domain1)
+                .expectDocumentsInOrder(
+                        d(1,1),
+                        d(1,2)
+                );
+        executeSearch(domain2)
+                .expectDocumentsInOrder(
+                        d(2,1),
+                        d(2,2)
+                );
+    }
+
+    @Test
+    public void testExclude() throws Exception {
+
+        // Test exclude rules
+        new MockData()
+                .add( // Case 1: The required include is present, exclude is absent; should be a result
+                        d(1, 1),
+                        new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))),
+                        w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
+                        w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
+                ).add( // Case 2: The required include is present, excluded term is absent; should not be a result
+                        d(2, 2),
+                        new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))),
+                        w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
+                        w("my_darling", new WordMetadata(0L, EnumSet.noneOf(WordFlags.class)).encode())
+                ).load();
+
+        var query = basicQuery(builder ->
+                builder.subqueries(includeAndExclude("hello", "my_darling"))
+        );
+
+        executeSearch(query)
+                .expectDocumentsInOrder(d(1,1));
+    }
+
+    static class ResultWrapper {
+        private final List<MockDataDocument> actual;
+
+        ResultWrapper(List<MockDataDocument> actual) {
+            this.actual = actual;
+        }
+
+        public ResultWrapper expectDocumentsInOrder(MockDataDocument... expectedDocs) {
+            assertEquals(List.of(expectedDocs), actual);
+
+            return this;
+        }
+        public ResultWrapper expectDocumentInAnyOrder(MockDataDocument... expectedDocs) {
+            assertEquals(Set.of(expectedDocs), new HashSet<>(actual));
+
+            return this;
+        }
+        public ResultWrapper expectCount(int count) {
+            assertEquals(count, actual.size());
+
+            return this;
+        }
+    }
+
+    @CheckReturnValue
+    ResultWrapper executeSearch(SearchSpecification searchSpecification) {
+        var rsp = queryService.justQuery(searchSpecification);
+
+        List<MockDataDocument> actual = new ArrayList<>();
+
+        System.out.println(rsp);
+
+        for (var result : rsp.results) {
+            long docId = result.getDocumentId();
+            actual.add(new MockDataDocument(UrlIdCodec.getDomainId(docId), UrlIdCodec.getDocumentOrdinal(docId)));
+        }
+
+        return new ResultWrapper(actual);
+    }
+
+    @Test
+    public void testCoherenceRequirement() throws Exception {
+
+        // Test coherence requirement.  Two terms are considered coherent when they
+        // appear in the same position
+        new MockData()
+                .add( // Case 1: Both positions overlap; should be included
+                        d(1, 1),
+                        new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))),
+                        w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
+                        w("world", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode())
+                )
+                .add( // Case 2: Positions do not overlap, do not include
+                        d(2, 2),
+                        new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class))),
+                        w("hello", new WordMetadata(1L, EnumSet.noneOf(WordFlags.class)).encode()),
+                        w("world", new WordMetadata(2L, EnumSet.noneOf(WordFlags.class)).encode())
+                )
+                .load();
+
+        var rsp = queryService.justQuery(
+                basicQuery(builder -> builder.subqueries(
+                        // note coherence requriement
+                        includeAndCohere("hello", "world")
+                )));
+
+        assertEquals(1, rsp.results.size());
+        assertEquals(d(1,1).docId(),
+                rsp.results.get(0).getDocumentId());
+    }
+
+    SearchSpecification basicQuery(Function<SearchSpecification.SearchSpecificationBuilder, SearchSpecification.SearchSpecificationBuilder> mutator)
+    {
+        var builder = SearchSpecification.builder()
+                .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
+                .queryStrategy(QueryStrategy.SENTENCE)
+                .year(SpecificationLimit.none())
+                .quality(SpecificationLimit.none())
+                .size(SpecificationLimit.none())
+                .rank(SpecificationLimit.none())
+                .rankingParams(ResultRankingParameters.sensibleDefaults())
+                .domains(new ArrayList<>())
+                .searchSetIdentifier(SearchSetIdentifier.NONE)
+                .subqueries(List.of());
+
+        return mutator.apply(builder).build();
+    }
+
+    List<SearchSubquery> justInclude(String... includes) {
+        return List.of(new SearchSubquery(
+                List.of(includes),
+                List.of(),
+                List.of(),
+                List.of(),
+                List.of()
+        ));
+    }
+
+    List<SearchSubquery> includeAndExclude(List<String> includes, List<String> excludes) {
+        return List.of(new SearchSubquery(
+                includes,
+                excludes,
+                List.of(),
+                List.of(),
+                List.of()
+        ));
+    }
+
+    List<SearchSubquery> includeAndExclude(String include, String exclude) {
+        return List.of(new SearchSubquery(
+                List.of(include),
+                List.of(exclude),
+                List.of(),
+                List.of(),
+                List.of()
+        ));
+    }
+
+    List<SearchSubquery> includeAndCohere(String... includes) {
+        return List.of(new SearchSubquery(
+                List.of(includes),
+                List.of(),
+                List.of(),
+                List.of(),
+                List.of(List.of(includes))
+        ));
+    }
+    private MockDataDocument d(int domainId, int ordinal) {
+        return new MockDataDocument(domainId, ordinal);
+    }
 
     private void constructIndex() throws SQLException, IOException {
@@ -186,110 +528,64 @@ public class IndexQueryServiceIntegrationTest {
         converter.convert();
     }
 
-    @Test
-    public void testDomainQuery() throws Exception {
-        for (int i = 1; i < 512; i++) {
-            loadDataWithDomain(i/100, i);
-        }
-
-        indexJournalWriter.close();
-        constructIndex();
-        searchIndex.switchIndex();
-
-        var rsp = queryService.justQuery(
-                SearchSpecification.builder()
-                        .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
-                        .year(SpecificationLimit.none())
-                        .quality(SpecificationLimit.none())
-                        .size(SpecificationLimit.none())
-                        .rank(SpecificationLimit.none())
-                        .rankingParams(ResultRankingParameters.sensibleDefaults())
-                        .queryStrategy(QueryStrategy.SENTENCE)
-                        .domains(List.of(2))
-                        .subqueries(List.of(new SearchSubquery(
-                                List.of("3", "5", "2"), List.of("4"), Collections.emptyList(), Collections.emptyList(),
-                                Collections.emptyList()))).build());
-        int[] idxes = new int[] { 210, 270 };
-        long[] ids = IntStream.of(idxes).mapToLong(id -> UrlIdCodec.encodeId(id/100, id)).toArray();
-        long[] actual = rsp.results.stream().mapToLong(SearchResultItem::getDocumentId).toArray();
-
-        Assertions.assertArrayEquals(ids, actual);
-    }
-
-    @Test
-    public void testYearQuery() throws Exception {
-        for (int i = 1; i < 512; i++) {
-            loadData(i);
-        }
-
-        indexJournalWriter.close();
-        constructIndex();
-        searchIndex.switchIndex();
-
-        var rsp = queryService.justQuery(
-                SearchSpecification.builder()
-                        .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
-                        .quality(SpecificationLimit.none())
-                        .year(SpecificationLimit.equals(1998))
-                        .size(SpecificationLimit.none())
-                        .rank(SpecificationLimit.none())
-                        .queryStrategy(QueryStrategy.SENTENCE)
-                        .searchSetIdentifier(SearchSetIdentifier.NONE)
-                        .rankingParams(ResultRankingParameters.sensibleDefaults())
-                        .subqueries(List.of(new SearchSubquery(
-                                List.of("4"), Collections.emptyList(), Collections.emptyList(), Collections.emptyList(),
-                                Collections.emptyList()))
-                        ).build());
-
-        Set<Integer> years = new HashSet<>();
-
-        for (var res : rsp.results) {
-            for (var score : res.getKeywordScores()) {
-                years.add(DocumentMetadata.decodeYear(score.encodedDocMetadata()));
-            }
-        }
-
-        assertEquals(Set.of(1998), years);
-        assertEquals(rsp.results.size(), 10);
-
-    }
-
-    private long fullId(int id) {
-        return UrlIdCodec.encodeId((32 - (id % 32)), id);
-    }
-
     MurmurHash3_128 hasher = new MurmurHash3_128();
-    public void loadData(int id) {
-        int[] factors = IntStream
-                .rangeClosed(1, id)
-                .filter(v -> (id % v) == 0)
-                .toArray();
-
-        long fullId = fullId(id);
-
-        var header = new IndexJournalEntryHeader(factors.length, 0, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
-
-        long[] data = new long[factors.length*2];
-        for (int i = 0; i < factors.length; i++) {
-            data[2*i] = hasher.hashNearlyASCII(Integer.toString(factors[i]));
-            data[2*i + 1] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
-        }
-
-        indexJournalWriter.put(header, new IndexJournalEntryData(data));
-    }
-
-    public void loadDataWithDomain(int domain, int id) {
-        int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
-        var header = new IndexJournalEntryHeader(factors.length, 0, UrlIdCodec.encodeId(domain, id), DocumentMetadata.defaultValue());
-
-        long[] data = new long[factors.length*2];
-        for (int i = 0; i < factors.length; i++) {
-            data[2*i] = hasher.hashNearlyASCII(Integer.toString(factors[i]));
-            data[2*i + 1] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
-        }
-
-        indexJournalWriter.put(header, new IndexJournalEntryData(data));
-    }
+    class MockData {
+        private final Map<Long, List<MockDataKeyword>> allData = new HashMap<>();
+        private final Map<Long, MockDocumentMeta> metaByDoc = new HashMap<>();
+
+        public MockData add(MockDataDocument document,
+                            MockDocumentMeta meta,
+                            MockDataKeyword... words)
+        {
+            long id = UrlIdCodec.encodeId(document.domainId, document.ordinal);
+
+            allData.computeIfAbsent(id, l -> new ArrayList<>()).addAll(List.of(words));
+            metaByDoc.put(id, meta);
+
+            return this;
+        }
+
+        void load() throws IOException, SQLException {
+            allData.forEach((doc, words) -> {
+
+                var meta = metaByDoc.get(doc);
+
+                var header = new IndexJournalEntryHeader(
+                        doc,
+                        meta.features,
+                        meta.documentMetadata.encode()
+                );
+
+                long[] dataArray = new long[words.size() * 2];
+                for (int i = 0; i < words.size(); i++) {
+                    dataArray[2*i] = hasher.hashNearlyASCII(words.get(i).keyword);
+                    dataArray[2*i+1] = words.get(i).termMetadata;
+                }
+                var entry = new IndexJournalEntryData(dataArray);
+                indexJournalWriter.put(header, entry);
+            });
+
+            indexJournalWriter.close();
+            constructIndex();
+            searchIndex.switchIndex();
+        }
+    }
+
+    record MockDataDocument(int domainId, int ordinal) {
+        public long docId() {
+            return UrlIdCodec.encodeId(domainId, ordinal);
+        }
+
+    }
+    record MockDocumentMeta(int features, DocumentMetadata documentMetadata) {
+        public MockDocumentMeta(int features, long encoded) {
+            this(features, new DocumentMetadata(encoded));
+        }
+    }
+    record MockDataKeyword(String keyword, long termMetadata) {}
+
+    public MockDataKeyword w(String keyword, long termMetadata) { return new MockDataKeyword(keyword, termMetadata); }
+    public MockDataKeyword w(String keyword) { return new MockDataKeyword(keyword, 0L); }
+    public MockDataKeyword w(String keyword, WordFlags flags) { return new MockDataKeyword(keyword, new WordMetadata(0L, EnumSet.of(flags)).encode()); }
 }
@@ -4,7 +4,6 @@ import com.google.inject.AbstractModule;
 import nu.marginalia.db.storage.FileStorageService;
 import nu.marginalia.db.storage.model.FileStorage;
 import nu.marginalia.db.storage.model.FileStorageType;
-import nu.marginalia.index.IndexServicesFactory;
 import nu.marginalia.index.journal.writer.IndexJournalWriter;
 import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl;
 import nu.marginalia.process.control.FakeProcessHeartbeat;
@@ -35,7 +34,7 @@ public class IndexQueryServiceIntegrationTestModule extends AbstractModule {
     Random random = new Random();
 
     public IndexQueryServiceIntegrationTestModule() throws IOException {
-        workDir = Files.createTempDirectory(IndexQueryServiceIntegrationTest.class.getSimpleName());
+        workDir = Files.createTempDirectory(IndexQueryServiceIntegrationSmokeTest.class.getSimpleName());
         slowDir = workDir.resolve("slow");
         fastDir = workDir.resolve("fast");
 