// MarginaliaSearch/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java
//
// NOTE: the test fixture text below intentionally contains "ambiguous" Unicode
// characters (curly quotation marks); they are part of the crawled-document
// sample data and must not be normalized.
package nu.marginalia;
import com.google.inject.Guice;
import com.google.inject.Inject;
import nu.marginalia.api.searchquery.QueryProtobufCodec;
import nu.marginalia.api.searchquery.RpcQsQuery;
import nu.marginalia.api.searchquery.RpcQueryLimits;
import nu.marginalia.api.searchquery.RpcResultRankingParameters;
import nu.marginalia.api.searchquery.model.results.PrototypeRankingParameters;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.converting.writer.ConverterBatchWriter;
import nu.marginalia.crawl.fetcher.ContentTags;
import nu.marginalia.crawl.fetcher.Cookies;
import nu.marginalia.crawl.fetcher.HttpFetcherImpl;
import nu.marginalia.crawl.fetcher.warc.WarcRecorder;
import nu.marginalia.functions.searchquery.QueryFactory;
import nu.marginalia.index.IndexGrpcService;
import nu.marginalia.index.ReverseIndexFullFileNames;
import nu.marginalia.index.ReverseIndexPrioFileNames;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.searchset.SearchSetAny;
import nu.marginalia.io.CrawledDomainReader;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.linkdb.docs.DocumentDbWriter;
import nu.marginalia.loading.LoaderIndexJournalWriter;
import nu.marginalia.loading.LoaderInputData;
import nu.marginalia.loading.documents.DocumentLoaderService;
import nu.marginalia.loading.documents.KeywordLoaderService;
import nu.marginalia.loading.domains.CachingDomainIdRegistry;
import nu.marginalia.loading.domains.DomainIdRegistry;
import nu.marginalia.loading.links.DomainLinksLoaderService;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.parquet.crawldata.CrawledDocumentParquetRecordFileWriter;
import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.test.IntegrationTestModule;
import nu.marginalia.test.TestUtil;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.List;
import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.when;
/**
 * End-to-end integration test that drives the full Marginalia pipeline:
 * crawl capture (WARC) &rarr; parquet conversion &rarr; document processing &rarr;
 * batch writing &rarr; loading &rarr; index construction &rarr; index switch &rarr; query.
 *
 * All collaborators are injected via Guice using {@link IntegrationTestModule},
 * which is also responsible for provisioning the temporary file storage that
 * the test writes into (cleaned up in {@link #tearDownTest()}).
 */
public class IntegrationTest {
    IntegrationTestModule testModule;

    @Inject
    DomainProcessor domainProcessor;
    @Inject
    DomainLinksLoaderService linksService;
    @Inject
    KeywordLoaderService keywordLoaderService;
    @Inject
    DocumentLoaderService documentLoaderService;
    @Inject
    FileStorageService fileStorageService;
    @Inject
    DomainRankings domainRankings;
    @Inject
    DocumentDbWriter documentDbWriter;
    @Inject
    LoaderIndexJournalWriter journalWriter;

    // Per-test temporary artifacts: the captured crawl (WARC), its parquet
    // conversion, and the directory the converter writes processed data into.
    Path warcData = null;
    Path crawlDataParquet = null;
    Path processedDataDir = null;

    @Inject
    StatefulIndex statefulIndex;
    @Inject
    IndexGrpcService indexGrpcService;
    @Inject
    DocumentDbReader documentDbReader;
    @Inject
    QueryFactory queryFactory;

    /**
     * Creates a fresh injector and temp files/directories before each test.
     *
     * @throws IOException if the temp files cannot be created
     */
    @BeforeEach
    public void setupTest() throws IOException {
        testModule = new IntegrationTestModule();
        Guice.createInjector(testModule).injectMembers(this);
        warcData = Files.createTempFile("warc", ".warc.gz");
        crawlDataParquet = Files.createTempFile("crawl", ".parquet");
        processedDataDir = Files.createTempDirectory("processed");
    }

    /**
     * Removes all temp artifacts and lets the test module tear down its storage.
     *
     * @throws IOException if cleanup of the temp files fails
     */
    @AfterEach
    public void tearDownTest() throws IOException {
        Files.deleteIfExists(warcData);
        Files.deleteIfExists(crawlDataParquet);
        TestUtil.clearTempDir(processedDataDir);
        testModule.cleanUp();
    }

    /**
     * Runs the whole pipeline for a single synthetic domain
     * (www.example.com with one hand-written HTML page), then issues a
     * phrase query against the constructed index and prints the results.
     *
     * The steps are strictly ordered; each stage consumes the previous
     * stage's on-disk output.
     *
     * @throws Exception on any pipeline failure (the test then fails)
     */
    @Test
    public void run() throws Exception {
        /** CREATE WARC */
        // Record a fake fetch of a single page into the WARC file.
        try (WarcRecorder warcRecorder = new WarcRecorder(warcData, new Cookies())) {
            warcRecorder.writeWarcinfoHeader("127.0.0.1", new EdgeDomain("www.example.com"),
                    new HttpFetcherImpl.DomainProbeResult.Ok(new EdgeUrl("https://www.example.com/")));
            warcRecorder.writeReferenceCopy(new EdgeUrl("https://www.example.com/"),
                    "text/html", 200,
                    """
                    <html>
                    <h1>Hello World</h1>
                    <body>
                    <p>The best description of my problem solving process is the Feynman algorithm, which is sometimes presented as a joke where the hidden subtext is “be smart”, but I disagree. The “algorithm” is a surprisingly lucid description of how thinking works in the context of hard problems where the answer cant simply be looked up or trivially broken down, iterated upon in a bottom-up fashion, or approached with similar methods.
                    The trick is that there is no trick. This is how thinking works. It appears that when you feed your brain related information, without further active involvement, it starts to digest the information youve fed it.
                    </p>
                    </body>
                    </html>
                    """,
                    "",
                    ContentTags.empty()
            );
        }

        /** CONVERT WARC */
        // Convert the raw WARC capture into the parquet crawl-data format.
        CrawledDocumentParquetRecordFileWriter.convertWarc(
                "www.example.com",
                new UserAgent("search.marginalia.nu",
                        "search.marginalia.nu"),
                warcData,
                crawlDataParquet);

        /** PROCESS CRAWL DATA */
        var processedDomain = domainProcessor.fullProcessing(CrawledDomainReader.createDataStream(crawlDataParquet));
        System.out.println(processedDomain);

        /** WRITE PROCESSED DATA */
        try (ConverterBatchWriter cbw = new ConverterBatchWriter(processedDataDir, 0)) {
            cbw.writeProcessedDomain(processedDomain);
        }
        // Write a single batch-switch marker in the process log so that the loader will read the data
        Files.writeString(processedDataDir.resolve("processor.log"), "F\n", StandardOpenOption.CREATE_NEW);

        /** LOAD PROCESSED DATA */
        LoaderInputData inputData = new LoaderInputData(List.of(processedDataDir));

        // Mocked registry: every domain resolves to id 1, sidestepping the database.
        DomainIdRegistry domainIdRegistry = Mockito.mock(CachingDomainIdRegistry.class);
        when(domainIdRegistry.getDomainId(any())).thenReturn(1);

        linksService.loadLinks(domainIdRegistry, new FakeProcessHeartbeat(), inputData);
        keywordLoaderService.loadKeywords(domainIdRegistry, new FakeProcessHeartbeat(), inputData);
        documentLoaderService.loadDocuments(domainIdRegistry, new FakeProcessHeartbeat(), inputData);

        // These must be closed to finalize the associated files
        documentDbWriter.close();
        keywordLoaderService.close();

        /** CONSTRUCT INDEX */
        createForwardIndex();
        createFullReverseIndex();
        createPrioReverseIndex();

        /** SWITCH INDEX */
        // Promote the freshly constructed index files to the live index.
        statefulIndex.switchIndex();

        // Move the docdb file to the live location
        Files.move(
                IndexLocations.getLinkdbWritePath(fileStorageService).resolve(DOCDB_FILE_NAME),
                IndexLocations.getLinkdbLivePath(fileStorageService).resolve(DOCDB_FILE_NAME)
        );
        // Reconnect the document reader to the new docdb file
        documentDbReader.reconnect();

        /** QUERY */
        // Phrase query for text present in the fixture document above.
        var request = RpcQsQuery.newBuilder()
                .setQueryLimits(RpcQueryLimits.newBuilder()
                        .setTimeoutMs(1000)
                        .setResultsTotal(100)
                        .setResultsByDomain(10)
                        .setFetchSize(1000)
                        .build())
                .setQueryStrategy("AUTO")
                .setHumanQuery("\"is that there is\"")
                .build();
        var params = QueryProtobufCodec.convertRequest(request);
        var p = RpcResultRankingParameters.newBuilder(PrototypeRankingParameters.sensibleDefaults()).setExportDebugData(true).build();
        var query = queryFactory.createQuery(params, p);
        var indexRequest = QueryProtobufCodec.convertQuery(request, query);
        System.out.println(indexRequest);
        var rs = indexGrpcService.executeSearch(new SearchParameters(indexRequest, new SearchSetAny()));
        System.out.println(rs);
    }

    /**
     * Builds the full reverse index (docs, words, positions files) from the
     * index journal in the construction area, writing NEXT-version files
     * into the current index directory.
     *
     * @throws IOException if index construction fails
     */
    private void createFullReverseIndex() throws IOException {
        Path outputFileDocs = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.NEXT);
        Path outputFileWords = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.NEXT);
        Path outputFilePositions = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.NEXT);

        Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
        Path tmpDir = workDir.resolve("tmp");
        if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);

        var constructor = new FullIndexConstructor(
                outputFileDocs,
                outputFileWords,
                outputFilePositions,
                this::addRankToIdEncoding,
                tmpDir);
        constructor.createReverseIndex(new FakeProcessHeartbeat(), "createReverseIndexFull", workDir);
    }

    /**
     * Builds the priority reverse index (docs, words files) from the
     * index journal in the construction area.
     *
     * @throws IOException if index construction fails
     */
    private void createPrioReverseIndex() throws IOException {
        Path outputFileDocs = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.NEXT);
        Path outputFileWords = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.NEXT);

        Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
        Path tmpDir = workDir.resolve("tmp");

        var constructor = new PrioIndexConstructor(
                outputFileDocs,
                outputFileWords,
                this::addRankToIdEncoding,
                tmpDir);
        constructor.createReverseIndex(new FakeProcessHeartbeat(), "createReverseIndexPrio", workDir);
    }

    /**
     * Builds the forward index (doc-id, doc-data, spans files) from the
     * newest index journal found in the construction area.
     *
     * @throws IOException if no journal is found or conversion fails
     */
    private void createForwardIndex() throws IOException {
        Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
        Path outputFileDocsId = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT);
        Path outputFileSpansData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.SPANS_DATA, ForwardIndexFileNames.FileVersion.NEXT);
        Path outputFileDocsData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT);

        ForwardIndexConverter converter = new ForwardIndexConverter(new FakeProcessHeartbeat(),
                outputFileDocsId,
                outputFileDocsData,
                outputFileSpansData,
                IndexJournal.findJournal(workDir).orElseThrow(),
                domainRankings
        );
        converter.convert();
    }

    /**
     * Doc-id translation used during index construction: stamps the fixed
     * rank value 255 onto the document id via {@link UrlIdCodec#addRank}.
     *
     * @param docId the raw document id from the journal
     * @return the document id with rank 255 encoded
     */
    private long addRankToIdEncoding(long docId) {
        return UrlIdCodec.addRank(
                255,
                docId);
    }
}