Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git, synced 2025-02-22 20:48:59 +00:00

Commit dae22ccbe0: (test) Integration test from crawl->query
Parent: 9d00243d7f
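
In summary, the commit adds an end-to-end integration test that drives the whole pipeline (crawl to WARC, convert, load, index construction, index switch, query) against a single synthetic domain, plus the small production changes the test needs: FakeProcessHeartbeat gains progress logging, IndexGrpcService.executeSearch becomes accessible to tests, KeywordLoaderService gains a close() method, and DomainLoaderService accepts the ProcessHeartbeat interface instead of the concrete implementation.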
FakeProcessHeartbeat.java
@@ -1,13 +1,18 @@
 package nu.marginalia.process.control;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /** Dummy implementation of ProcessHeartbeat that does nothing */
 public class FakeProcessHeartbeat implements ProcessHeartbeat {
 
+    private static final Logger logger = LoggerFactory.getLogger(FakeProcessHeartbeat.class);
     @Override
     public <T extends Enum<T>> ProcessTaskHeartbeat<T> createProcessTaskHeartbeat(Class<T> steps, String processName) {
         return new ProcessTaskHeartbeat<>() {
             @Override
-            public void progress(T step) {}
+            public void progress(T step) {
+                logger.info("Progress: {}", step);
+            }
 
             @Override
             public void shutDown() {}
@@ -21,7 +26,9 @@ public class FakeProcessHeartbeat implements ProcessHeartbeat {
     public ProcessAdHocTaskHeartbeat createAdHocTaskHeartbeat(String processName) {
         return new ProcessAdHocTaskHeartbeat() {
             @Override
-            public void progress(String step, int progress, int total) {}
+            public void progress(String step, int progress, int total) {
+                logger.info("Progress: {}, {}/{}", step, progress, total);
+            }
 
             @Override
             public void close() {}
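
With logging in place, the fake heartbeat makes test runs traceable instead of silent. A minimal usage sketch, using only the methods shown above (the Steps enum and process name are hypothetical):

    // Sketch: driving a task heartbeat the way a process would
    enum Steps { CRAWL, CONVERT, LOAD }

    void runWithHeartbeat(ProcessHeartbeat heartbeat) {
        var task = heartbeat.createProcessTaskHeartbeat(Steps.class, "demo-process");
        task.progress(Steps.CRAWL);    // FakeProcessHeartbeat now logs "Progress: CRAWL"
        task.progress(Steps.CONVERT);
        task.progress(Steps.LOAD);
        task.shutDown();
    }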
DocumentKeywordsBuilder.java
@@ -124,7 +124,9 @@ public class DocumentKeywordsBuilder {
     @Override
     public String toString() {
         StringBuilder sb = new StringBuilder("[ ");
-        wordToMeta.forEach((word, meta) -> sb.append(word).append("->").append(new WordMetadata(meta)).append(' '));
+        wordToMeta.forEach((word, meta) -> {
+            sb.append(word).append("->").append(new WordMetadata(meta).flagSet()).append(',').append(wordToPos.getOrDefault(word, new IntArrayList())).append(' ');
+        });
         return sb.append(']').toString();
     }
 
QueryFactoryTest.java
@@ -212,12 +212,21 @@ public class QueryFactoryTest {
         var subquery = parseAndGetSpecs("The");
         System.out.println("Time: " + (System.currentTimeMillis() - start));
         System.out.println(subquery);
-    } @Test
+    }
+
+    @Test
     public void testExpansion6() {
         long start = System.currentTimeMillis();
         var subquery = parseAndGetSpecs("burning the nerves in the neck");
         System.out.println("Time: " + (System.currentTimeMillis() - start));
         System.out.println(subquery);
     }
+
+    @Test
+    public void testExpansion7() {
+        long start = System.currentTimeMillis();
+        var subquery = parseAndGetSpecs("amazing work being done");
+        System.out.println("Time: " + (System.currentTimeMillis() - start));
+        System.out.println(subquery);
+    }
 }
IndexJournalReaderSingleFile.java
@@ -31,10 +31,10 @@ public class IndexJournalReaderSingleFile implements IndexJournalReader {
 
     private static IndexJournalFileHeader readHeader(Path file) throws IOException {
         try (var raf = new RandomAccessFile(file.toFile(), "r")) {
+            long recordCount = raf.readLong();
             long unused = raf.readLong();
-            long wordCount = raf.readLong();
 
-            return new IndexJournalFileHeader(unused, wordCount);
+            return new IndexJournalFileHeader(recordCount, unused);
         }
     }
 
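
The relabeling corrects which header field is which: the first long in the journal file is the record count, and the second is reserved. As a comment sketch of the layout the corrected code assumes (the 16-byte total is an inference from the two longs, not stated in the diff):

    // Index journal header, as readHeader() now interprets it:
    //   bytes 0..7   long recordCount   number of records in the journal
    //   bytes 8..15  long unused        reserved
    // FILE_HEADER_SIZE_BYTES (see IndexJournalReader) would accordingly be 2 * Long.BYTES.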
IndexGrpcService.java
@@ -205,7 +205,8 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
         return searchSetsService.getSearchSetByName(request.getSearchSetIdentifier());
     }
 
-    private SearchResultSet executeSearch(SearchParameters params) throws SQLException, InterruptedException {
+    // accessible for tests
+    public SearchResultSet executeSearch(SearchParameters params) throws SQLException, InterruptedException {
 
         if (!statefulIndex.isLoaded()) {
             // Short-circuit if the index is not loaded, as we trivially know that there can be no results
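
Relaxing the visibility lets a test drive a search without standing up a gRPC channel, which is exactly how the IntegrationTest further down in this diff uses it:

    // In a test, with an injected IndexGrpcService:
    SearchResultSet rs = indexGrpcService.executeSearch(params);   // direct call, no gRPC stubs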
ConverterBatchWriter.java
@@ -1,6 +1,5 @@
 package nu.marginalia.converting.writer;
 
-import gnu.trove.list.TLongList;
 import gnu.trove.list.array.TLongArrayList;
 import lombok.SneakyThrows;
 import nu.marginalia.converting.model.ProcessedDocument;
@@ -103,8 +102,6 @@ public class ConverterBatchWriter implements AutoCloseable, ConverterBatchWriter
 
         String domainName = domain.toString();
 
-        ByteBuffer workArea = ByteBuffer.allocate(1024);
-
         while (documentIterator.hasNext()) {
             var document = documentIterator.next();
             if (document.details == null) {
LoaderMain.java
@@ -45,7 +45,6 @@ public class LoaderMain extends ProcessMainClass {
     private final MessageQueueFactory messageQueueFactory;
     private final FileStorageService fileStorageService;
     private final DocumentDbWriter documentDbWriter;
-    private final LoaderIndexJournalWriter journalWriter;
     private final DomainLoaderService domainService;
     private final DomainLinksLoaderService linksService;
     private final KeywordLoaderService keywordLoaderService;
@@ -79,7 +78,6 @@ public class LoaderMain extends ProcessMainClass {
                       MessageQueueFactory messageQueueFactory,
                       FileStorageService fileStorageService,
                       DocumentDbWriter documentDbWriter,
-                      LoaderIndexJournalWriter journalWriter,
                       DomainLoaderService domainService,
                       DomainLinksLoaderService linksService,
                       KeywordLoaderService keywordLoaderService,
@@ -92,7 +90,6 @@ public class LoaderMain extends ProcessMainClass {
        this.messageQueueFactory = messageQueueFactory;
        this.fileStorageService = fileStorageService;
        this.documentDbWriter = documentDbWriter;
-       this.journalWriter = journalWriter;
        this.domainService = domainService;
        this.linksService = linksService;
        this.keywordLoaderService = keywordLoaderService;
@@ -132,7 +129,7 @@ public class LoaderMain extends ProcessMainClass {
            logger.error("Error", ex);
        }
        finally {
-           journalWriter.close();
+           keywordLoaderService.close();
            documentDbWriter.close();
            heartbeat.shutDown();
        }
 
KeywordLoaderService.java
@@ -11,7 +11,6 @@ import nu.marginalia.model.id.UrlIdCodec;
 import nu.marginalia.model.processed.DocumentRecordKeywordsProjection;
 import nu.marginalia.process.control.ProcessHeartbeat;
 import nu.marginalia.sequence.GammaCodedSequence;
 import org.roaringbitmap.RoaringBitmap;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -55,7 +54,7 @@ public class KeywordLoaderService {
         logger.info("Loading keywords from {}", file);
 
         stream.filter(DocumentRecordKeywordsProjection::hasKeywords)
-                .forEach(proj -> insertKeywords(domainIdRegistry, proj));
+               .forEach(proj -> insertKeywords(domainIdRegistry, proj));
         }
     }
 
@@ -78,4 +77,12 @@ public class KeywordLoaderService {
                 projection.length,
                 words);
     }
 
+    public void close() {
+        try {
+            writer.close();
+        } catch (Exception e) {
+            logger.error("Failed to close writer", e);
+        }
+    }
 }
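
Together with the LoaderMain change above, shutdown responsibility moves into the service: closing the keyword loader now also finalizes its underlying journal writer, and close() logs rather than propagates failures so a teardown sequence can run every close unconditionally. The integration test below relies on this to finalize the keyword journal before index construction:

    // From the test's load phase: both must be closed to finalize the associated files
    documentDbWriter.close();
    keywordLoaderService.close();   // flushes and closes the underlying LoaderIndexJournalWriter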
DomainLoaderService.java
@@ -10,6 +10,7 @@ import nu.marginalia.loading.LoaderInputData;
 import nu.marginalia.model.EdgeDomain;
 import nu.marginalia.model.processed.DomainRecord;
 import nu.marginalia.model.processed.DomainWithIp;
+import nu.marginalia.process.control.ProcessHeartbeat;
 import nu.marginalia.process.control.ProcessHeartbeatImpl;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -108,7 +109,7 @@ public class DomainLoaderService {
         return domainNamesAll;
     }
 
-    public boolean loadDomainMetadata(DomainIdRegistry domainIdRegistry, ProcessHeartbeatImpl heartbeat, LoaderInputData inputData) {
+    public boolean loadDomainMetadata(DomainIdRegistry domainIdRegistry, ProcessHeartbeat heartbeat, LoaderInputData inputData) {
 
         var files = inputData.listDomainFiles();
 
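
Accepting the interface rather than the concrete ProcessHeartbeatImpl is what lets tests pass the fake. A hypothetical call in the style the integration test uses for the other loader services:

    // valid now that the parameter is the ProcessHeartbeat interface
    domainService.loadDomainMetadata(domainIdRegistry, new FakeProcessHeartbeat(), inputData);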
code/tools/integration-test/build.gradle (new file, 47 lines)
@@ -0,0 +1,47 @@ (new file)
plugins {
    id 'java'


    id 'jvm-test-suite'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(rootProject.ext.jvmVersion))
    }
}

apply from: "$rootProject.projectDir/srcsets.gradle"

dependencies {
    implementation project(':code:processes:crawling-process')
    implementation project(':code:processes:converting-process')
    implementation project(':code:processes:loading-process')
    implementation project(':code:process-models:crawling-model')
    implementation project(':code:process-models:processed-data')
    implementation project(':code:processes:index-constructor-process')
    implementation project(':code:index')
    implementation project(':code:functions:search-query:api')
    implementation project(':code:index:index-reverse')
    implementation project(':code:index:index-forward')
    implementation project(':code:index:query')
    implementation project(':code:index:index-journal')
    implementation project(':code:functions:link-graph:partition')
    implementation project(':code:libraries:array')
    implementation project(':code:common:db')
    implementation project(':code:common:config')
    implementation project(':code:common:linkdb')
    implementation project(':code:common:process')
    implementation project(':code:common:service')
    implementation project(':code:common:model')

    implementation libs.bundles.slf4j
    implementation libs.bundles.grpc
    implementation libs.mockito
    implementation libs.notnull
    implementation libs.guice
    implementation libs.fastutil
    implementation libs.trove

    testImplementation libs.bundles.junit
}
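
Assuming standard Gradle test wiring and the settings.gradle registration at the end of this diff, the new suite should be runnable in isolation with something like ./gradlew :code:tools:integration-test:test.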
code/tools/integration-test/test/nu/marginalia/IntegrationTest.java (new file, 316 lines)
@@ -0,0 +1,316 @@ (new file)
package nu.marginalia;

import com.google.inject.Guice;
import com.google.inject.Inject;
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
import nu.marginalia.api.searchquery.model.query.SearchQuery;
import nu.marginalia.api.searchquery.model.query.SearchSpecification;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.converting.processor.DomainProcessor;
import nu.marginalia.converting.writer.ConverterBatchWriter;
import nu.marginalia.crawl.retreival.DomainProber;
import nu.marginalia.crawl.retreival.fetcher.ContentTags;
import nu.marginalia.crawl.retreival.fetcher.warc.WarcRecorder;
import nu.marginalia.crawling.io.CrawledDomainReader;
import nu.marginalia.crawling.parquet.CrawledDocumentParquetRecordFileWriter;
import nu.marginalia.index.IndexGrpcService;
import nu.marginalia.index.ReverseIndexFullFileNames;
import nu.marginalia.index.ReverseIndexPrioFileNames;
import nu.marginalia.index.construction.ReverseIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.query.limit.QueryLimits;
import nu.marginalia.index.query.limit.SpecificationLimit;
import nu.marginalia.index.searchset.SearchSetAny;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.linkdb.docs.DocumentDbWriter;
import nu.marginalia.loading.LoaderIndexJournalWriter;
import nu.marginalia.loading.LoaderInputData;
import nu.marginalia.loading.documents.DocumentLoaderService;
import nu.marginalia.loading.documents.KeywordLoaderService;
import nu.marginalia.loading.domains.DomainIdRegistry;
import nu.marginalia.loading.links.DomainLinksLoaderService;
import nu.marginalia.model.EdgeDomain;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageBaseType;
import org.junit.jupiter.api.*;
import org.mockito.Mockito;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.List;
import java.util.function.LongPredicate;

import static nu.marginalia.index.journal.reader.IndexJournalReader.FILE_HEADER_SIZE_BYTES;
import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.when;

public class IntegrationTest {
    IntegrationTestModule testModule;
    @Inject
    DomainProcessor domainProcessor;

    @Inject
    DomainLinksLoaderService linksService;
    @Inject
    KeywordLoaderService keywordLoaderService;
    @Inject
    DocumentLoaderService documentLoaderService;

    @Inject
    FileStorageService fileStorageService;

    @Inject
    DomainRankings domainRankings;

    @Inject
    DocumentDbWriter documentDbWriter;
    @Inject
    LoaderIndexJournalWriter journalWriter;

    Path warcData = null;
    Path crawlDataParquet = null;
    Path processedDataDir = null;

    @Inject
    StatefulIndex statefulIndex;
    @Inject
    IndexGrpcService indexGrpcService;
    @Inject
    DocumentDbReader documentDbReader;

    @BeforeEach
    public void setupTest() throws IOException {
        testModule = new IntegrationTestModule();

        Guice.createInjector(testModule).injectMembers(this);

        warcData = Files.createTempFile("warc", ".warc.gz");
        crawlDataParquet = Files.createTempFile("crawl", ".parquet");
        processedDataDir = Files.createTempDirectory("processed");
    }

    @AfterEach
    public void tearDownTest() throws IOException {
        Files.deleteIfExists(warcData);
        Files.deleteIfExists(crawlDataParquet);
        TestUtil.clearTempDir(processedDataDir);

        testModule.cleanUp();
    }

    @Test
    public void run() throws Exception {

        /** CREATE WARC */
        try (WarcRecorder warcRecorder = new WarcRecorder(warcData)) {
            warcRecorder.writeWarcinfoHeader("127.0.0.1", new EdgeDomain("www.example.com"),
                    new DomainProber.ProbeResultOk(new EdgeUrl("https://www.example.com/")));

            warcRecorder.writeReferenceCopy(new EdgeUrl("https://www.example.com/"),
                    "text/html", 200,
                    """
                    <html>
                    <h1>Hello World</h1>
                    <body>
                    <p>The best description of my problem solving process is the Feynman algorithm, which is sometimes presented as a joke where the hidden subtext is “be smart”, but I disagree. The “algorithm” is a surprisingly lucid description of how thinking works in the context of hard problems where the answer can’t simply be looked up or trivially broken down, iterated upon in a bottom-up fashion, or approached with similar methods.
                    The trick is that there is no trick. This is how thinking works. It appears that when you feed your brain related information, without further active involvement, it starts to digest the information you’ve fed it.
                    </p>
                    </body>
                    </html>
                    """,
                    ContentTags.empty()
            );
        }

        /** CONVERT WARC */
        CrawledDocumentParquetRecordFileWriter.convertWarc(
                "www.example.com",
                new UserAgent("search.marginalia.nu",
                        "search.marginalia.nu"),
                warcData,
                crawlDataParquet);

        /** PROCESS CRAWL DATA */

        var processedDomain = domainProcessor.fullProcessing(CrawledDomainReader.createDataStream(crawlDataParquet));

        System.out.println(processedDomain);

        /** WRITE PROCESSED DATA */

        try (ConverterBatchWriter cbw = new ConverterBatchWriter(processedDataDir, 0)) {
            cbw.writeProcessedDomain(processedDomain);

        }
        // Write a single batch-switch marker in the process log so that the loader will read the data
        Files.writeString(processedDataDir.resolve("processor.log"), "F\n", StandardOpenOption.CREATE_NEW);

        /** LOAD PROCESSED DATA */

        LoaderInputData inputData = new LoaderInputData(List.of(processedDataDir));

        DomainIdRegistry domainIdRegistry = Mockito.mock(DomainIdRegistry.class);
        when(domainIdRegistry.getDomainId(any())).thenReturn(1);

        linksService.loadLinks(domainIdRegistry, new FakeProcessHeartbeat(), inputData);
        keywordLoaderService.loadKeywords(domainIdRegistry, new FakeProcessHeartbeat(), inputData);
        documentLoaderService.loadDocuments(domainIdRegistry, new FakeProcessHeartbeat(), inputData);

        // These must be closed to finalize the associated files
        documentDbWriter.close();
        keywordLoaderService.close();

        Path journalFile = fileStorageService
                .getStorageBase(FileStorageBaseType.CURRENT)
                .asPath()
                .resolve("iw/page-index-0000.dat");

        assertTrue(Files.exists(journalFile), "Journal file not found: " + journalFile);
        assertTrue(Files.size(journalFile) > FILE_HEADER_SIZE_BYTES, "Journal file does not contain data");

        /** CONSTRUCT INDEX */

        createForwardIndex();
        createFullReverseIndex();
        createPrioReverseIndex();

        /** SWITCH INDEX */

        statefulIndex.switchIndex();

        // Move the docdb file to the live location
        Files.move(
                IndexLocations.getLinkdbWritePath(fileStorageService).resolve(DOCDB_FILE_NAME),
                IndexLocations.getLinkdbLivePath(fileStorageService).resolve(DOCDB_FILE_NAME)
        );
        // Reconnect the document reader to the new docdb file
        documentDbReader.reconnect();

        /** QUERY */
        var rs = indexGrpcService.executeSearch(new SearchParameters(new SearchSpecification(
                new SearchQuery("problem solving process",
                        List.of("problem", "solving", "process"),
                        List.of(),
                        List.of(),
                        List.of(),
                        List.of(new SearchCoherenceConstraint(true, List.of("problem", "solving", "process")))
                ),
                null,
                "NONE",
                "feynman",
                SpecificationLimit.none(),
                SpecificationLimit.none(),
                SpecificationLimit.none(),
                SpecificationLimit.none(),
                new QueryLimits(10, 10, 100, 100),
                QueryStrategy.AUTO,
                ResultRankingParameters.sensibleDefaults()
        ), new SearchSetAny()));

        System.out.println(rs);
    }

    private void createFullReverseIndex() throws IOException {

        Path outputFileDocs = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.NEXT);
        Path outputFileWords = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.NEXT);
        Path outputFilePositions = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.NEXT);

        Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
        Path tmpDir = workDir.resolve("tmp");

        if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);

        var constructor = new ReverseIndexConstructor(
                outputFileDocs,
                outputFileWords,
                outputFilePositions,
                IndexJournalReader::singleFile,
                this::addRankToIdEncoding,
                tmpDir);

        constructor.createReverseIndex(new FakeProcessHeartbeat(), "createReverseIndexFull", workDir);

    }

    private void createPrioReverseIndex() throws IOException {

        Path outputFileDocs = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.NEXT);
        Path outputFileWords = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.NEXT);
        Path outputFilePositions = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.POSITIONS, ReverseIndexPrioFileNames.FileVersion.NEXT);

        Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
        Path tmpDir = workDir.resolve("tmp");

        // The priority index only includes words that have bits indicating they are
        // important to the document. This filter will act on the encoded {@see WordMetadata}
        LongPredicate wordMetaFilter = getPriorityIndexWordMetaFilter();

        var constructor = new ReverseIndexConstructor(
                outputFileDocs,
                outputFileWords,
                outputFilePositions,
                (path) -> IndexJournalReader.singleFile(path).filtering(wordMetaFilter),
                this::addRankToIdEncoding,
                tmpDir);

        constructor.createReverseIndex(new FakeProcessHeartbeat(), "createReverseIndexPrio", workDir);
    }

    private static LongPredicate getPriorityIndexWordMetaFilter() {

        long highPriorityFlags =
                WordFlags.Title.asBit()
                | WordFlags.Subjects.asBit()
                | WordFlags.TfIdfHigh.asBit()
                | WordFlags.NamesWords.asBit()
                | WordFlags.UrlDomain.asBit()
                | WordFlags.UrlPath.asBit()
                | WordFlags.Site.asBit()
                | WordFlags.ExternalLink.asBit()
                | WordFlags.SiteAdjacent.asBit();

        return r -> WordMetadata.hasAnyFlags(r, highPriorityFlags);
    }

    private void createForwardIndex() throws IOException {

        Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
        Path outputFileDocsId = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT);
        Path outputFileDocsData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT);

        ForwardIndexConverter converter = new ForwardIndexConverter(new FakeProcessHeartbeat(),
                IndexJournalReader.paging(workDir),
                outputFileDocsId,
                outputFileDocsData,
                domainRankings
        );

        converter.convert();
    }

    private long addRankToIdEncoding(long docId) {
        return UrlIdCodec.addRank(
                255,
                docId);
    }

}
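
One liberty the test takes: addRankToIdEncoding packs a fixed rank of 255 into every combined document id, where production index construction would derive a per-domain rank from DomainRankings; with a single mocked domain there is nothing meaningful to rank. In effect:

    // every document gets the same fixed rank in this test
    long encoded = UrlIdCodec.addRank(255, docId);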
code/tools/integration-test/test/nu/marginalia/IntegrationTestModule.java (new file, 161 lines)
@@ -0,0 +1,161 @@ (new file)
package nu.marginalia;

import com.google.inject.AbstractModule;
import com.google.inject.Inject;
import com.google.inject.Provides;
import com.google.inject.Singleton;
import com.google.inject.name.Names;
import gnu.trove.list.array.TIntArrayList;
import nu.marginalia.db.DomainTypes;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl;
import nu.marginalia.index.searchset.SearchSetAny;
import nu.marginalia.index.searchset.SearchSetsService;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.linkdb.docs.DocumentDbWriter;
import nu.marginalia.linkgraph.io.DomainLinksWriter;
import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.service.ServiceId;
import nu.marginalia.service.control.FakeServiceHeartbeat;
import nu.marginalia.service.control.ServiceEventLog;
import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.service.module.ServiceConfiguration;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.storage.model.FileStorageBase;
import nu.marginalia.storage.model.FileStorageBaseType;
import org.mockito.Mockito;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Random;
import java.util.UUID;

import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME;
import static nu.marginalia.linkdb.LinkdbFileNames.DOMAIN_LINKS_FILE_NAME;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.Mockito.when;

public class IntegrationTestModule extends AbstractModule {
    Path workDir;
    Path slowDir;
    Path fastDir;
    Path indexDir;

    Random random = new Random();

    public IntegrationTestModule() throws IOException {
        workDir = Files.createTempDirectory("IntegrationTest");
        slowDir = workDir.resolve("slow");
        fastDir = workDir.resolve("fast");
        indexDir = workDir.resolve("index");

        Files.createDirectory(slowDir);
        Files.createDirectory(fastDir);
    }

    public void cleanUp() {
        TestUtil.clearTempDir(workDir);
    }

    @Override
    protected void configure() {

        try {
            var fileStorageServiceMock = Mockito.mock(FileStorageService.class);
            Mockito.when(fileStorageServiceMock.getStorageBase(FileStorageBaseType.WORK))
                    .thenReturn(new FileStorageBase(null, null, 0, null, slowDir.toString()));
            Mockito.when(fileStorageServiceMock.getStorageBase(FileStorageBaseType.CURRENT))
                    .thenReturn(new FileStorageBase(null, null, 0, null, fastDir.toString()));
            Mockito.when(fileStorageServiceMock.getStorageBase(FileStorageBaseType.STORAGE))
                    .thenReturn(new FileStorageBase(null, null, 0, null, fastDir.toString()));

            bind(DocumentDbReader.class).toInstance(new DocumentDbReader(
                    IndexLocations.getLinkdbLivePath(fileStorageServiceMock)
                            .resolve(DOCDB_FILE_NAME)
            ));

            bind(FileStorageService.class).toInstance(fileStorageServiceMock);
            bind(ServiceHeartbeat.class).toInstance(new FakeServiceHeartbeat());
            bind(ProcessHeartbeat.class).toInstance(new FakeProcessHeartbeat());

            SearchSetsService setsServiceMock = Mockito.mock(SearchSetsService.class);
            when(setsServiceMock.getSearchSetByName("NONE")).thenReturn(new SearchSetAny());
            when(setsServiceMock.getDomainRankings()).thenReturn(new DomainRankings());
            bind(SearchSetsService.class).toInstance(setsServiceMock);

            DomainTypes domainTypes = Mockito.mock(DomainTypes.class);
            when(domainTypes.getAllDomainsByType(any())).thenReturn(new ArrayList<>());
            when(domainTypes.getKnownDomainsByType(any())).thenReturn(new TIntArrayList());
            when(domainTypes.downloadList(any())).thenReturn(new ArrayList<>());
            bind(DomainTypes.class).toInstance(domainTypes);

            bind(ServiceEventLog.class).toInstance(Mockito.mock(ServiceEventLog.class));

            bind(IndexJournalWriter.class).toInstance(new IndexJournalWriterPagingImpl(
                    IndexLocations.getIndexConstructionArea(fileStorageServiceMock)
            ));

            bind(ServiceConfiguration.class).toInstance(new ServiceConfiguration(
                    ServiceId.Index,
                    0,
                    "127.0.0.1",
                    "127.0.0.1",
                    randomPort(),
                    UUID.randomUUID()
            ));

            bind(ProcessConfiguration.class).toInstance(new ProcessConfiguration(
                    "TEST",
                    0,
                    UUID.randomUUID()));

            bind(Double.class).annotatedWith(Names.named("min-document-quality")).toInstance(-15.);
            bind(Integer.class).annotatedWith(Names.named("min-document-length")).toInstance(32);
            bind(Integer.class).annotatedWith(Names.named("max-title-length")).toInstance(128);
            bind(Integer.class).annotatedWith(Names.named("max-summary-length")).toInstance(255);

            bind(Path.class).annotatedWith(Names.named("local-index-path")).toInstance(indexDir);

            bind(LanguageModels.class).toInstance(WmsaHome.getLanguageModels());

        } catch (IOException | SQLException e) {
            throw new RuntimeException(e);
        }

    }

    @Inject
    @Provides
    @Singleton
    private DocumentDbWriter createLinkdbWriter(FileStorageService service) throws SQLException, IOException {
        // Migrate
        Path dbPath = IndexLocations.getLinkdbWritePath(service).resolve(DOCDB_FILE_NAME);

        if (Files.exists(dbPath)) {
            Files.delete(dbPath);
        }
        return new DocumentDbWriter(dbPath);
    }

    @Inject @Provides @Singleton
    private DomainLinksWriter createDomainLinkdbWriter(FileStorageService service) throws SQLException, IOException {

        Path dbPath = IndexLocations.getLinkdbWritePath(service).resolve(DOMAIN_LINKS_FILE_NAME);

        if (Files.exists(dbPath)) {
            Files.delete(dbPath);
        }

        return new DomainLinksWriter(dbPath);
    }

    private int randomPort() {
        return random.nextInt(10000, 30000);
    }
}
code/tools/integration-test/test/nu/marginalia/TestUtil.java (new file, 52 lines)
@@ -0,0 +1,52 @@ (new file)
package nu.marginalia;

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;

public class TestUtil {
    public static void clearTempDir(Path path) {
        if (!Files.exists(path))
            return;

        if (Files.isDirectory(path)) {
            for (File f : path.toFile().listFiles()) {
                if (f.isDirectory()) {
                    File[] files = f.listFiles();
                    if (files != null) {
                        Arrays.stream(files).map(File::toPath).forEach(TestUtil::clearTempDir);
                    }
                }
                else {
                    System.out.println("Deleting " + f + " (" + fileSize(f.toPath()) + ")");
                    f.delete();
                }
            }
        }
        else {
            System.out.println("Deleting " + path + " (" + fileSize(path) + ")");
        }
        path.toFile().delete();
    }

    private static String fileSize(Path path) {
        try {
            long sizeBytes = Files.size(path);

            if (sizeBytes > 1024 * 1024 * 1024) return round(sizeBytes / 1073741824.) + "Gb";
            if (sizeBytes > 1024 * 1024) return round(sizeBytes / 1048576.) + "Mb";
            if (sizeBytes > 1024) return round(sizeBytes / 1024.) + "Kb";
            return sizeBytes + "b";
        }
        catch (IOException ex) {
            throw new RuntimeException(ex);
        }
    }

    private static String round(double d) {
        return String.format("%.2f", d);
    }
}
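
For reference, TestUtil's recursive delete walks File arrays by hand; a more compact java.nio equivalent, offered as a sketch rather than as something the project uses:

    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.util.Comparator;

    static void clearTempDir(Path path) throws java.io.IOException {
        if (!Files.exists(path))
            return;
        try (var walk = Files.walk(path)) {
            walk.sorted(Comparator.reverseOrder())   // delete children before their parents
                .forEach(p -> p.toFile().delete());
        }
    }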
settings.gradle
@@ -95,6 +95,7 @@ include 'code:process-models:processed-data'
 include 'code:tools:experiment-runner'
 include 'code:tools:screenshot-capture-tool'
 include 'code:tools:load-test'
+include 'code:tools:integration-test'
 
 include 'third-party:porterstemmer'
 include 'third-party:symspell'