(index) Move to a lexicon-free index design
This is a system-wide change. The index used to have a lexicon, mapping words to wordIds using a large in-memory hash table. This made index construction easier, but it also added a fairly significant RAM penalty to both the index service and the loader. The new design moves to 64-bit word identifiers calculated from the murmur hash of the keyword, and to an index construction process based on merging smaller indices. Halfway through it also became necessary to upgrade Guice, as its error reporting wasn't *quite* compatible with JDK 20.
This commit is contained in:
parent 4e694fdff6
commit 3101b74580
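Since there is no longer a lexicon to assign word ids, each keyword's identifier must be derivable from the keyword itself. A minimal illustrative sketch of that idea in Java follows; it uses Guava's murmur3_128 as a stand-in hash, so the exact hash variant, seed, and helper names here are assumptions rather than the code in this commit:

    import com.google.common.hash.Hashing;
    import java.nio.charset.StandardCharsets;

    class WordIdSketch {
        // Hypothetical helper: derive a stable 64-bit word id straight from the keyword,
        // so no shared in-memory lexicon is needed (hash variant and seed are illustrative).
        static long wordId(String keyword) {
            return Hashing.murmur3_128()
                    .hashString(keyword, StandardCharsets.UTF_8)
                    .asLong(); // first 64 bits of the 128-bit hash
        }

        public static void main(String[] args) {
            // Any process computes the same id for the same keyword, which is what
            // allows smaller indices to be built independently and merged later.
            System.out.println(Long.toHexString(wordId("marginalia")));
        }
    }

Because the ids are stable hashes rather than lexicon offsets, partial indices built from separate journal files agree on word ids and can be combined by merging their sorted word-id segments, as the new ReversePreindex and ReverseIndexConstructor code in the diff below does.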
@@ -7,8 +7,8 @@ import nu.marginalia.model.EdgeDomain;
 import org.slf4j.LoggerFactory;
 import org.slf4j.Logger;
 
-import javax.inject.Inject;
-import javax.inject.Singleton;
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
@@ -5,8 +5,8 @@ import nu.marginalia.db.storage.model.*;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import javax.inject.Inject;
-import javax.inject.Singleton;
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
 import java.io.File;
 import java.io.FileNotFoundException;
 import java.io.IOException;
@@ -5,11 +5,9 @@ public enum FileStorageType {
 CRAWL_DATA,
 PROCESSED_DATA,
 INDEX_STAGING,
-LEXICON_STAGING,
 LINKDB_STAGING,
 LINKDB_LIVE,
 INDEX_LIVE,
-LEXICON_LIVE,
 BACKUP,
 EXPORT,
 SEARCH_SETS
@@ -0,0 +1 @@
+DELETE FROM FILE_STORAGE WHERE TYPE IN ('LEXICON_STAGING', 'LEXICON_LIVE');
@@ -7,7 +7,7 @@ import nu.marginalia.language.model.WordRep;
 import nu.marginalia.term_frequency_dict.TermFrequencyDict;
 import nu.marginalia.model.EdgeUrl;
 
-import javax.inject.Inject;
+import com.google.inject.Inject;
 import java.util.*;
 import java.util.stream.Stream;
 
@@ -2,6 +2,7 @@ package nu.marginalia.ranking;
 
 import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
 import it.unimi.dsi.fastutil.ints.Int2ShortOpenHashMap;
+import nu.marginalia.model.id.UrlIdCodec;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -37,7 +38,8 @@ public class DomainRankings {
 return rankings.getOrDefault(domainId, (short) MAX_RANK_VALUE);
 }
 
-public float getSortRanking(int domainId) {
+public float getSortRanking(long docId) {
+int domainId = UrlIdCodec.getDomainId(docId);
 return rankings.getOrDefault(domainId, (short) MAX_RANK_VALUE) / (float) MAX_RANK_VALUE;
 }
 
@@ -16,7 +16,6 @@ dependencies {
 implementation project(':code:features-index:domain-ranking')
 implementation project(':code:features-index:index-query')
 implementation project(':code:features-index:index-journal')
-implementation project(':code:features-index:lexicon')
 implementation project(':code:common:model')
 implementation project(':code:common:process')
 
@@ -21,23 +21,23 @@ import java.nio.file.Path;
 public class ForwardIndexConverter {
 
 private final ProcessHeartbeat heartbeat;
-private final File inputFile;
 
 private final Logger logger = LoggerFactory.getLogger(getClass());
 
+private final IndexJournalReader journalReader;
 private final Path outputFileDocsId;
 private final Path outputFileDocsData;
 private final DomainRankings domainRankings;
 
 
 public ForwardIndexConverter(ProcessHeartbeat heartbeat,
-File inputFile,
+IndexJournalReader journalReader,
 Path outputFileDocsId,
 Path outputFileDocsData,
 DomainRankings domainRankings
 ) {
 this.heartbeat = heartbeat;
-this.inputFile = inputFile;
+this.journalReader = journalReader;
 this.outputFileDocsId = outputFileDocsId;
 this.outputFileDocsData = outputFileDocsData;
 this.domainRankings = domainRankings;
@@ -54,14 +54,6 @@ public class ForwardIndexConverter {
 public void convert() throws IOException {
 deleteOldFiles();
 
-IndexJournalReaderSingleCompressedFile journalReader = new IndexJournalReaderSingleCompressedFile(inputFile.toPath());
-if (journalReader.fileHeader().fileSize() <= IndexJournalReader.FILE_HEADER_SIZE_BYTES) {
-logger.warn("Bailing: Journal is empty!");
-return;
-}
-
-logger.info("Converting {} {}", inputFile, journalReader.fileHeader);
-
 logger.info("Domain Rankings size = {}", domainRankings.size());
 
 try (var progress = heartbeat.createProcessTaskHeartbeat(TaskSteps.class, "forwardIndexConverter")) {
@@ -2,15 +2,13 @@ package nu.marginalia.index.forward;
 
 import lombok.SneakyThrows;
 import nu.marginalia.index.journal.model.IndexJournalEntry;
-import nu.marginalia.index.journal.writer.IndexJournalWriterImpl;
+import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;
 import nu.marginalia.index.journal.writer.IndexJournalWriter;
-import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode;
+import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
 import nu.marginalia.model.id.UrlIdCodec;
 import nu.marginalia.process.control.ProcessHeartbeat;
 import nu.marginalia.process.control.ProcessTaskHeartbeat;
 import nu.marginalia.ranking.DomainRankings;
-import nu.marginalia.lexicon.KeywordLexicon;
-import nu.marginalia.lexicon.journal.KeywordLexiconJournal;
 import nu.marginalia.test.TestUtil;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
@@ -29,7 +27,6 @@ import static org.mockito.Mockito.when;
 
 class ForwardIndexConverterTest {
 
-KeywordLexicon keywordLexicon;
 IndexJournalWriter writer;
 
 Path indexFile;
@@ -50,12 +47,9 @@ class ForwardIndexConverterTest {
 dictionaryFile = Files.createTempFile("tmp", ".dict");
 dictionaryFile.toFile().deleteOnExit();
 
-keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile(), KeywordLexiconJournalMode.READ_WRITE));
-keywordLexicon.getOrInsert("0");
-
 indexFile = Files.createTempFile("tmp", ".idx");
 indexFile.toFile().deleteOnExit();
-writer = new IndexJournalWriterImpl(keywordLexicon, indexFile);
+writer = new IndexJournalWriterSingleFileImpl(indexFile);
 
 wordsFile1 = Files.createTempFile("words1", ".idx");
 urlsFile1 = Files.createTempFile("urls1", ".idx");
@@ -63,11 +57,9 @@ class ForwardIndexConverterTest {
 dataDir = Files.createTempDirectory(getClass().getSimpleName());
 
 for (int i = 1; i < workSetSize; i++) {
-createEntry(writer, keywordLexicon, i);
+createEntry(writer, i);
 }
 
-keywordLexicon.commitToDisk();
 writer.close();
 
 
@@ -88,13 +80,13 @@ class ForwardIndexConverterTest {
 return UrlIdCodec.encodeId((int) domain, (int) url);
 }
 
-public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) {
+public void createEntry(IndexJournalWriter writer, int id) {
 int[] factors = getFactorsI(id);
 
 var entryBuilder = IndexJournalEntry.builder(createId(id, id/20), id%5);
 
 for (int i = 0; i+1 < factors.length; i+=2) {
-entryBuilder.add(keywordLexicon.getOrInsert(Integer.toString(factors[i])), -factors[i+1]);
+entryBuilder.add(factors[i], -factors[i+1]);
 }
 
 writer.put(entryBuilder.build());
@@ -108,7 +100,7 @@ class ForwardIndexConverterTest {
 when(serviceHeartbeat.createProcessTaskHeartbeat(Mockito.any(), Mockito.any()))
 .thenReturn(Mockito.mock(ProcessTaskHeartbeat.class));
 
-new ForwardIndexConverter(serviceHeartbeat, indexFile.toFile(), docsFileId, docsFileData, new DomainRankings()).convert();
+new ForwardIndexConverter(serviceHeartbeat, new IndexJournalReaderSingleCompressedFile(indexFile), docsFileId, docsFileData, new DomainRankings()).convert();
 
 var forwardReader = new ForwardIndexReader(docsFileId, docsFileData);
 
@@ -13,7 +13,6 @@ java {
 dependencies {
 implementation project(':code:libraries:array')
 implementation project(':code:common:model')
-implementation project(':code:features-index:lexicon')
 
 implementation libs.lombok
 annotationProcessor libs.lombok
@@ -22,6 +21,7 @@ dependencies {
 implementation libs.prometheus
 implementation libs.notnull
 implementation libs.rxjava
+implementation libs.guava
 implementation libs.trove
 implementation libs.zstd
 implementation libs.commons.lang3
@@ -58,9 +58,9 @@ public class IndexJournalEntryData implements Iterable<IndexJournalEntryData.Rec
 public Record next() {
 pos+=ENTRY_SIZE;
 
-return new Record((int) underlyingArray[pos], underlyingArray[pos+1]);
+return new Record(underlyingArray[pos], underlyingArray[pos+1]);
 }
 }
 
-public record Record(int wordId, long metadata) {}
+public record Record(long wordId, long metadata) {}
 }
@@ -8,6 +8,7 @@ import java.io.DataInputStream;
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.nio.LongBuffer;
+import java.util.Arrays;
 
 public class IndexJournalReadEntry {
 public final IndexJournalEntryHeader header;
@@ -3,25 +3,33 @@ package nu.marginalia.index.journal.reader;
 import nu.marginalia.index.journal.model.IndexJournalEntryData;
 import nu.marginalia.index.journal.model.IndexJournalFileHeader;
 import nu.marginalia.index.journal.model.IndexJournalStatistics;
+import nu.marginalia.model.idx.WordFlags;
 import org.jetbrains.annotations.NotNull;
 
 import java.io.IOException;
+import java.nio.file.Path;
 import java.util.Iterator;
 import java.util.function.IntConsumer;
 import java.util.function.LongConsumer;
+import java.util.function.Predicate;
 
 public interface IndexJournalReader extends Iterable<IndexJournalReadEntry> {
 int FILE_HEADER_SIZE_LONGS = 2;
 int FILE_HEADER_SIZE_BYTES = 8 * FILE_HEADER_SIZE_LONGS;
 
-IndexJournalFileHeader fileHeader();
+static IndexJournalReader singleFile(Path fileName) throws IOException {
+return new IndexJournalReaderSingleCompressedFile(fileName);
+}
+static IndexJournalReader paging(Path baseDir) throws IOException {
+return new IndexJournalReaderPagingImpl(baseDir);
+}
 
-IndexJournalStatistics getStatistics();
-void forEachWordId(IntConsumer consumer);
+static IndexJournalReader withFilters(Path path, Predicate<IndexJournalReadEntry> entryPredicate, Predicate<IndexJournalEntryData.Record> recordPredicate) throws IOException {
+return new IndexJournalReaderSingleCompressedFile(path, entryPredicate, recordPredicate);
+}
 
 
-void forEachDocIdWordId(LongIntConsumer consumer);
+void forEachWordId(LongConsumer consumer);
 
 void forEachDocIdRecord(LongObjectConsumer<IndexJournalEntryData.Record> consumer);
 
@@ -33,13 +41,23 @@ public interface IndexJournalReader extends Iterable<IndexJournalReadEntry> {
 
 void close() throws IOException;
 
-interface BiIntConsumer {
-void accept(int left, int right);
+static IndexJournalReader singleFileWithPriorityFilters(Path path) throws IOException {
+long highPriorityFlags =
+WordFlags.Title.asBit()
+| WordFlags.Subjects.asBit()
+| WordFlags.TfIdfHigh.asBit()
+| WordFlags.NamesWords.asBit()
+| WordFlags.UrlDomain.asBit()
+| WordFlags.UrlPath.asBit()
+| WordFlags.Site.asBit()
+| WordFlags.SiteAdjacent.asBit();
+
+return new IndexJournalReaderSingleCompressedFile(path, null,
+r -> (r.metadata() & highPriorityFlags) != 0);
 }
 
-interface LongIntConsumer {
-void accept(long left, int right);
-}
-
 interface LongObjectConsumer<T> {
 void accept(long left, T right);
@@ -0,0 +1,61 @@
+package nu.marginalia.index.journal.reader;
+
+import com.google.common.collect.Iterators;
+import nu.marginalia.index.journal.model.IndexJournalEntryData;
+import nu.marginalia.index.journal.model.IndexJournalStatistics;
+import nu.marginallia.index.journal.IndexJournalFileNames;
+import org.jetbrains.annotations.NotNull;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.function.LongConsumer;
+
+public class IndexJournalReaderPagingImpl implements IndexJournalReader {
+
+private final List<IndexJournalReader> readers;
+
+public IndexJournalReaderPagingImpl(Path baseDir) throws IOException {
+var inputFiles = IndexJournalFileNames.findJournalFiles(baseDir);
+this.readers = new ArrayList<>(inputFiles.size());
+
+for (var inputFile : inputFiles) {
+readers.add(new IndexJournalReaderSingleCompressedFile(inputFile));
+}
+}
+
+@Override
+public void forEachWordId(LongConsumer consumer) {
+for (var reader : readers) {
+reader.forEachWordId(consumer);
+}
+}
+
+@Override
+public void forEachDocIdRecord(LongObjectConsumer<IndexJournalEntryData.Record> consumer) {
+for (var reader : readers) {
+reader.forEachDocIdRecord(consumer);
+}
+}
+
+@Override
+public void forEachDocId(LongConsumer consumer) {
+for (var reader : readers) {
+reader.forEachDocId(consumer);
+}
+}
+
+@Override
+public @NotNull Iterator<IndexJournalReadEntry> iterator() {
+return Iterators.concat(readers.stream().map(IndexJournalReader::iterator).iterator());
+}
+
+@Override
+public void close() throws IOException {
+for (var reader : readers) {
+reader.close();
+}
+}
+}
@@ -12,6 +12,7 @@ import java.io.*;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
+import java.util.Arrays;
 import java.util.Iterator;
 import java.util.function.IntConsumer;
 import java.util.function.LongConsumer;
@@ -19,15 +20,22 @@ import java.util.function.Predicate;
 
 public class IndexJournalReaderSingleCompressedFile implements IndexJournalReader {
 
-private static Path journalFile;
+private Path journalFile;
 public final IndexJournalFileHeader fileHeader;
 
+@Override
+public String toString() {
+return "IndexJournalReaderSingleCompressedFile{" + journalFile + " }";
+}
+
 private DataInputStream dataInputStream = null;
 
 final Predicate<IndexJournalReadEntry> entryPredicate;
 final Predicate<IndexJournalEntryData.Record> recordPredicate;
 
 public IndexJournalReaderSingleCompressedFile(Path file) throws IOException {
+this.journalFile = file;
+
 fileHeader = readHeader(file);
 
 this.recordPredicate = null;
@@ -35,7 +43,8 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReade
 }
 
 public IndexJournalReaderSingleCompressedFile(Path file, Predicate<IndexJournalReadEntry> entryPredicate, Predicate<IndexJournalEntryData.Record> recordPredicate) throws IOException {
-journalFile = file;
+this.journalFile = file;
+
 fileHeader = readHeader(file);
 
 this.recordPredicate = recordPredicate;
@@ -43,8 +52,6 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReade
 }
 
 private static IndexJournalFileHeader readHeader(Path file) throws IOException {
-journalFile = file;
-
 try (var raf = new RandomAccessFile(file.toFile(), "r")) {
 long unused = raf.readLong();
 long wordCount = raf.readLong();
@@ -62,10 +69,6 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReade
 return new DataInputStream(new ZstdInputStream(new BufferedInputStream(fileInputStream)));
 }
 
-public IndexJournalFileHeader fileHeader() {
-return fileHeader;
-}
-
 public boolean filter(IndexJournalReadEntry entry) {
 return entryPredicate == null || entryPredicate.test(entry);
 }
@@ -81,31 +84,7 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReade
 
 
 @Override
-public IndexJournalStatistics getStatistics() {
-int highestWord = 0;
-
-// Docs cardinality is a candidate for a HyperLogLog
-Roaring64Bitmap docsBitmap = new Roaring64Bitmap();
-
-for (var entry : this) {
-var entryData = entry.readEntry();
-
-if (filter(entry)) {
-docsBitmap.addLong(entry.docId() & 0x0000_0000_FFFF_FFFFL);
-
-for (var item : entryData) {
-if (filter(entry, item)) {
-highestWord = Integer.max(item.wordId(), highestWord);
-}
-}
-}
-}
-
-return new IndexJournalStatistics(highestWord, docsBitmap.getIntCardinality());
-}
-
-@Override
-public void forEachWordId(IntConsumer consumer) {
+public void forEachWordId(LongConsumer consumer) {
 for (var entry : this) {
 var data = entry.readEntry();
 for (var post : data) {
@@ -116,19 +95,6 @@ public class IndexJournalReaderSingleCompressedFile implements IndexJournalReade
 }
 }
 
-@Override
-public void forEachDocIdWordId(LongIntConsumer consumer) {
-for (var entry : this) {
-var data = entry.readEntry();
-
-for (var post : data) {
-if (filter(entry, post)) {
-consumer.accept(entry.docId(), post.wordId());
-}
-}
-}
-}
-
 @Override
 public void forEachDocIdRecord(LongObjectConsumer<IndexJournalEntryData.Record> consumer) {
 for (var entry : this) {
@@ -0,0 +1,48 @@
+package nu.marginalia.index.journal.writer;
+
+import lombok.SneakyThrows;
+import nu.marginalia.index.journal.model.IndexJournalEntryData;
+import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
+import nu.marginallia.index.journal.IndexJournalFileNames;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.file.Path;
+
+public class IndexJournalWriterPagingImpl implements IndexJournalWriter {
+private final Path outputDir;
+private int fileNumber = 0;
+
+private final Logger logger = LoggerFactory.getLogger(getClass());
+private IndexJournalWriter currentWriter = null;
+private int inputsForFile = 0;
+
+public IndexJournalWriterPagingImpl(Path outputDir) throws IOException {
+this.outputDir = outputDir;
+switchToNextWriter();
+
+logger.info("Creating Journal Writer {}", outputDir);
+}
+
+private void switchToNextWriter() throws IOException {
+if (currentWriter != null)
+currentWriter.close();
+
+currentWriter = new IndexJournalWriterSingleFileImpl(IndexJournalFileNames.allocateName(outputDir, fileNumber++));
+}
+
+@Override
+@SneakyThrows
+public void put(IndexJournalEntryHeader header, IndexJournalEntryData entry) {
+if (++inputsForFile > 100_000) {
+inputsForFile = 0;
+switchToNextWriter();
+}
+currentWriter.put(header, entry);
+}
+
+public void close() throws IOException {
+currentWriter.close();
+}
+}
@@ -1,12 +1,11 @@
 package nu.marginalia.index.journal.writer;
 
 import com.github.luben.zstd.ZstdDirectBufferCompressingStream;
-import com.github.luben.zstd.ZstdOutputStream;
 import lombok.SneakyThrows;
 import nu.marginalia.index.journal.model.IndexJournalEntryData;
 import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
 import nu.marginalia.index.journal.reader.IndexJournalReader;
-import nu.marginalia.lexicon.KeywordLexicon;
+import nu.marginallia.index.journal.IndexJournalFileNames;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -16,27 +15,34 @@ import java.nio.channels.FileChannel;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
+import java.nio.file.attribute.PosixFilePermissions;
 
-public class IndexJournalWriterImpl implements IndexJournalWriter{
-private final KeywordLexicon lexicon;
+public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{
 
 private static final int ZSTD_BUFFER_SIZE = 8192;
 private static final int DATA_BUFFER_SIZE = 8192;
 
 private final ByteBuffer dataBuffer = ByteBuffer.allocateDirect(DATA_BUFFER_SIZE);
 
 
 private final ZstdDirectBufferCompressingStream compressingStream;
-private int numEntries = 0;
 private final FileChannel fileChannel;
 
-public IndexJournalWriterImpl(KeywordLexicon lexicon, Path outputFile) throws IOException {
-this.lexicon = lexicon;
+private int numEntries = 0;
+
+private final Logger logger = LoggerFactory.getLogger(getClass());
+
+public IndexJournalWriterSingleFileImpl(Path outputFile) throws IOException {
+
+logger.info("Creating Journal Writer {}", outputFile);
+
+Files.deleteIfExists(outputFile);
+Files.createFile(outputFile, PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
+
 fileChannel = FileChannel.open(outputFile, StandardOpenOption.CREATE,
 StandardOpenOption.WRITE, StandardOpenOption.TRUNCATE_EXISTING);
 
 writeHeaderPlaceholder(fileChannel);
 
 compressingStream = new ZstdDirectBufferCompressingStream(ByteBuffer.allocateDirect(ZSTD_BUFFER_SIZE), 3) {
 protected ByteBuffer flushBuffer(ByteBuffer toFlush) throws IOException {
 toFlush.flip();
@@ -64,7 +70,7 @@ public class IndexJournalWriterImpl implements IndexJournalWriter{
 
 @Override
 @SneakyThrows
-public synchronized void put(IndexJournalEntryHeader header, IndexJournalEntryData entry) {
+public void put(IndexJournalEntryHeader header, IndexJournalEntryData entry) {
 if (dataBuffer.capacity() - dataBuffer.position() < 3*8) {
 dataBuffer.flip();
 compressingStream.compress(dataBuffer);
@@ -84,6 +90,7 @@ public class IndexJournalWriterImpl implements IndexJournalWriter{
 dataBuffer.clear();
 }
 else while (remaining-- > 0 && i < entry.size()) {
+
 dataBuffer.putLong(entry.underlyingArray[i++]);
 }
 }
@@ -103,7 +110,7 @@ public class IndexJournalWriterImpl implements IndexJournalWriter{
 
 ByteBuffer header = ByteBuffer.allocate(16);
 header.putLong(numEntries);
-header.putLong(lexicon.size());
+header.putLong(0);
 header.flip();
 
 while (header.position() < header.limit()) {
@@ -1,9 +1,30 @@
 package nu.marginallia.index.journal;
 
+import java.io.IOException;
+import java.nio.file.Files;
 import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
 
 public class IndexJournalFileNames {
-public static Path resolve(Path base) {
-return base.resolve("page-index.dat");
+public static Path allocateName(Path base, int idx) {
+return base.resolve(String.format("page-index-%04d.dat", idx));
+}
+
+public static List<Path> findJournalFiles(Path baseDirectory) throws IOException {
+List<Path> ret = new ArrayList<>();
+
+try (var listStream = Files.list(baseDirectory)) {
+listStream
+.filter(IndexJournalFileNames::isJournalFile)
+.sorted()
+.forEach(ret::add);
+}
+
+return ret;
+}
+
+public static boolean isJournalFile(Path file) {
+return file.toFile().getName().matches("page-index-\\d{4}.dat");
 }
 }
@@ -4,14 +4,12 @@ import nu.marginalia.index.journal.model.IndexJournalEntry;
 import nu.marginalia.index.journal.model.IndexJournalEntryData;
 import nu.marginalia.index.journal.reader.IndexJournalReader;
 import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;
-import nu.marginalia.index.journal.writer.IndexJournalWriterImpl;
-import nu.marginalia.lexicon.KeywordLexicon;
+import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
 import nu.marginalia.model.id.UrlIdCodec;
 import org.apache.commons.lang3.tuple.Pair;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
-import org.mockito.Mockito;
 
 import java.io.IOException;
 import java.nio.file.Files;
@@ -23,7 +21,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
 
 public class IndexJournalTest {
 Path tempFile;
-KeywordLexicon lexicon;
 IndexJournalReader reader;
 
 long firstDocId = UrlIdCodec.encodeId(44, 10);
@@ -32,9 +29,8 @@ public class IndexJournalTest {
 @BeforeEach
 public void setUp() throws IOException {
 tempFile = Files.createTempFile(getClass().getSimpleName(), ".dat");
-lexicon = Mockito.mock(KeywordLexicon.class);
 
-var journalWriter = new IndexJournalWriterImpl(lexicon, tempFile);
+var journalWriter = new IndexJournalWriterSingleFileImpl( tempFile);
 journalWriter.put(IndexJournalEntry.builder(44, 10, 55)
 .add(1, 2)
 .add(2, 3)
@@ -82,22 +78,7 @@ public class IndexJournalTest {
 List<Integer> expected = List.of(1, 2, 3, 5, 5 ,6);
 List<Integer> actual = new ArrayList<>();
 
-reader.forEachWordId(actual::add);
+reader.forEachWordId(i -> actual.add((int) i));
-assertEquals(expected, actual);
-}
-
-@Test
-public void forEachDocIdWordId() {
-List<Pair<Long, Integer>> expected = List.of(
-Pair.of(firstDocId, 1),
-Pair.of(firstDocId, 2),
-Pair.of(firstDocId, 3),
-Pair.of(firstDocId, 5),
-Pair.of(secondDocId, 5),
-Pair.of(secondDocId, 6));
-List<Pair<Long, Integer>> actual = new ArrayList<>();
-
-reader.forEachDocIdWordId((url, word) -> actual.add(Pair.of(url, word)));
 assertEquals(expected, actual);
 }
 
@@ -9,16 +9,16 @@ import nu.marginalia.index.query.filter.QueryFilterStepIf;
 public interface IndexQueryBuilder {
 /** Filters documents that also contain termId, within the full index.
 */
-IndexQueryBuilder alsoFull(int termId);
+IndexQueryBuilder alsoFull(long termId);
 
 /**
 * Filters documents that also contain the termId, within the priority index.
 */
-IndexQueryBuilder alsoPrio(int termIds);
+IndexQueryBuilder alsoPrio(long termIds);
 
 /** Excludes documents that contain termId, within the full index
 */
-IndexQueryBuilder notFull(int termId);
+IndexQueryBuilder notFull(long termId);
 
 IndexQueryBuilder addInclusionFilter(QueryFilterStepIf filterStep);
 
@@ -21,7 +21,7 @@ public class QueryFilterLetThrough implements QueryFilterStepIf {
 }
 
 public String describe() {
-return "[NoPass]";
+return "[PassThrough]";
 }
 
 }
@@ -18,15 +18,15 @@ dependencies {
 implementation project(':code:features-index:domain-ranking')
 implementation project(':code:features-index:index-query')
 implementation project(':code:features-index:index-journal')
-implementation project(':code:features-index:lexicon')
 implementation project(':code:common:model')
 implementation project(':code:common:process')
 
 
 implementation libs.lombok
 annotationProcessor libs.lombok
 implementation libs.bundles.slf4j
 
-implementation libs.prometheus
+implementation libs.fastutil
+
 testImplementation libs.bundles.slf4j.test
 testImplementation libs.bundles.junit
@@ -1,4 +1,4 @@
-package nu.marginalia.index.full;
+package nu.marginalia.index;
 
 import nu.marginalia.array.buffer.LongQueryBuffer;
 import nu.marginalia.btree.BTreeReader;
@@ -6,18 +6,18 @@ import nu.marginalia.index.query.EntrySource;
 
 import static java.lang.Math.min;
 
-public class ReverseIndexFullEntrySource implements EntrySource {
+public class ReverseIndexEntrySource implements EntrySource {
 private final BTreeReader reader;
 
 int pos;
 int endOffset;
 
 final int entrySize;
-private final int wordId;
+private final long wordId;
 
-public ReverseIndexFullEntrySource(BTreeReader reader,
+public ReverseIndexEntrySource(BTreeReader reader,
 int entrySize,
-int wordId) {
+long wordId) {
 this.reader = reader;
 this.entrySize = entrySize;
 this.wordId = wordId;
@@ -1,4 +1,4 @@
-package nu.marginalia.index.full;
+package nu.marginalia.index;
 
 import java.nio.file.Path;
 
@@ -0,0 +1,10 @@
+package nu.marginalia.index;
+
+import nu.marginalia.btree.model.BTreeBlockSize;
+import nu.marginalia.btree.model.BTreeContext;
+
+public class ReverseIndexParameters
+{
+public static final BTreeContext docsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048);
+public static final BTreeContext wordsBTreeContext = new BTreeContext(5, 2, BTreeBlockSize.BS_2048);
+}
@@ -1,4 +1,4 @@
-package nu.marginalia.index.priority;
+package nu.marginalia.index;
 
 import java.nio.file.Path;
 
@@ -1,11 +1,11 @@
-package nu.marginalia.index.full;
+package nu.marginalia.index;
 
-import nu.marginalia.index.query.ReverseIndexRejectFilter;
-import nu.marginalia.index.query.ReverseIndexRetainFilter;
 import nu.marginalia.array.LongArray;
 import nu.marginalia.btree.BTreeReader;
 import nu.marginalia.index.query.EmptyEntrySource;
 import nu.marginalia.index.query.EntrySource;
+import nu.marginalia.index.query.ReverseIndexRejectFilter;
+import nu.marginalia.index.query.ReverseIndexRetainFilter;
 import nu.marginalia.index.query.filter.QueryFilterLetThrough;
 import nu.marginalia.index.query.filter.QueryFilterNoPass;
 import nu.marginalia.index.query.filter.QueryFilterStepIf;
@@ -15,18 +15,22 @@ import org.slf4j.LoggerFactory;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.Arrays;
 
-public class ReverseIndexFullReader {
+public class ReverseIndexReader {
 private final LongArray words;
 private final LongArray documents;
+private final long wordsDataOffset;
 private final Logger logger = LoggerFactory.getLogger(getClass());
+private final BTreeReader wordsBTreeReader;
 
-public ReverseIndexFullReader(Path words, Path documents) throws IOException {
+public ReverseIndexReader(Path words, Path documents) throws IOException {
 if (!Files.exists(words) || !Files.exists(documents)) {
 this.words = null;
 this.documents = null;
+this.wordsBTreeReader = null;
+this.wordsDataOffset = -1;
 return;
 }
 
@@ -34,62 +38,52 @@ public class ReverseIndexFullReader {
 
 this.words = LongArray.mmapRead(words);
 this.documents = LongArray.mmapRead(documents);
 
+wordsBTreeReader = new BTreeReader(this.words, ReverseIndexParameters.wordsBTreeContext, 0);
+wordsDataOffset = wordsBTreeReader.getHeader().dataOffsetLongs();
 }
 
-public boolean isWordInDoc(int wordId, long documentId) {
-if (wordId < 0) {
-return false;
-}
-
-long offset = words.get(wordId);
-
-if (offset < 0) {
-return false;
-}
-
-return createReaderNew(offset).findEntry(documentId) >= 0;
+private long wordOffset(long wordId) {
+long idx = wordsBTreeReader.findEntry(wordId);
+
+if (idx < 0)
+return -1L;
+
+return words.get(wordsDataOffset + idx + 1);
 }
 
-public EntrySource documents(int wordId) {
+public EntrySource documents(long wordId) {
 if (null == words) {
 logger.warn("Reverse index is not ready, dropping query");
 return new EmptyEntrySource();
 }
 
-if (wordId < 0 || wordId >= words.size()) return new EmptyEntrySource();
-
-long offset = words.get(wordId);
+long offset = wordOffset(wordId);
 
 if (offset < 0) return new EmptyEntrySource();
 
-return new ReverseIndexFullEntrySource(createReaderNew(offset), ReverseIndexFullParameters.ENTRY_SIZE, wordId);
+return new ReverseIndexEntrySource(createReaderNew(offset), 2, wordId);
 }
 
-public QueryFilterStepIf also(int wordId) {
-if (wordId < 0) return new QueryFilterNoPass();
-
-long offset = words.get(wordId);
+public QueryFilterStepIf also(long wordId) {
+long offset = wordOffset(wordId);
 
 if (offset < 0) return new QueryFilterNoPass();
 
 return new ReverseIndexRetainFilter(createReaderNew(offset), "full", wordId);
 }
 
-public QueryFilterStepIf not(int wordId) {
-if (wordId < 0) return new QueryFilterLetThrough();
-
-long offset = words.get(wordId);
+public QueryFilterStepIf not(long wordId) {
+long offset = wordOffset(wordId);
 
 if (offset < 0) return new QueryFilterLetThrough();
 
 return new ReverseIndexRejectFilter(createReaderNew(offset));
 }
 
-public int numDocuments(int wordId) {
-if (wordId < 0)
-return 0;
-
-long offset = words.get(wordId);
+public int numDocuments(long wordId) {
+long offset = wordOffset(wordId);
 
 if (offset < 0)
 return 0;
@@ -98,15 +92,12 @@ public class ReverseIndexFullReader {
 }
 
 private BTreeReader createReaderNew(long offset) {
-return new BTreeReader(documents, ReverseIndexFullParameters.bTreeContext, offset);
+return new BTreeReader(documents, ReverseIndexParameters.docsBTreeContext, offset);
 }
 
-public long[] getTermMeta(int wordId, long[] docIds) {
-if (wordId < 0) {
-return new long[docIds.length];
-}
-
-long offset = words.get(wordId);
+public long[] getTermMeta(long wordId, long[] docIds) {
+long offset = wordOffset(wordId);
 if (offset < 0) {
 return new long[docIds.length];
 }
@@ -0,0 +1,9 @@
+package nu.marginalia.index.construction;
+
+public interface DocIdRewriter {
+long rewriteDocId(long docId);
+
+static DocIdRewriter identity() {
+return l -> l;
+}
+}
@@ -0,0 +1,10 @@
+package nu.marginalia.index.construction;
+
+import nu.marginalia.index.journal.reader.IndexJournalReader;
+
+import java.io.IOException;
+import java.nio.file.Path;
+
+public interface JournalReaderSource {
+IndexJournalReader construct(Path sourceFile) throws IOException;
+}
@@ -4,7 +4,6 @@ import nu.marginalia.array.LongArray;
 import nu.marginalia.array.functional.LongIOTransformer;
 import nu.marginalia.btree.BTreeWriter;
 import nu.marginalia.btree.model.BTreeContext;
-import nu.marginalia.index.priority.ReverseIndexPriorityParameters;
 
 import java.io.IOException;
 import java.nio.channels.FileChannel;
@@ -0,0 +1,81 @@
+package nu.marginalia.index.construction;
+
+import nu.marginalia.index.journal.reader.IndexJournalReader;
+import nu.marginallia.index.journal.IndexJournalFileNames;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.List;
+
+public class ReverseIndexConstructor {
+
+private static final Logger logger = LoggerFactory.getLogger(ReverseIndexConstructor.class);
+
+public static void createReverseIndex(
+JournalReaderSource readerSource,
+Path sourceBaseDir,
+Path tmpDir,
+Path outputFileDocs,
+Path outputFileWords) throws IOException
+{
+var inputs = IndexJournalFileNames.findJournalFiles(sourceBaseDir);
+if (inputs.isEmpty()) {
+logger.error("No journal files in base dir {}", sourceBaseDir);
+return;
+}
+
+List<ReversePreindex> preindexes = new ArrayList<>();
+
+for (var input : inputs) {
+logger.info("Construcing preindex from {}", input);
+var preindex = ReversePreindex.constructPreindex(readerSource.construct(input),
+tmpDir, tmpDir);
+preindexes.add(preindex);
+}
+
+logger.info("Merging");
+var finalPreindex = mergePreindexes(tmpDir, preindexes);
+logger.info("Finalizing");
+finalPreindex.finalizeIndex(outputFileDocs, outputFileWords);
+logger.info("Done");
+finalPreindex.delete();
+}
+
+private static ReversePreindex mergePreindexes(Path workDir, List<ReversePreindex> preindexes) throws IOException {
+assert !preindexes.isEmpty();
+
+if (preindexes.size() == 1) {
+logger.info("Single preindex, no merge necessary");
+return preindexes.get(0);
+}
+
+List<ReversePreindex> toMerge = new ArrayList<>(preindexes);
+List<ReversePreindex> merged = new ArrayList<>();
+
+while (toMerge.size() != 1) {
+for (int i = 0; i < toMerge.size(); i+=2) {
+var left = toMerge.get(i);
+var right = toMerge.get(i+1);
+
+merged.add(ReversePreindex.merge(workDir, left, right));
+
+left.delete();
+right.delete();
+}
+
+if ((toMerge.size() % 2) != 0) {
+merged.add(toMerge.get(toMerge.size()-1));
+}
+
+toMerge.clear();
+toMerge.addAll(merged);
+merged.clear();
+}
+
+return toMerge.get(0);
+}
+
+}
@@ -0,0 +1,256 @@
+package nu.marginalia.index.construction;
+
+import nu.marginalia.array.LongArray;
+import nu.marginalia.array.algo.SortingContext;
+import nu.marginalia.btree.BTreeWriter;
+import nu.marginalia.index.ReverseIndexParameters;
+import nu.marginalia.index.journal.reader.IndexJournalReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.channels.FileChannel;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+
+import static nu.marginalia.array.algo.TwoArrayOperations.*;
+
+public class ReversePreindex {
+public final ReversePreindexWordSegments segments;
+public final ReversePreindexDocuments documents;
+
+private static final Logger logger = LoggerFactory.getLogger(ReversePreindex.class);
+
+public ReversePreindex(ReversePreindexWordSegments segments, ReversePreindexDocuments documents) {
+this.segments = segments;
+this.documents = documents;
+}
+
+public void finalizeIndex(Path outputFileDocs, Path outputFileWords) throws IOException {
+var offsets = segments.counts;
+
+Files.deleteIfExists(outputFileDocs);
+Files.deleteIfExists(outputFileWords);
+
+// Estimate the size of the docs index data
+offsets.transformEach(0, offsets.size(), new CountToOffsetTransformer(2));
+IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(ReverseIndexParameters.docsBTreeContext, 2);
+offsets.fold(0, 0, offsets.size(), sizeEstimator);
+
+System.out.println("size estimate = " + sizeEstimator.size);
+// Write the docs file
+LongArray finalDocs = LongArray.mmapForWriting(outputFileDocs, sizeEstimator.size);
+try (var intermediateDocChannel = documents.createDocumentsFileChannel()) {
+offsets.transformEachIO(0, offsets.size(), new ReverseIndexBTreeTransformer(finalDocs, 2, ReverseIndexParameters.docsBTreeContext, intermediateDocChannel));
+intermediateDocChannel.force(false);
+}
+
+LongArray wordIds = segments.wordIds;
+
+// Estimate the size of the words index data
+long wordsSize = ReverseIndexParameters.wordsBTreeContext.calculateSize((int) offsets.size());
+
+// Construct the tree
+LongArray wordsArray = LongArray.mmapForWriting(outputFileWords, wordsSize);
+
+new BTreeWriter(wordsArray, ReverseIndexParameters.wordsBTreeContext)
+.write(0, (int) offsets.size(), mapRegion -> {
+for (long i = 0; i < offsets.size(); i++) {
+mapRegion.set(2*i, wordIds.get(i));
+mapRegion.set(2*i + 1, offsets.get(i));
+}
+});
+
+wordsArray.force();
+
+}
+
+/** Delete all files associated with this pre-index */
+public void delete() throws IOException {
+segments.delete();
+documents.delete();
+}
+public static ReversePreindex constructPreindex(IndexJournalReader reader,
+Path tempDir,
+Path destDir) throws IOException
+{
+Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat");
+Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat");
+Path docsFile = Files.createTempFile(destDir, "docs", ".dat");
+
+SortingContext ctx = new SortingContext(tempDir, 1<<31);
+logger.info("Segmenting");
+var segments = ReversePreindexWordSegments.construct(reader, ctx, segmentWordsFile, segmentCountsFile);
+logger.info("Mapping docs");
+var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), ctx, segments);
+logger.info("Done");
+return new ReversePreindex(segments, docs);
+}
+
+/** Create a segment word file with each word from both inputs, with zero counts for all the data.
+* This is an intermediate product in merging.
+*/
+static ReversePreindexWordSegments createMergedSegmentWordFile(Path destDir,
+ReversePreindexWordSegments left,
+ReversePreindexWordSegments right) throws IOException {
+Path segmentWordsFile = Files.createTempFile(destDir, "segment_words", ".dat");
+Path segmentCountsFile = Files.createTempFile(destDir, "segment_counts", ".dat");
+
+long segmentsSize = countDistinctElements(left.wordIds, right.wordIds,
+0, left.wordIds.size(),
+0, right.wordIds.size());
+
+LongArray wordIdsFile = LongArray.mmapForWriting(segmentWordsFile, segmentsSize);
+
+mergeArrays(wordIdsFile, left.wordIds, right.wordIds,
+0, wordIdsFile.size(),
+0, left.wordIds.size(),
+0, right.wordIds.size());
+
+LongArray counts = LongArray.mmapForWriting(segmentCountsFile, 8*segmentsSize);
+
+return new ReversePreindexWordSegments(wordIdsFile, counts, segmentWordsFile, segmentCountsFile);
+}
+public static ReversePreindex merge(Path destDir,
+ReversePreindex left,
+ReversePreindex right) throws IOException {
+
+ReversePreindexWordSegments mergingSegment = createMergedSegmentWordFile(destDir,
+left.segments,
+right.segments);
+
+var mergingIter = mergingSegment.constructionIterator(2);
|
||||||
|
var leftIter = left.segments.iterator(2);
|
||||||
|
var rightIter = right.segments.iterator(2);
|
||||||
|
|
||||||
|
Path docsFile = Files.createTempFile(destDir, "docs", ".dat");
|
||||||
|
|
||||||
|
LongArray mergedDocuments = LongArray.mmapForWriting(docsFile, 8 * (left.documents.size() + right.documents.size()));
|
||||||
|
|
||||||
|
leftIter.next();
|
||||||
|
rightIter.next();
|
||||||
|
|
||||||
|
FileChannel leftChannel = left.documents.createDocumentsFileChannel();
|
||||||
|
FileChannel rightChannel = right.documents.createDocumentsFileChannel();
|
||||||
|
|
||||||
|
while (mergingIter.canPutMore()
|
||||||
|
&& leftIter.isPositionBeforeEnd()
|
||||||
|
&& rightIter.isPositionBeforeEnd())
|
||||||
|
{
|
||||||
|
if (leftIter.wordId == mergingIter.wordId
|
||||||
|
&& rightIter.wordId == mergingIter.wordId) {
|
||||||
|
mergeSegments(leftIter,
|
||||||
|
rightIter,
|
||||||
|
left.documents,
|
||||||
|
right.documents,
|
||||||
|
mergedDocuments,
|
||||||
|
mergingIter);
|
||||||
|
}
|
||||||
|
else if (leftIter.wordId == mergingIter.wordId) {
|
||||||
|
if (!copySegment(leftIter, mergedDocuments, leftChannel, mergingIter))
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else if (rightIter.wordId == mergingIter.wordId) {
|
||||||
|
if (!copySegment(rightIter, mergedDocuments, rightChannel, mergingIter))
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
assert false : "This should never happen";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (leftIter.isPositionBeforeEnd()) {
|
||||||
|
while (copySegment(leftIter, mergedDocuments, leftChannel, mergingIter));
|
||||||
|
|
||||||
|
}
|
||||||
|
if (rightIter.isPositionBeforeEnd()) {
|
||||||
|
while (copySegment(rightIter, mergedDocuments, rightChannel, mergingIter));
|
||||||
|
}
|
||||||
|
|
||||||
|
assert !leftIter.isPositionBeforeEnd() : "Left has more to go";
|
||||||
|
assert !rightIter.isPositionBeforeEnd() : "Right has more to go";
|
||||||
|
assert !mergingIter.canPutMore() : "Source iters ran dry before merging iter";
|
||||||
|
|
||||||
|
// We may have overestimated the size of the merged docs size in the case there were
|
||||||
|
// duplicates in the data, so we need to shrink it to the actual size we wrote.
|
||||||
|
|
||||||
|
mergedDocuments = shrinkMergedDocuments(mergedDocuments, docsFile, 2 * mergingSegment.totalSize());
|
||||||
|
|
||||||
|
return new ReversePreindex(
|
||||||
|
mergingSegment,
|
||||||
|
new ReversePreindexDocuments(mergedDocuments, docsFile)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static LongArray shrinkMergedDocuments(LongArray mergedDocuments, Path docsFile, long sizeLongs) throws IOException {
|
||||||
|
|
||||||
|
mergedDocuments.force();
|
||||||
|
|
||||||
|
long beforeSize = mergedDocuments.size();
|
||||||
|
try (var bc = Files.newByteChannel(docsFile, StandardOpenOption.WRITE)) {
|
||||||
|
bc.truncate(sizeLongs * 8);
|
||||||
|
}
|
||||||
|
long afterSize = mergedDocuments.size();
|
||||||
|
mergedDocuments = LongArray.mmapForWriting(docsFile, sizeLongs);
|
||||||
|
|
||||||
|
if (beforeSize != afterSize) {
|
||||||
|
logger.info("Shrunk {} from {}b to {}b", docsFile, beforeSize, afterSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
return mergedDocuments;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void mergeSegments(ReversePreindexWordSegments.SegmentIterator leftIter,
|
||||||
|
ReversePreindexWordSegments.SegmentIterator rightIter,
|
||||||
|
ReversePreindexDocuments left,
|
||||||
|
ReversePreindexDocuments right,
|
||||||
|
LongArray documentsFile,
|
||||||
|
ReversePreindexWordSegments.SegmentConstructionIterator mergingIter)
|
||||||
|
{
|
||||||
|
long distinct = countDistinctElementsN(2,
|
||||||
|
left.documents,
|
||||||
|
right.documents,
|
||||||
|
leftIter.startOffset, leftIter.endOffset,
|
||||||
|
rightIter.startOffset, rightIter.endOffset);
|
||||||
|
|
||||||
|
mergeArrays2(documentsFile,
|
||||||
|
left.documents,
|
||||||
|
right.documents,
|
||||||
|
mergingIter.startOffset,
|
||||||
|
mergingIter.startOffset + 2*distinct,
|
||||||
|
leftIter.startOffset, leftIter.endOffset,
|
||||||
|
rightIter.startOffset, rightIter.endOffset);
|
||||||
|
|
||||||
|
mergingIter.putNext(distinct);
|
||||||
|
leftIter.next();
|
||||||
|
rightIter.next();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean copySegment(ReversePreindexWordSegments.SegmentIterator sourceIter,
|
||||||
|
LongArray documentsFile,
|
||||||
|
FileChannel leftChannel,
|
||||||
|
ReversePreindexWordSegments.SegmentConstructionIterator mergingIter) throws IOException {
|
||||||
|
|
||||||
|
long size = sourceIter.endOffset - sourceIter.startOffset;
|
||||||
|
long start = mergingIter.startOffset;
|
||||||
|
long end = start + size;
|
||||||
|
|
||||||
|
documentsFile.transferFrom(leftChannel,
|
||||||
|
sourceIter.startOffset,
|
||||||
|
mergingIter.startOffset,
|
||||||
|
end);
|
||||||
|
|
||||||
|
boolean putNext = mergingIter.putNext(size / 2);
|
||||||
|
boolean iterNext = sourceIter.next();
|
||||||
|
|
||||||
|
if (!putNext) {
|
||||||
|
assert !iterNext: "Source iterator ran out before dest iterator?!";
|
||||||
|
}
|
||||||
|
|
||||||
|
return iterNext;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
@ -0,0 +1,120 @@
|
|||||||
|
package nu.marginalia.index.construction;
|
||||||
|
|
||||||
|
import lombok.SneakyThrows;
|
||||||
|
import nu.marginalia.array.LongArray;
|
||||||
|
import nu.marginalia.array.algo.SortingContext;
|
||||||
|
import nu.marginalia.index.journal.reader.IndexJournalReader;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.channels.FileChannel;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.StandardOpenOption;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
|
/** A LongArray with document data, segmented according to
|
||||||
|
* the associated ReversePReindexWordSegments data
|
||||||
|
*/
|
||||||
|
public class ReversePreindexDocuments {
|
||||||
|
private final Path file;
|
||||||
|
public final LongArray documents;
|
||||||
|
private static final int RECORD_SIZE_LONGS = 2;
|
||||||
|
private static final Logger logger= LoggerFactory.getLogger(ReversePreindexDocuments.class);
|
||||||
|
|
||||||
|
public ReversePreindexDocuments(LongArray documents, Path file) {
|
||||||
|
this.documents = documents;
|
||||||
|
this.file = file;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static ReversePreindexDocuments construct(
|
||||||
|
Path docsFile,
|
||||||
|
IndexJournalReader reader,
|
||||||
|
DocIdRewriter docIdRewriter,
|
||||||
|
SortingContext sortingContext,
|
||||||
|
ReversePreindexWordSegments segments) throws IOException {
|
||||||
|
|
||||||
|
|
||||||
|
logger.info("Transfering data");
|
||||||
|
createUnsortedDocsFile(docsFile, reader, segments, docIdRewriter);
|
||||||
|
|
||||||
|
LongArray docsFileMap = LongArray.mmapForWriting(docsFile, 8 * Files.size(docsFile));
|
||||||
|
logger.info("Sorting data");
|
||||||
|
sortDocsFile(docsFileMap, segments, sortingContext);
|
||||||
|
|
||||||
|
return new ReversePreindexDocuments(docsFileMap, docsFile);
|
||||||
|
}
|
||||||
|
|
||||||
|
public FileChannel createDocumentsFileChannel() throws IOException {
|
||||||
|
return (FileChannel) Files.newByteChannel(file, StandardOpenOption.READ);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public LongArray slice(long start, long end) {
|
||||||
|
return documents.range(start, end);
|
||||||
|
}
|
||||||
|
|
||||||
|
public long size() {
|
||||||
|
return documents.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void createUnsortedDocsFile(Path docsFile,
|
||||||
|
IndexJournalReader reader,
|
||||||
|
ReversePreindexWordSegments segments,
|
||||||
|
DocIdRewriter docIdRewriter) throws IOException {
|
||||||
|
long fileSize = 8 * segments.totalSize();
|
||||||
|
LongArray outArray = LongArray.mmapForWriting(docsFile, fileSize);
|
||||||
|
|
||||||
|
var offsetMap = segments.asMap(RECORD_SIZE_LONGS);
|
||||||
|
offsetMap.defaultReturnValue(0);
|
||||||
|
|
||||||
|
reader.forEachDocIdRecord((docId, rec) -> {
|
||||||
|
long wordId = rec.wordId();
|
||||||
|
long meta = rec.metadata();
|
||||||
|
|
||||||
|
long rankEncodedId = docIdRewriter.rewriteDocId(docId);
|
||||||
|
|
||||||
|
long offset = offsetMap.addTo(wordId, RECORD_SIZE_LONGS);
|
||||||
|
outArray.set(offset + 0, rankEncodedId);
|
||||||
|
outArray.set(offset + 1, meta);
|
||||||
|
});
|
||||||
|
|
||||||
|
outArray.force();
|
||||||
|
}
|
||||||
|
|
||||||
|
@SneakyThrows
|
||||||
|
private static void sortDocsFile(LongArray docsFileMap, ReversePreindexWordSegments segments, SortingContext sortingContext) throws IOException {
|
||||||
|
|
||||||
|
var iter = segments.iterator(RECORD_SIZE_LONGS);
|
||||||
|
|
||||||
|
ExecutorService sortingWorkers = Executors.newWorkStealingPool(Runtime.getRuntime().availableProcessors());
|
||||||
|
|
||||||
|
while (iter.next()) {
|
||||||
|
if (iter.size() < 1024) {
|
||||||
|
docsFileMap.quickSortN(RECORD_SIZE_LONGS,
|
||||||
|
iter.startOffset,
|
||||||
|
iter.endOffset);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
sortingWorkers.execute(() ->
|
||||||
|
docsFileMap.quickSortN(RECORD_SIZE_LONGS,
|
||||||
|
iter.startOffset,
|
||||||
|
iter.endOffset));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sortingWorkers.shutdown();
|
||||||
|
logger.info("Awaiting shutdown");
|
||||||
|
|
||||||
|
while (!sortingWorkers.awaitTermination(1, TimeUnit.HOURS));
|
||||||
|
|
||||||
|
sortingWorkers.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
public void delete() throws IOException {
|
||||||
|
Files.delete(this.file);
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,191 @@
|
|||||||
|
package nu.marginalia.index.construction;
|
||||||
|
|
||||||
|
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
|
||||||
|
import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
|
||||||
|
import it.unimi.dsi.fastutil.longs.LongIterator;
|
||||||
|
import nu.marginalia.array.LongArray;
|
||||||
|
import nu.marginalia.array.algo.SortingContext;
|
||||||
|
import nu.marginalia.index.journal.reader.IndexJournalReader;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.channels.FileChannel;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.StandardOpenOption;
|
||||||
|
|
||||||
|
/** A pair of file-backed arrays of sorted wordIds
|
||||||
|
* and the count of documents associated with each wordId.
|
||||||
|
*/
|
||||||
|
public class ReversePreindexWordSegments {
|
||||||
|
public final LongArray wordIds;
|
||||||
|
public final LongArray counts;
|
||||||
|
|
||||||
|
private final Path wordsFile;
|
||||||
|
private final Path countsFile;
|
||||||
|
|
||||||
|
public ReversePreindexWordSegments(LongArray wordIds,
|
||||||
|
LongArray counts,
|
||||||
|
Path wordsFile,
|
||||||
|
Path countsFile)
|
||||||
|
{
|
||||||
|
this.wordIds = wordIds;
|
||||||
|
this.counts = counts;
|
||||||
|
this.wordsFile = wordsFile;
|
||||||
|
this.countsFile = countsFile;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns a long-long hash map where each key is a wordId,
|
||||||
|
* and each value is the start offset of the data.
|
||||||
|
*/
|
||||||
|
public Long2LongOpenHashMap asMap(int recordSize) {
|
||||||
|
Long2LongOpenHashMap ret = new Long2LongOpenHashMap((int) wordIds.size(), 0.75f);
|
||||||
|
var iter = iterator(recordSize);
|
||||||
|
|
||||||
|
while (iter.next()) {
|
||||||
|
ret.put(iter.wordId, iter.startOffset);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static ReversePreindexWordSegments construct(IndexJournalReader reader,
|
||||||
|
SortingContext ctx,
|
||||||
|
Path wordIdsFile,
|
||||||
|
Path countsFile)
|
||||||
|
throws IOException
|
||||||
|
{
|
||||||
|
Long2IntOpenHashMap countsMap = new Long2IntOpenHashMap(100_000, 0.75f);
|
||||||
|
countsMap.defaultReturnValue(0);
|
||||||
|
reader.forEachWordId(wordId -> countsMap.addTo(wordId, 1));
|
||||||
|
|
||||||
|
LongArray words = LongArray.mmapForWriting(wordIdsFile, countsMap.size());
|
||||||
|
LongArray counts = LongArray.mmapForWriting(countsFile, countsMap.size());
|
||||||
|
|
||||||
|
// Create the words file by iterating over the map and inserting them into
|
||||||
|
// the words file in whatever bizarro hash table order they appear in
|
||||||
|
int i = 0;
|
||||||
|
LongIterator iter = countsMap.keySet().iterator();
|
||||||
|
while (iter.hasNext()) {
|
||||||
|
words.set(i, iter.nextLong());
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Sort the words file
|
||||||
|
words.sortLargeSpan(ctx, 0, counts.size());
|
||||||
|
|
||||||
|
// Populate the counts
|
||||||
|
for (i = 0; i < countsMap.size(); i++) {
|
||||||
|
counts.set(i, countsMap.get(words.get(i)));
|
||||||
|
}
|
||||||
|
|
||||||
|
return new ReversePreindexWordSegments(words, counts, wordIdsFile, countsFile);
|
||||||
|
}
|
||||||
|
|
||||||
|
public SegmentIterator iterator(int recordSize) {
|
||||||
|
return new SegmentIterator(recordSize);
|
||||||
|
}
|
||||||
|
public SegmentConstructionIterator constructionIterator(int recordSize) {
|
||||||
|
return new SegmentConstructionIterator(recordSize);
|
||||||
|
}
|
||||||
|
|
||||||
|
public long totalSize() {
|
||||||
|
return counts.fold(0, 0, counts.size(), Long::sum);
|
||||||
|
}
|
||||||
|
|
||||||
|
public void delete() throws IOException {
|
||||||
|
Files.delete(countsFile);
|
||||||
|
Files.delete(wordsFile);
|
||||||
|
}
|
||||||
|
|
||||||
|
public class SegmentIterator {
|
||||||
|
private final int recordSize;
|
||||||
|
private final long fileSize;
|
||||||
|
long wordId;
|
||||||
|
long startOffset = 0;
|
||||||
|
long endOffset = 0;
|
||||||
|
|
||||||
|
private SegmentIterator(int recordSize) {
|
||||||
|
this.recordSize = recordSize;
|
||||||
|
this.fileSize = wordIds.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
private int i = -1;
|
||||||
|
public int idx() {
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
public boolean next() {
|
||||||
|
if (++i >= fileSize) {
|
||||||
|
wordId = Long.MIN_VALUE;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
wordId = wordIds.get(i);
|
||||||
|
startOffset = endOffset;
|
||||||
|
endOffset = startOffset + recordSize * counts.get(i);
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean hasMorePositions() {
|
||||||
|
return i + 1 < wordIds.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isPositionBeforeEnd() {
|
||||||
|
return i < wordIds.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
public long size() {
|
||||||
|
return endOffset - startOffset;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
class SegmentConstructionIterator {
|
||||||
|
private final int recordSize;
|
||||||
|
private final long fileSize;
|
||||||
|
long wordId;
|
||||||
|
long startOffset = 0;
|
||||||
|
long endOffset = 0;
|
||||||
|
|
||||||
|
private SegmentConstructionIterator(int recordSize) {
|
||||||
|
this.recordSize = recordSize;
|
||||||
|
this.fileSize = wordIds.size();
|
||||||
|
if (fileSize == 0) {
|
||||||
|
throw new IllegalArgumentException("Cannot construct zero-length word segment file");
|
||||||
|
}
|
||||||
|
this.wordId = wordIds.get(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
private int i = 0;
|
||||||
|
public int idx() {
|
||||||
|
return i;
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean putNext(long size) {
|
||||||
|
|
||||||
|
if (i >= fileSize)
|
||||||
|
return false;
|
||||||
|
|
||||||
|
endOffset = startOffset + recordSize * size;
|
||||||
|
counts.set(i, size);
|
||||||
|
startOffset = endOffset;
|
||||||
|
endOffset = -1;
|
||||||
|
|
||||||
|
i++;
|
||||||
|
|
||||||
|
if (i == fileSize) {
|
||||||
|
// We've reached the end of the iteration and there is no
|
||||||
|
// "next" wordId to fetch
|
||||||
|
wordId = Long.MIN_VALUE;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
wordId = wordIds.get(i);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean canPutMore() {
|
||||||
|
return i < wordIds.size();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -1,206 +0,0 @@
|
|||||||
package nu.marginalia.index.full;
|
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
|
||||||
import nu.marginalia.index.construction.CountToOffsetTransformer;
|
|
||||||
import nu.marginalia.index.construction.ReverseIndexBTreeTransformer;
|
|
||||||
import nu.marginalia.index.construction.IndexSizeEstimator;
|
|
||||||
import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
|
||||||
import nu.marginalia.index.journal.model.IndexJournalStatistics;
|
|
||||||
import nu.marginalia.index.journal.reader.IndexJournalReader;
|
|
||||||
import nu.marginalia.model.id.UrlIdCodec;
|
|
||||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
|
||||||
import nu.marginalia.ranking.DomainRankings;
|
|
||||||
import nu.marginalia.rwf.RandomWriteFunnel;
|
|
||||||
import nu.marginalia.array.IntArray;
|
|
||||||
import nu.marginalia.array.LongArray;
|
|
||||||
import nu.marginalia.array.algo.SortingContext;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.channels.FileChannel;
|
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
import java.nio.file.StandardOpenOption;
|
|
||||||
|
|
||||||
import static nu.marginalia.index.full.ReverseIndexFullParameters.bTreeContext;
|
|
||||||
|
|
||||||
public class ReverseIndexFullConverter {
|
|
||||||
private static final int RWF_BIN_SIZE = 10_000_000;
|
|
||||||
|
|
||||||
private final ProcessHeartbeat heartbeat;
|
|
||||||
private final Path tmpFileDir;
|
|
||||||
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
|
|
||||||
private final IndexJournalReader journalReader;
|
|
||||||
private final DomainRankings domainRankings;
|
|
||||||
private final Path outputFileWords;
|
|
||||||
private final Path outputFileDocs;
|
|
||||||
private final SortingContext sortingContext;
|
|
||||||
|
|
||||||
public ReverseIndexFullConverter(ProcessHeartbeat heartbeat,
|
|
||||||
Path tmpFileDir,
|
|
||||||
IndexJournalReader journalReader,
|
|
||||||
DomainRankings domainRankings,
|
|
||||||
Path outputFileWords,
|
|
||||||
Path outputFileDocs) {
|
|
||||||
this.heartbeat = heartbeat;
|
|
||||||
this.tmpFileDir = tmpFileDir;
|
|
||||||
this.journalReader = journalReader;
|
|
||||||
this.domainRankings = domainRankings;
|
|
||||||
this.outputFileWords = outputFileWords;
|
|
||||||
this.outputFileDocs = outputFileDocs;
|
|
||||||
this.sortingContext = new SortingContext(tmpFileDir, 64_000);
|
|
||||||
}
|
|
||||||
|
|
||||||
public enum TaskSteps {
|
|
||||||
ACCUMULATE_STATISTICS,
|
|
||||||
INCREMENT_OFFSETS,
|
|
||||||
COUNT_OFFSETS,
|
|
||||||
CREATE_INTERMEDIATE_DOCS,
|
|
||||||
SORT_INTERMEDIATE_DOCS,
|
|
||||||
SIZING,
|
|
||||||
FINALIZING_DOCS,
|
|
||||||
FORCE,
|
|
||||||
FINISHED,
|
|
||||||
}
|
|
||||||
|
|
||||||
public void convert() throws IOException {
|
|
||||||
deleteOldFiles();
|
|
||||||
|
|
||||||
if (journalReader.fileHeader().fileSize() <= IndexJournalReader.FILE_HEADER_SIZE_BYTES) {
|
|
||||||
logger.warn("Bailing: Journal is empty!");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
final Path intermediateUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat");
|
|
||||||
|
|
||||||
try (var progress = heartbeat.createProcessTaskHeartbeat(TaskSteps.class, "reverseIndexFullConverter")) {
|
|
||||||
progress.progress(TaskSteps.ACCUMULATE_STATISTICS);
|
|
||||||
|
|
||||||
final IndexJournalStatistics statistics = journalReader.getStatistics();
|
|
||||||
final long wordsFileSize = statistics.highestWord() + 1;
|
|
||||||
|
|
||||||
progress.progress(TaskSteps.INCREMENT_OFFSETS);
|
|
||||||
|
|
||||||
logger.debug("Words file size: {}", wordsFileSize);
|
|
||||||
// Create a count of how many documents has contains each word
|
|
||||||
final LongArray wordsOffsets = LongArray.allocate(wordsFileSize);
|
|
||||||
|
|
||||||
journalReader.forEachWordId(wordsOffsets::increment);
|
|
||||||
progress.progress(TaskSteps.COUNT_OFFSETS);
|
|
||||||
|
|
||||||
wordsOffsets.transformEach(0, wordsFileSize, new CountToOffsetTransformer(ReverseIndexFullParameters.ENTRY_SIZE));
|
|
||||||
|
|
||||||
progress.progress(TaskSteps.CREATE_INTERMEDIATE_DOCS);
|
|
||||||
|
|
||||||
// Construct an intermediate representation of the reverse documents index
|
|
||||||
try (FileChannel intermediateDocChannel =
|
|
||||||
(FileChannel) Files.newByteChannel(intermediateUrlsFile,
|
|
||||||
StandardOpenOption.CREATE, StandardOpenOption.READ, StandardOpenOption.WRITE))
|
|
||||||
{
|
|
||||||
|
|
||||||
// Construct intermediate index
|
|
||||||
try (RandomWriteFunnel intermediateDocumentWriteFunnel = new RandomWriteFunnel(tmpFileDir, RWF_BIN_SIZE);
|
|
||||||
IntermediateIndexConstructor intermediateIndexConstructor = new IntermediateIndexConstructor(tmpFileDir, wordsOffsets, intermediateDocumentWriteFunnel)
|
|
||||||
)
|
|
||||||
{
|
|
||||||
journalReader.forEachDocIdRecord(intermediateIndexConstructor);
|
|
||||||
intermediateDocumentWriteFunnel.write(intermediateDocChannel);
|
|
||||||
}
|
|
||||||
intermediateDocChannel.force(false);
|
|
||||||
progress.progress(TaskSteps.SORT_INTERMEDIATE_DOCS);
|
|
||||||
|
|
||||||
// Sort each segment of the intermediate file
|
|
||||||
{
|
|
||||||
LongArray intermediateDocs = LongArray.mmapForModifying(intermediateUrlsFile);
|
|
||||||
wordsOffsets.foldIO(0, 0, wordsFileSize, (s, e) -> {
|
|
||||||
intermediateDocs.sortLargeSpanN(sortingContext, ReverseIndexFullParameters.ENTRY_SIZE, s, e);
|
|
||||||
return e;
|
|
||||||
});
|
|
||||||
intermediateDocs.force();
|
|
||||||
}
|
|
||||||
|
|
||||||
progress.progress(TaskSteps.SIZING);
|
|
||||||
|
|
||||||
IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(
|
|
||||||
ReverseIndexFullParameters.bTreeContext,
|
|
||||||
ReverseIndexFullParameters.ENTRY_SIZE);
|
|
||||||
|
|
||||||
wordsOffsets.fold(0, 0, wordsOffsets.size(), sizeEstimator);
|
|
||||||
progress.progress(TaskSteps.FINALIZING_DOCS);
|
|
||||||
|
|
||||||
LongArray finalDocs = LongArray.mmapForWriting(outputFileDocs, sizeEstimator.size);
|
|
||||||
// Construct the proper reverse index
|
|
||||||
wordsOffsets.transformEachIO(0, wordsOffsets.size(), new ReverseIndexBTreeTransformer(finalDocs, ReverseIndexFullParameters.ENTRY_SIZE, bTreeContext, intermediateDocChannel));
|
|
||||||
wordsOffsets.write(outputFileWords);
|
|
||||||
|
|
||||||
progress.progress(TaskSteps.FORCE);
|
|
||||||
|
|
||||||
// Attempt to clean up before forcing (important disk space preservation)
|
|
||||||
Files.deleteIfExists(intermediateUrlsFile);
|
|
||||||
|
|
||||||
wordsOffsets.force();
|
|
||||||
finalDocs.force();
|
|
||||||
|
|
||||||
progress.progress(TaskSteps.FINISHED);
|
|
||||||
}
|
|
||||||
|
|
||||||
} catch (IOException ex) {
|
|
||||||
logger.error("Failed to convert", ex);
|
|
||||||
throw ex;
|
|
||||||
} finally {
|
|
||||||
Files.deleteIfExists(intermediateUrlsFile);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void deleteOldFiles() throws IOException {
|
|
||||||
Files.deleteIfExists(outputFileWords);
|
|
||||||
Files.deleteIfExists(outputFileDocs);
|
|
||||||
}
|
|
||||||
|
|
||||||
private class IntermediateIndexConstructor implements IndexJournalReader.LongObjectConsumer<IndexJournalEntryData.Record>, AutoCloseable {
|
|
||||||
|
|
||||||
private final LongArray wordRangeEnds;
|
|
||||||
private final IntArray wordRangeOffset;
|
|
||||||
private final RandomWriteFunnel documentsFile;
|
|
||||||
|
|
||||||
private final Path tempFile;
|
|
||||||
|
|
||||||
public IntermediateIndexConstructor(Path tempDir, LongArray wordRangeEnds, RandomWriteFunnel documentsFile) throws IOException {
|
|
||||||
tempFile = Files.createTempFile(tempDir, "iic", "dat");
|
|
||||||
|
|
||||||
this.wordRangeEnds = wordRangeEnds;
|
|
||||||
this.wordRangeOffset = IntArray.mmapForWriting(tempFile, wordRangeEnds.size());
|
|
||||||
this.documentsFile = documentsFile;
|
|
||||||
}
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
@Override
|
|
||||||
public void accept(long docId, IndexJournalEntryData.Record record) {
|
|
||||||
int domainId = UrlIdCodec.getDomainId(docId);
|
|
||||||
float rankingPart = domainRankings.getSortRanking(domainId);
|
|
||||||
long rankEncodedId = UrlIdCodec.addRank(rankingPart, docId);
|
|
||||||
|
|
||||||
final int wordId = record.wordId();
|
|
||||||
long offset = startOfRange(wordId);
|
|
||||||
|
|
||||||
documentsFile.put(offset + wordRangeOffset.getAndIncrement(wordId), rankEncodedId);
|
|
||||||
documentsFile.put(offset + wordRangeOffset.getAndIncrement(wordId), record.metadata());
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
private long startOfRange(int wordId) {
|
|
||||||
if (wordId == 0) return 0;
|
|
||||||
|
|
||||||
return wordRangeEnds.get(wordId - 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void close() throws IOException {
|
|
||||||
Files.delete(tempFile);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -1,16 +0,0 @@
|
|||||||
package nu.marginalia.index.full;
|
|
||||||
|
|
||||||
import nu.marginalia.btree.model.BTreeBlockSize;
|
|
||||||
import nu.marginalia.btree.model.BTreeContext;
|
|
||||||
|
|
||||||
public class ReverseIndexFullParameters {
|
|
||||||
static final int ENTRY_SIZE = 2;
|
|
||||||
|
|
||||||
// This is the byte size per index page on disk, the data pages are twice as large due to ENTRY_SIZE = 2.
|
|
||||||
//
|
|
||||||
// Given a hardware limit of 4k reads, 2k block size should be optimal.
|
|
||||||
static final BTreeBlockSize blockSize = BTreeBlockSize.BS_2048;
|
|
||||||
|
|
||||||
|
|
||||||
static final BTreeContext bTreeContext = new BTreeContext(5, ENTRY_SIZE, blockSize);
|
|
||||||
}
|
|
@ -1,204 +0,0 @@
|
|||||||
package nu.marginalia.index.priority;
|
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
|
||||||
import nu.marginalia.array.IntArray;
|
|
||||||
import nu.marginalia.array.LongArray;
|
|
||||||
import nu.marginalia.array.algo.SortingContext;
|
|
||||||
import nu.marginalia.index.construction.CountToOffsetTransformer;
|
|
||||||
import nu.marginalia.index.construction.ReverseIndexBTreeTransformer;
|
|
||||||
import nu.marginalia.index.construction.IndexSizeEstimator;
|
|
||||||
import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
|
||||||
import nu.marginalia.index.journal.model.IndexJournalStatistics;
|
|
||||||
import nu.marginalia.index.journal.reader.IndexJournalReader;
|
|
||||||
import nu.marginalia.model.id.UrlIdCodec;
|
|
||||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
|
||||||
import nu.marginalia.ranking.DomainRankings;
|
|
||||||
import nu.marginalia.rwf.RandomWriteFunnel;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.channels.FileChannel;
|
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
import java.nio.file.StandardOpenOption;
|
|
||||||
|
|
||||||
import static nu.marginalia.index.priority.ReverseIndexPriorityParameters.bTreeContext;
|
|
||||||
|
|
||||||
public class ReverseIndexPriorityConverter {
|
|
||||||
private static final int RWF_BIN_SIZE = 10_000_000;
|
|
||||||
|
|
||||||
private final ProcessHeartbeat heartbeat;
|
|
||||||
private final Path tmpFileDir;
|
|
||||||
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
|
|
||||||
private final IndexJournalReader journalReader;
|
|
||||||
private final DomainRankings domainRankings;
|
|
||||||
private final Path outputFileWords;
|
|
||||||
private final Path outputFileDocs;
|
|
||||||
private final SortingContext sortingContext;
|
|
||||||
|
|
||||||
public ReverseIndexPriorityConverter(ProcessHeartbeat heartbeat,
|
|
||||||
Path tmpFileDir,
|
|
||||||
IndexJournalReader journalReader,
|
|
||||||
DomainRankings domainRankings,
|
|
||||||
Path outputFileWords,
|
|
||||||
Path outputFileDocs) {
|
|
||||||
this.heartbeat = heartbeat;
|
|
||||||
this.tmpFileDir = tmpFileDir;
|
|
||||||
this.journalReader = journalReader;
|
|
||||||
this.domainRankings = domainRankings;
|
|
||||||
this.outputFileWords = outputFileWords;
|
|
||||||
this.outputFileDocs = outputFileDocs;
|
|
||||||
this.sortingContext = new SortingContext(tmpFileDir, 64_000);
|
|
||||||
}
|
|
||||||
|
|
||||||
public enum TaskSteps {
|
|
||||||
ACCUMULATE_STATISTICS,
|
|
||||||
INCREMENT_OFFSETS,
|
|
||||||
COUNT_OFFSETS,
|
|
||||||
CREATE_INTERMEDIATE_DOCS,
|
|
||||||
SORT_INTERMEDIATE_DOCS,
|
|
||||||
SIZING,
|
|
||||||
FINALIZING_DOCS,
|
|
||||||
FORCE,
|
|
||||||
FINISHED,
|
|
||||||
}
|
|
||||||
|
|
||||||
public void convert() throws IOException {
|
|
||||||
deleteOldFiles();
|
|
||||||
|
|
||||||
if (journalReader.fileHeader().fileSize() <= IndexJournalReader.FILE_HEADER_SIZE_BYTES) {
|
|
||||||
logger.warn("Bailing: Journal is empty!");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
final Path intermediateUrlsFile = Files.createTempFile(tmpFileDir, "urls-sorted", ".dat");
|
|
||||||
|
|
||||||
try (var progress = heartbeat.createProcessTaskHeartbeat(TaskSteps.class, "reverseIndexPriorityConverter")) {
|
|
||||||
progress.progress(TaskSteps.ACCUMULATE_STATISTICS);
|
|
||||||
|
|
||||||
final IndexJournalStatistics statistics = journalReader.getStatistics();
|
|
||||||
final long wordsFileSize = statistics.highestWord() + 1;
|
|
||||||
|
|
||||||
progress.progress(TaskSteps.INCREMENT_OFFSETS);
|
|
||||||
|
|
||||||
logger.debug("Words file size: {}", wordsFileSize);
|
|
||||||
// Create a count of how many documents has contains each word
|
|
||||||
final LongArray wordsOffsets = LongArray.allocate(wordsFileSize);
|
|
||||||
|
|
||||||
journalReader.forEachWordId(wordsOffsets::increment);
|
|
||||||
progress.progress(TaskSteps.COUNT_OFFSETS);
|
|
||||||
|
|
||||||
wordsOffsets.transformEach(0, wordsFileSize, new CountToOffsetTransformer(ReverseIndexPriorityParameters.ENTRY_SIZE));
|
|
||||||
|
|
||||||
progress.progress(TaskSteps.CREATE_INTERMEDIATE_DOCS);
|
|
||||||
|
|
||||||
// Construct an intermediate representation of the reverse documents index
|
|
||||||
try (FileChannel intermediateDocChannel =
|
|
||||||
(FileChannel) Files.newByteChannel(intermediateUrlsFile,
|
|
||||||
StandardOpenOption.CREATE, StandardOpenOption.READ, StandardOpenOption.WRITE))
|
|
||||||
{
|
|
||||||
|
|
||||||
// Construct intermediate index
|
|
||||||
try (RandomWriteFunnel intermediateDocumentWriteFunnel = new RandomWriteFunnel(tmpFileDir, RWF_BIN_SIZE);
|
|
||||||
IntermediateIndexConstructor intermediateIndexConstructor = new IntermediateIndexConstructor(tmpFileDir, wordsOffsets, intermediateDocumentWriteFunnel)
|
|
||||||
)
|
|
||||||
{
|
|
||||||
journalReader.forEachDocIdRecord(intermediateIndexConstructor);
|
|
||||||
intermediateDocumentWriteFunnel.write(intermediateDocChannel);
|
|
||||||
}
|
|
||||||
intermediateDocChannel.force(false);
|
|
||||||
progress.progress(TaskSteps.SORT_INTERMEDIATE_DOCS);
|
|
||||||
|
|
||||||
// Sort each segment of the intermediate file
|
|
||||||
{
|
|
||||||
LongArray intermediateDocs = LongArray.mmapForModifying(intermediateUrlsFile);
|
|
||||||
wordsOffsets.foldIO(0, 0, wordsFileSize, (s, e) -> {
|
|
||||||
intermediateDocs.sortLargeSpan(sortingContext, s, e);
|
|
||||||
return e;
|
|
||||||
});
|
|
||||||
intermediateDocs.force();
|
|
||||||
}
|
|
||||||
|
|
||||||
progress.progress(TaskSteps.SIZING);
|
|
||||||
|
|
||||||
IndexSizeEstimator sizeEstimator = new IndexSizeEstimator(
|
|
||||||
bTreeContext,
|
|
||||||
ReverseIndexPriorityParameters.ENTRY_SIZE);
|
|
||||||
|
|
||||||
wordsOffsets.fold(0, 0, wordsOffsets.size(), sizeEstimator);
|
|
||||||
progress.progress(TaskSteps.FINALIZING_DOCS);
|
|
||||||
|
|
||||||
LongArray finalDocs = LongArray.mmapForWriting(outputFileDocs, sizeEstimator.size);
|
|
||||||
// Construct the proper reverse index
|
|
||||||
wordsOffsets.transformEachIO(0, wordsOffsets.size(), new ReverseIndexBTreeTransformer(finalDocs, ReverseIndexPriorityParameters.ENTRY_SIZE, bTreeContext, intermediateDocChannel));
|
|
||||||
wordsOffsets.write(outputFileWords);
|
|
||||||
|
|
||||||
progress.progress(TaskSteps.FORCE);
|
|
||||||
|
|
||||||
// Attempt to clean up before forcing (important disk space preservation)
|
|
||||||
Files.deleteIfExists(intermediateUrlsFile);
|
|
||||||
|
|
||||||
wordsOffsets.force();
|
|
||||||
finalDocs.force();
|
|
||||||
|
|
||||||
progress.progress(TaskSteps.FINISHED);
|
|
||||||
}
|
|
||||||
|
|
||||||
} catch (IOException ex) {
|
|
||||||
logger.error("Failed to convert", ex);
|
|
||||||
throw ex;
|
|
||||||
} finally {
|
|
||||||
Files.deleteIfExists(intermediateUrlsFile);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private void deleteOldFiles() throws IOException {
|
|
||||||
Files.deleteIfExists(outputFileWords);
|
|
||||||
Files.deleteIfExists(outputFileDocs);
|
|
||||||
}
|
|
||||||
|
|
||||||
private class IntermediateIndexConstructor implements IndexJournalReader.LongObjectConsumer<IndexJournalEntryData.Record>, AutoCloseable {
|
|
||||||
|
|
||||||
private final LongArray wordRangeEnds;
|
|
||||||
private final IntArray wordRangeOffset;
|
|
||||||
private final RandomWriteFunnel documentsFile;
|
|
||||||
|
|
||||||
private final Path tempFile;
|
|
||||||
|
|
||||||
public IntermediateIndexConstructor(Path tempDir, LongArray wordRangeEnds, RandomWriteFunnel documentsFile) throws IOException {
|
|
||||||
tempFile = Files.createTempFile(tempDir, "iic", "dat");
|
|
||||||
|
|
||||||
this.wordRangeEnds = wordRangeEnds;
|
|
||||||
this.wordRangeOffset = IntArray.mmapForWriting(tempFile, wordRangeEnds.size());
|
|
||||||
this.documentsFile = documentsFile;
|
|
||||||
}
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
@Override
|
|
||||||
public void accept(long docId, IndexJournalEntryData.Record record) {
|
|
||||||
int domainId = UrlIdCodec.getDomainId(docId);
|
|
||||||
float rankingPart = domainRankings.getSortRanking(domainId);
|
|
||||||
long rankEncodedId = UrlIdCodec.addRank(rankingPart, docId);
|
|
||||||
|
|
||||||
final int wordId = record.wordId();
|
|
||||||
long offset = startOfRange(wordId);
|
|
||||||
|
|
||||||
documentsFile.put(offset + wordRangeOffset.getAndIncrement(wordId), rankEncodedId);
|
|
||||||
}
|
|
||||||
|
|
||||||
private long startOfRange(int wordId) {
|
|
||||||
if (wordId == 0) return 0;
|
|
||||||
|
|
||||||
return wordRangeEnds.get(wordId - 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void close() throws IOException {
|
|
||||||
Files.delete(tempFile);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
@ -1,48 +0,0 @@
|
|||||||
package nu.marginalia.index.priority;
|
|
||||||
|
|
||||||
import nu.marginalia.array.buffer.LongQueryBuffer;
|
|
||||||
import nu.marginalia.btree.BTreeReader;
|
|
||||||
import nu.marginalia.index.query.EntrySource;
|
|
||||||
|
|
||||||
import static java.lang.Math.min;
|
|
||||||
|
|
||||||
public class ReverseIndexPriorityEntrySource implements EntrySource {
|
|
||||||
private final BTreeReader reader;
|
|
||||||
|
|
||||||
int pos;
|
|
||||||
int endOffset;
|
|
||||||
|
|
||||||
private final int wordId;
|
|
||||||
|
|
||||||
public ReverseIndexPriorityEntrySource(BTreeReader reader, int wordId) {
|
|
||||||
this.reader = reader;
|
|
||||||
this.wordId = wordId;
|
|
||||||
|
|
||||||
pos = 0;
|
|
||||||
endOffset = pos + reader.numEntries();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void skip(int n) {
|
|
||||||
pos += n;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void read(LongQueryBuffer buffer) {
|
|
||||||
buffer.end = min(buffer.end, endOffset - pos);
|
|
||||||
reader.readData(buffer.data, buffer.end, pos);
|
|
||||||
pos += buffer.end;
|
|
||||||
|
|
||||||
buffer.uniq();
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean hasMore() {
|
|
||||||
return pos < endOffset;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public String indexName() {
|
|
||||||
return "Priority:" + wordId;
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,31 +0,0 @@
|
|||||||
package nu.marginalia.index.priority;
|
|
||||||
|
|
||||||
import nu.marginalia.btree.model.BTreeBlockSize;
|
|
||||||
import nu.marginalia.btree.model.BTreeContext;
|
|
||||||
import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
|
||||||
import nu.marginalia.model.idx.WordFlags;
|
|
||||||
|
|
||||||
public class ReverseIndexPriorityParameters {
|
|
||||||
static final int ENTRY_SIZE = 1;
|
|
||||||
static final BTreeBlockSize blockSize = BTreeBlockSize.BS_4096;
|
|
||||||
|
|
||||||
static final BTreeContext bTreeContext = new BTreeContext(5, ENTRY_SIZE, blockSize);
|
|
||||||
|
|
||||||
private static final long highPriorityFlags =
|
|
||||||
WordFlags.Title.asBit()
|
|
||||||
| WordFlags.Subjects.asBit()
|
|
||||||
| WordFlags.TfIdfHigh.asBit()
|
|
||||||
| WordFlags.NamesWords.asBit()
|
|
||||||
| WordFlags.UrlDomain.asBit()
|
|
||||||
| WordFlags.UrlPath.asBit()
|
|
||||||
| WordFlags.Site.asBit()
|
|
||||||
| WordFlags.SiteAdjacent.asBit();
|
|
||||||
|
|
||||||
public static boolean filterPriorityRecord(IndexJournalEntryData.Record record) {
|
|
||||||
long meta = record.metadata();
|
|
||||||
|
|
||||||
return (meta & highPriorityFlags) != 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
}
|
|
@ -1,77 +0,0 @@
|
|||||||
package nu.marginalia.index.priority;
|
|
||||||
|
|
||||||
import nu.marginalia.index.query.EntrySource;
|
|
||||||
import nu.marginalia.array.LongArray;
|
|
||||||
import nu.marginalia.btree.BTreeReader;
|
|
||||||
import nu.marginalia.index.query.EmptyEntrySource;
|
|
||||||
import nu.marginalia.index.query.ReverseIndexRetainFilter;
|
|
||||||
import nu.marginalia.index.query.filter.QueryFilterNoPass;
|
|
||||||
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
|
|
||||||
public class ReverseIndexPriorityReader {
|
|
||||||
private final LongArray words;
|
|
||||||
private final LongArray documents;
|
|
||||||
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
|
|
||||||
public ReverseIndexPriorityReader(Path words, Path documents) throws IOException {
|
|
||||||
if (!Files.exists(words) || !Files.exists(documents)) {
|
|
||||||
this.words = null;
|
|
||||||
this.documents = null;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.info("Switching prio reverse index");
|
|
||||||
|
|
||||||
this.words = LongArray.mmapRead(words);
|
|
||||||
this.documents = LongArray.mmapRead(documents);
|
|
||||||
}
|
|
||||||
|
|
||||||
public EntrySource priorityDocuments(int wordId) {
|
|
||||||
if (words == null) {
|
|
||||||
// index not loaded
|
|
||||||
return new EmptyEntrySource();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (wordId < 0 || wordId >= words.size()) return new EmptyEntrySource();
|
|
||||||
|
|
||||||
long offset = words.get(wordId);
|
|
||||||
|
|
||||||
if (offset < 0) return new EmptyEntrySource();
|
|
||||||
|
|
||||||
return new ReverseIndexPriorityEntrySource(createReaderNew(offset), wordId);
|
|
||||||
}
|
|
||||||
|
|
||||||
private BTreeReader createReaderNew(long offset) {
|
|
||||||
return new BTreeReader(documents, ReverseIndexPriorityParameters.bTreeContext, offset);
|
|
||||||
}
|
|
||||||
|
|
||||||
public QueryFilterStepIf also(int wordId) {
|
|
||||||
if (wordId < 0) return new QueryFilterNoPass();
|
|
||||||
|
|
||||||
long offset = words.get(wordId);
|
|
||||||
|
|
||||||
if (offset < 0) return new QueryFilterNoPass();
|
|
||||||
|
|
||||||
return new ReverseIndexRetainFilter(createReaderNew(offset), "priority", wordId);
|
|
||||||
}
|
|
||||||
|
|
||||||
public int numDocuments(int wordId) {
|
|
||||||
if (wordId < 0)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
long offset = words.get(wordId);
|
|
||||||
|
|
||||||
if (offset < 0)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
return createReaderNew(offset).numEntries();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -4,7 +4,7 @@ import nu.marginalia.array.buffer.LongQueryBuffer;
|
|||||||
import nu.marginalia.btree.BTreeReader;
|
import nu.marginalia.btree.BTreeReader;
|
||||||
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
||||||
|
|
||||||
public record ReverseIndexRetainFilter(BTreeReader range, String name, int wordId) implements QueryFilterStepIf {
|
public record ReverseIndexRetainFilter(BTreeReader range, String name, long wordId) implements QueryFilterStepIf {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void apply(LongQueryBuffer buffer) {
|
public void apply(LongQueryBuffer buffer) {
|
||||||
|
@ -0,0 +1,108 @@
|
|||||||
|
package nu.marginalia.index;
|
||||||
|
|
||||||
|
import nu.marginalia.array.algo.SortingContext;
|
||||||
|
import nu.marginalia.array.buffer.LongQueryBuffer;
|
||||||
|
import nu.marginalia.index.construction.ReversePreindex;
|
||||||
|
import nu.marginalia.index.construction.TestJournalFactory;
|
||||||
|
import nu.marginalia.index.construction.TestJournalFactory.EntryDataWithWordMeta;
|
||||||
|
import nu.marginalia.index.journal.reader.IndexJournalReader;
|
||||||
|
import org.junit.jupiter.api.AfterEach;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import static nu.marginalia.index.construction.TestJournalFactory.wm;
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
|
class ReverseIndexReaderTest {
|
||||||
|
TestJournalFactory journalFactory;
|
||||||
|
Path tempDir;
|
||||||
|
SortingContext sortingContext;
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
public void setUp() throws IOException {
|
||||||
|
journalFactory = new TestJournalFactory();
|
||||||
|
|
||||||
|
tempDir = Files.createTempDirectory("sort");
|
||||||
|
sortingContext = new SortingContext(Path.of("invalid"), 1<<20);
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterEach
|
||||||
|
public void tearDown() throws IOException {
|
||||||
|
journalFactory.clear();
|
||||||
|
|
||||||
|
List<Path> contents = new ArrayList<>();
|
||||||
|
Files.list(tempDir).forEach(contents::add);
|
||||||
|
for (var tempFile : contents) {
|
||||||
|
Files.delete(tempFile);
|
||||||
|
}
|
||||||
|
Files.delete(tempDir);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testSimple() throws IOException {
|
||||||
|
|
||||||
|
var indexReader = createIndex(
|
||||||
|
new EntryDataWithWordMeta(100, 101, wm(50, 51))
|
||||||
|
);
|
||||||
|
|
||||||
|
assertEquals(1, indexReader.numDocuments(50));
|
||||||
|
|
||||||
|
long[] meta = indexReader.getTermMeta(50, new long[] { 100 });
|
||||||
|
assertArrayEquals(new long[] { 51 }, meta);
|
||||||
|
assertArrayEquals(new long[] { 100 }, readEntries(indexReader, 50));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void test2x2() throws IOException {
|
||||||
|
|
||||||
|
var indexReader = createIndex(
|
||||||
|
new EntryDataWithWordMeta(100, 101, wm(50, 51), wm(51, 52)),
|
||||||
|
new EntryDataWithWordMeta(101, 101, wm(51, 53), wm(52, 54))
|
||||||
|
|
||||||
|
);
|
||||||
|
|
||||||
|
assertEquals(1, indexReader.numDocuments(50));
|
||||||
|
assertEquals(2, indexReader.numDocuments(51));
|
||||||
|
assertEquals(1, indexReader.numDocuments(52));
|
||||||
|
|
||||||
|
assertArrayEquals(new long[] { 51 }, indexReader.getTermMeta(50, new long[] { 100 }));
|
||||||
|
assertArrayEquals(new long[] { 100 }, readEntries(indexReader, 50));
|
||||||
|
|
||||||
|
assertArrayEquals(new long[] { 52, 53 }, indexReader.getTermMeta(51, new long[] { 100, 101 }));
|
||||||
|
assertArrayEquals(new long[] { 100, 101 }, readEntries(indexReader, 51));
|
||||||
|
|
||||||
|
assertArrayEquals(new long[] { 54 }, indexReader.getTermMeta(52, new long[] { 101 }));
|
||||||
|
assertArrayEquals(new long[] { 101 }, readEntries(indexReader, 52));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private long[] readEntries(ReverseIndexReader reader, long wordId) {
|
||||||
|
var es = reader.documents(wordId);
|
||||||
|
assertTrue(es.hasMore());
|
||||||
|
LongQueryBuffer buffer = new LongQueryBuffer(4);
|
||||||
|
es.read(buffer);
|
||||||
|
assertFalse(es.hasMore());
|
||||||
|
return buffer.copyData();
|
||||||
|
}
|
||||||
|
|
||||||
|
private ReverseIndexReader createIndex(EntryDataWithWordMeta... scenario) throws IOException {
|
||||||
|
var reader = journalFactory.createReader(scenario);
|
||||||
|
var preindex = ReversePreindex.constructPreindex(reader, tempDir, tempDir);
|
||||||
|
|
||||||
|
|
||||||
|
Path docsFile = tempDir.resolve("docs.dat");
|
||||||
|
Path wordsFile = tempDir.resolve("words.dat");
|
||||||
|
|
||||||
|
preindex.finalizeIndex(docsFile, wordsFile);
|
||||||
|
preindex.delete();
|
||||||
|
|
||||||
|
return new ReverseIndexReader(wordsFile, docsFile);
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,173 @@
|
|||||||
|
package nu.marginalia.index.construction;
|
||||||
|
|
||||||
|
import nu.marginalia.array.algo.SortingContext;
|
||||||
|
import org.junit.jupiter.api.AfterEach;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import static nu.marginalia.index.construction.TestJournalFactory.EntryData;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
|
||||||
|
class ReversePreindexDocsTest {
|
||||||
|
Path countsFile;
|
||||||
|
Path wordsIdFile;
|
||||||
|
Path docsFile;
|
||||||
|
Path tempDir;
|
||||||
|
SortingContext sortingContext;
|
||||||
|
|
||||||
|
TestJournalFactory journalFactory;
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
public void setUp() throws IOException {
|
||||||
|
journalFactory = new TestJournalFactory();
|
||||||
|
|
||||||
|
countsFile = Files.createTempFile("counts", ".dat");
|
||||||
|
wordsIdFile = Files.createTempFile("words", ".dat");
|
||||||
|
docsFile = Files.createTempFile("docs", ".dat");
|
||||||
|
tempDir = Files.createTempDirectory("sort");
|
||||||
|
sortingContext = new SortingContext(Path.of("invalid"), 1<<20);
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterEach
|
||||||
|
public void tearDown() throws IOException {
|
||||||
|
journalFactory.clear();
|
||||||
|
|
||||||
|
Files.deleteIfExists(countsFile);
|
||||||
|
Files.deleteIfExists(wordsIdFile);
|
||||||
|
List<Path> contents = new ArrayList<>();
|
||||||
|
Files.list(tempDir).forEach(contents::add);
|
||||||
|
for (var tempFile : contents) {
|
||||||
|
Files.delete(tempFile);
|
||||||
|
}
|
||||||
|
Files.delete(tempDir);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDocs() throws IOException {
|
||||||
|
var reader = journalFactory.createReader(
|
||||||
|
new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33)
|
||||||
|
);
|
||||||
|
|
||||||
|
var segments = ReversePreindexWordSegments.construct(reader, sortingContext, wordsIdFile, countsFile);
|
||||||
|
var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), sortingContext, segments);
|
||||||
|
|
||||||
|
List<TestSegmentData> expected = List.of(
|
||||||
|
new TestSegmentData(-100, 0, 2, new long[] { -0xF00BA3L, 0 }),
|
||||||
|
new TestSegmentData(10, 2, 4, new long[] { -0xF00BA3L, 0 }),
|
||||||
|
new TestSegmentData(33, 4, 6, new long[] { -0xF00BA3L, 0 }),
|
||||||
|
new TestSegmentData(40, 6, 8, new long[] { -0xF00BA3L, 0 })
|
||||||
|
);
|
||||||
|
|
||||||
|
List<TestSegmentData> actual = new ArrayList<>();
|
||||||
|
|
||||||
|
var iter = segments.iterator(2);
|
||||||
|
while (iter.next()) {
|
||||||
|
long[] data = new long[(int) (iter.endOffset - iter.startOffset)];
|
||||||
|
docs.slice(iter.startOffset, iter.endOffset).get(0, data);
|
||||||
|
actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset,
|
||||||
|
data));
|
||||||
|
}
|
||||||
|
|
||||||
|
assertEquals(expected, actual);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDocsRepeatedWord() throws IOException {
|
||||||
|
var reader = journalFactory.createReader(
|
||||||
|
new EntryData(-0xF00BA3L, 0, 4, 4)
|
||||||
|
);
|
||||||
|
|
||||||
|
var segments = ReversePreindexWordSegments.construct(reader, sortingContext, wordsIdFile, countsFile);
|
||||||
|
var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), sortingContext, segments);
|
||||||
|
|
||||||
|
List<TestSegmentData> expected = List.of(
|
||||||
|
new TestSegmentData(4, 0, 4, new long[] { -0xF00BA3L, 0, -0xF00BA3L, 0 })
|
||||||
|
);
|
||||||
|
|
||||||
|
List<TestSegmentData> actual = new ArrayList<>();
|
||||||
|
|
||||||
|
var iter = segments.iterator(2);
|
||||||
|
while (iter.next()) {
|
||||||
|
long[] data = new long[(int) (iter.endOffset - iter.startOffset)];
|
||||||
|
docs.slice(iter.startOffset, iter.endOffset).get(0, data);
|
||||||
|
actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset,
|
||||||
|
data));
|
||||||
|
}
|
||||||
|
|
||||||
|
assertEquals(expected, actual);
|
||||||
|
}
|
||||||
|
@Test
|
||||||
|
public void testDocs2() throws IOException {
|
||||||
|
var reader = journalFactory.createReader(
|
||||||
|
new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33),
|
||||||
|
new EntryData(0xF00BA4L, 0, 15, 30, -100, 33)
|
||||||
|
);
|
||||||
|
|
||||||
|
var segments = ReversePreindexWordSegments.construct(reader, sortingContext, wordsIdFile, countsFile);
|
||||||
|
var docs = ReversePreindexDocuments.construct(docsFile, reader, DocIdRewriter.identity(), sortingContext, segments);
|
||||||
|
|
||||||
|
List<TestSegmentData> expected = List.of(
|
||||||
|
new TestSegmentData(-100, 0, 4, new long[] { -0xF00BA3L, 0, 0xF00BA4L, 0 }),
|
||||||
|
new TestSegmentData(10, 4, 6, new long[] { -0xF00BA3L, 0}),
|
||||||
|
new TestSegmentData(15, 6, 8, new long[] { 0xF00BA4L, 0}),
|
||||||
|
new TestSegmentData(30, 8, 10, new long[] { 0xF00BA4L, 0}),
|
||||||
|
new TestSegmentData(33, 10, 14, new long[] { -0xF00BA3L, 0, 0xF00BA4L, 0}),
|
||||||
|
new TestSegmentData(40, 14, 16, new long[] { -0xF00BA3L, 0})
|
||||||
|
);
|
||||||
|
|
||||||
|
List<TestSegmentData> actual = new ArrayList<>();
|
||||||
|
|
||||||
|
var iter = segments.iterator(2);
|
||||||
|
while (iter.next()) {
|
||||||
|
long[] data = new long[(int) (iter.endOffset - iter.startOffset)];
|
||||||
|
docs.slice(iter.startOffset, iter.endOffset).get(0, data);
|
||||||
|
actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset,
|
||||||
|
data));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
record TestSegmentData(long wordId, long start, long end, long[] data) {
|
||||||
|
public TestSegmentData(long wordId, long start, long end) {
|
||||||
|
this(wordId, start, end, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object o) {
|
||||||
|
if (this == o) return true;
|
||||||
|
if (o == null || getClass() != o.getClass()) return false;
|
||||||
|
|
||||||
|
TestSegmentData that = (TestSegmentData) o;
|
||||||
|
|
||||||
|
if (wordId != that.wordId) return false;
|
||||||
|
if (start != that.start) return false;
|
||||||
|
if (end != that.end) return false;
|
||||||
|
return Arrays.equals(data, that.data);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
int result = (int) (wordId ^ (wordId >>> 32));
|
||||||
|
result = 31 * result + (int) (start ^ (start >>> 32));
|
||||||
|
result = 31 * result + (int) (end ^ (end >>> 32));
|
||||||
|
result = 31 * result + Arrays.hashCode(data);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return "TestSegmentData{" +
|
||||||
|
"wordId=" + wordId +
|
||||||
|
", start=" + start +
|
||||||
|
", end=" + end +
|
||||||
|
", data=" + Arrays.toString(data) +
|
||||||
|
'}';
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
@ -0,0 +1,143 @@
package nu.marginalia.index.construction;

import nu.marginalia.array.LongArray;
import nu.marginalia.array.algo.SortingContext;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.btree.model.BTreeHeader;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.*;

import static nu.marginalia.index.construction.TestJournalFactory.*;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

class ReversePreindexFinalizeTest {
    TestJournalFactory journalFactory;
    Path countsFile;
    Path wordsIdFile;
    Path docsFile;
    Path tempDir;
    SortingContext sortingContext;

    @BeforeEach
    public void setUp() throws IOException {
        journalFactory = new TestJournalFactory();

        countsFile = Files.createTempFile("counts", ".dat");
        wordsIdFile = Files.createTempFile("words", ".dat");
        docsFile = Files.createTempFile("docs", ".dat");
        tempDir = Files.createTempDirectory("sort");
        sortingContext = new SortingContext(Path.of("invalid"), 1<<20);
    }

    @AfterEach
    public void tearDown() throws IOException {
        journalFactory.clear();

        Files.deleteIfExists(countsFile);
        Files.deleteIfExists(wordsIdFile);
        List<Path> contents = new ArrayList<>();
        Files.list(tempDir).forEach(contents::add);
        for (var tempFile : contents) {
            Files.delete(tempFile);
        }
        Files.delete(tempDir);
    }

    @Test
    public void testFinalizeSimple() throws IOException {
        var reader = journalFactory.createReader(new EntryDataWithWordMeta(100, 101, wm(50, 51)));
        var preindex = ReversePreindex.constructPreindex(reader, tempDir, tempDir);

        preindex.finalizeIndex(tempDir.resolve("docs.dat"), tempDir.resolve("words.dat"));
        preindex.delete();

        Path wordsFile = tempDir.resolve("words.dat");
        Path docsFile = tempDir.resolve("docs.dat");

        assertTrue(Files.exists(wordsFile));
        assertTrue(Files.exists(docsFile));

        System.out.println(Files.size(wordsFile));
        System.out.println(Files.size(docsFile));

        var docsArray = LongArray.mmapRead(docsFile);
        var wordsArray = LongArray.mmapRead(wordsFile);

        var docsHeader = BTreeReader.readHeader(docsArray, 0);
        var wordsHeader = BTreeReader.readHeader(wordsArray, 0);

        assertEquals(1, docsHeader.numEntries());
        assertEquals(1, wordsHeader.numEntries());

        assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0));
        assertEquals(51, docsArray.get(docsHeader.dataOffsetLongs() + 1));
        assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs()));
        assertEquals(0, wordsArray.get(wordsHeader.dataOffsetLongs() + 1));
    }


    @Test
    public void testFinalizeSimple2x2() throws IOException {
        var reader = journalFactory.createReader(
                new EntryDataWithWordMeta(100, 101, wm(50, 51)),
                new EntryDataWithWordMeta(101, 101, wm(51, 52))
        );

        var preindex = ReversePreindex.constructPreindex(reader, tempDir, tempDir);

        preindex.finalizeIndex(tempDir.resolve("docs.dat"), tempDir.resolve("words.dat"));
        preindex.delete();

        Path wordsFile = tempDir.resolve("words.dat");
        Path docsFile = tempDir.resolve("docs.dat");

        assertTrue(Files.exists(wordsFile));
        assertTrue(Files.exists(docsFile));

        System.out.println(Files.size(wordsFile));
        System.out.println(Files.size(docsFile));

        var docsArray = LongArray.mmapRead(docsFile);
        var wordsArray = LongArray.mmapRead(wordsFile);


        var wordsHeader = BTreeReader.readHeader(wordsArray, 0);

        System.out.println(wordsHeader);

        assertEquals(2, wordsHeader.numEntries());

        long offset1 = wordsArray.get(wordsHeader.dataOffsetLongs() + 1);
        long offset2 = wordsArray.get(wordsHeader.dataOffsetLongs() + 3);

        assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs()));
        assertEquals(0, wordsArray.get(wordsHeader.dataOffsetLongs() + 1));
        assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs()));
        assertEquals(0, wordsArray.get(wordsHeader.dataOffsetLongs() + 1));

        BTreeHeader docsHeader;

        docsHeader = BTreeReader.readHeader(docsArray, offset1);
        System.out.println(docsHeader);
        assertEquals(1, docsHeader.numEntries());

        assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0));
        assertEquals(51, docsArray.get(docsHeader.dataOffsetLongs() + 1));

        docsHeader = BTreeReader.readHeader(docsArray, offset2);
        System.out.println(docsHeader);
        assertEquals(1, docsHeader.numEntries());

        assertEquals(101, docsArray.get(docsHeader.dataOffsetLongs() + 0));
        assertEquals(52, docsArray.get(docsHeader.dataOffsetLongs() + 1));
    }
}
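The finalize tests above verify a two-level structure: a words area mapping each word id to an offset, and per-word document data reachable through that offset. In the tests this is read back through B-trees (BTreeReader) over memory-mapped LongArrays; the sketch below is only a flat-array stand-in with a hypothetical class name, meant to make the lookup path of testFinalizeSimple concrete.

```java
// Flat-array stand-in for the two-level layout verified above (the real index
// uses B-trees over memory-mapped arrays; this sketch does not).
public class WordLookupSketch {
    // words: interleaved (wordId, docsOffset) pairs, sorted by wordId
    static long findDocsOffset(long[] words, long wordId) {
        int lo = 0, hi = words.length / 2 - 1;
        while (lo <= hi) {
            int mid = (lo + hi) >>> 1;
            long midWord = words[2 * mid];
            if (midWord < wordId) lo = mid + 1;
            else if (midWord > wordId) hi = mid - 1;
            else return words[2 * mid + 1];
        }
        return -1; // word not present
    }

    public static void main(String[] args) {
        // Mirrors testFinalizeSimple: word 50 points at offset 0 of the docs
        // data, where (docId=100, wordMeta=51) is stored.
        long[] words = { 50, 0 };
        long[] docs  = { 100, 51 };

        long off = findDocsOffset(words, 50);
        System.out.println("docId=" + docs[(int) off] + ", meta=" + docs[(int) off + 1]);
    }
}
```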
@ -0,0 +1,427 @@
|
|||||||
|
|
||||||
|
package nu.marginalia.index.construction;
|
||||||
|
|
||||||
|
import nu.marginalia.array.algo.SortingContext;
|
||||||
|
import org.junit.jupiter.api.AfterEach;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.*;
|
||||||
|
|
||||||
|
import static nu.marginalia.index.construction.TestJournalFactory.*;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
|
||||||
|
class ReversePreindexMergeTest {
|
||||||
|
TestJournalFactory journalFactory;
|
||||||
|
Path countsFile;
|
||||||
|
Path wordsIdFile;
|
||||||
|
Path docsFile;
|
||||||
|
Path tempDir;
|
||||||
|
SortingContext sortingContext;
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
public void setUp() throws IOException {
|
||||||
|
journalFactory = new TestJournalFactory();
|
||||||
|
|
||||||
|
countsFile = Files.createTempFile("counts", ".dat");
|
||||||
|
wordsIdFile = Files.createTempFile("words", ".dat");
|
||||||
|
docsFile = Files.createTempFile("docs", ".dat");
|
||||||
|
tempDir = Files.createTempDirectory("sort");
|
||||||
|
sortingContext = new SortingContext(Path.of("invalid"), 1<<20);
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterEach
|
||||||
|
public void tearDown() throws IOException {
|
||||||
|
journalFactory.clear();
|
||||||
|
|
||||||
|
Files.deleteIfExists(countsFile);
|
||||||
|
Files.deleteIfExists(wordsIdFile);
|
||||||
|
List<Path> contents = new ArrayList<>();
|
||||||
|
Files.list(tempDir).forEach(contents::add);
|
||||||
|
for (var tempFile : contents) {
|
||||||
|
Files.delete(tempFile);
|
||||||
|
}
|
||||||
|
Files.delete(tempDir);
|
||||||
|
}
|
||||||
|
|
||||||
|
public ReversePreindex runMergeScenario(
|
||||||
|
List<EntryDataWithWordMeta> leftData,
|
||||||
|
List<EntryDataWithWordMeta> rightData
|
||||||
|
) throws IOException {
|
||||||
|
var reader1 = journalFactory.createReader(leftData.toArray(EntryDataWithWordMeta[]::new));
|
||||||
|
var reader2 = journalFactory.createReader(rightData.toArray(EntryDataWithWordMeta[]::new));
|
||||||
|
|
||||||
|
var left = ReversePreindex.constructPreindex(reader1, tempDir, tempDir);
|
||||||
|
var right = ReversePreindex.constructPreindex(reader2, tempDir, tempDir);
|
||||||
|
return ReversePreindex.merge(tempDir, left, right);
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<TestSegmentData> getData(ReversePreindex merged) {
|
||||||
|
var iter = merged.segments.iterator(2);
|
||||||
|
List<TestSegmentData> actual = new ArrayList<>();
|
||||||
|
while (iter.next()) {
|
||||||
|
long[] data = new long[(int) (iter.endOffset - iter.startOffset)];
|
||||||
|
merged.documents.slice(iter.startOffset, iter.endOffset).get(0, data);
|
||||||
|
actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset,
|
||||||
|
data));
|
||||||
|
}
|
||||||
|
return actual;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDocsMergeSingleNoOverlap() throws IOException {
|
||||||
|
|
||||||
|
IdSequence docIds = new IdSequence();
|
||||||
|
IdSequence docMetas = new IdSequence();
|
||||||
|
IdSequence wordMetas = new IdSequence();
|
||||||
|
IdSequence wordIds = new IdSequence();
|
||||||
|
|
||||||
|
var leftSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), wm(wordIds.nextUnique(), wordMetas.nextUnique())));
|
||||||
|
var rightSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), wm(wordIds.nextUnique(), wordMetas.nextUnique())));
|
||||||
|
|
||||||
|
var merged = runMergeScenario(
|
||||||
|
leftSequence,
|
||||||
|
rightSequence
|
||||||
|
);
|
||||||
|
|
||||||
|
var actual = getData(merged);
|
||||||
|
|
||||||
|
var expected = simulateMerge(leftSequence, rightSequence);
|
||||||
|
|
||||||
|
System.out.println(actual);
|
||||||
|
assertEquals(expected, actual);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDocsMergeSingleOnlyOverlap() throws IOException {
|
||||||
|
|
||||||
|
IdSequence docIds = new IdSequence();
|
||||||
|
IdSequence docMetas = new IdSequence();
|
||||||
|
IdSequence wordMetas = new IdSequence();
|
||||||
|
IdSequence wordIds = new IdSequence();
|
||||||
|
|
||||||
|
var leftSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), wm(wordIds.nextUnique(), wordMetas.nextUnique())));
|
||||||
|
var rightSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(), wm(wordIds.alreadySeenSameSequence(), wordMetas.nextUnique())));
|
||||||
|
|
||||||
|
var merged = runMergeScenario(
|
||||||
|
leftSequence,
|
||||||
|
rightSequence
|
||||||
|
);
|
||||||
|
|
||||||
|
var actual = getData(merged);
|
||||||
|
|
||||||
|
var expected = simulateMerge(leftSequence, rightSequence);
|
||||||
|
|
||||||
|
System.out.println(actual);
|
||||||
|
assertEquals(expected, actual);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDocsMergeSingleOnlyOverlap2() throws IOException {
|
||||||
|
|
||||||
|
long wid1 = 1;
|
||||||
|
long wid2 = 2;
|
||||||
|
IdSequence docIds = new IdSequence();
|
||||||
|
IdSequence docMetas = new IdSequence();
|
||||||
|
IdSequence wordMetas = new IdSequence();
|
||||||
|
|
||||||
|
var leftSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(),
|
||||||
|
wm(wid1, wordMetas.nextUnique()),
|
||||||
|
wm(wid2, wordMetas.nextUnique())
|
||||||
|
));
|
||||||
|
var rightSequence = List.of(new EntryDataWithWordMeta(docIds.nextUnique(), docMetas.nextUnique(),
|
||||||
|
wm(wid1, wordMetas.nextUnique()),
|
||||||
|
wm(wid2, wordMetas.nextUnique())
|
||||||
|
));
|
||||||
|
|
||||||
|
var merged = runMergeScenario(
|
||||||
|
leftSequence,
|
||||||
|
rightSequence
|
||||||
|
);
|
||||||
|
|
||||||
|
var actual = getData(merged);
|
||||||
|
|
||||||
|
var expected = simulateMerge(leftSequence, rightSequence);
|
||||||
|
|
||||||
|
System.out.println(actual);
|
||||||
|
assertEquals(expected, actual);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testBadCase1() throws IOException {
|
||||||
|
long wordId = 0xF00F00BA3L;
|
||||||
|
|
||||||
|
List<EntryDataWithWordMeta> leftSequence = List.of(new EntryDataWithWordMeta(40, 50,
|
||||||
|
wm(wordId, 5))
|
||||||
|
);
|
||||||
|
List<EntryDataWithWordMeta> rightSequence = List.of(new EntryDataWithWordMeta(41, 51,
|
||||||
|
wm(wordId, 3),
|
||||||
|
wm(wordId, 4))
|
||||||
|
);
|
||||||
|
|
||||||
|
var mergedLR = runMergeScenario(
|
||||||
|
leftSequence,
|
||||||
|
rightSequence
|
||||||
|
);
|
||||||
|
var mergedRL = runMergeScenario(
|
||||||
|
rightSequence,
|
||||||
|
leftSequence
|
||||||
|
);
|
||||||
|
|
||||||
|
var actualLR = getData(mergedLR);
|
||||||
|
var actualRL = getData(mergedRL);
|
||||||
|
|
||||||
|
var expected = simulateMerge(leftSequence, rightSequence);
|
||||||
|
|
||||||
|
assertEquals(actualLR, actualRL);
|
||||||
|
|
||||||
|
if (!expected.equals(actualLR)) {
|
||||||
|
System.out.println("*fail*");
|
||||||
|
System.out.println(leftSequence);
|
||||||
|
System.out.println(rightSequence);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
System.out.println("*pass*");
|
||||||
|
}
|
||||||
|
|
||||||
|
assertEquals(expected, actualLR);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testBadCase2() throws IOException {
|
||||||
|
long wordId = 100;
|
||||||
|
|
||||||
|
List<EntryDataWithWordMeta> leftSequence = List.of(
|
||||||
|
new EntryDataWithWordMeta(1, 50, wm(wordId, 5)),
|
||||||
|
new EntryDataWithWordMeta(2, 50, wm(wordId, 5))
|
||||||
|
|
||||||
|
);
|
||||||
|
List<EntryDataWithWordMeta> rightSequence = List.of(
|
||||||
|
new EntryDataWithWordMeta(3, 50, wm(wordId, 5))
|
||||||
|
);
|
||||||
|
|
||||||
|
var mergedLR = runMergeScenario(
|
||||||
|
leftSequence,
|
||||||
|
rightSequence
|
||||||
|
);
|
||||||
|
var mergedRL = runMergeScenario(
|
||||||
|
rightSequence,
|
||||||
|
leftSequence
|
||||||
|
);
|
||||||
|
|
||||||
|
var actualLR = getData(mergedLR);
|
||||||
|
var actualRL = getData(mergedRL);
|
||||||
|
|
||||||
|
var expected = simulateMerge(leftSequence, rightSequence);
|
||||||
|
|
||||||
|
assertEquals(actualLR, actualRL);
|
||||||
|
|
||||||
|
if (!expected.equals(actualLR)) {
|
||||||
|
System.out.println("*fail*");
|
||||||
|
System.out.println(leftSequence);
|
||||||
|
System.out.println(rightSequence);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
System.out.println("*pass*");
|
||||||
|
}
|
||||||
|
|
||||||
|
assertEquals(expected, actualLR);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testFuzz() throws IOException {
|
||||||
|
Random r = new Random();
|
||||||
|
int maxDocs = 150;
|
||||||
|
int maxWords = 160;
|
||||||
|
int nIters = 1000;
|
||||||
|
|
||||||
|
for (int i = 0; i < nIters; i++) {
|
||||||
|
int nLeft = 1 + r.nextInt(maxDocs);
|
||||||
|
int nRight = 1 + r.nextInt(maxDocs);
|
||||||
|
|
||||||
|
IdSequence docIdsLeft = new IdSequence();
|
||||||
|
IdSequence docIdsRight = new IdSequence();
|
||||||
|
IdSequence docMetas = new IdSequence();
|
||||||
|
IdSequence wordMetas = new IdSequence();
|
||||||
|
IdSequence wordIds = new IdSequence();
|
||||||
|
|
||||||
|
List<EntryDataWithWordMeta> leftSequence = new ArrayList<>(nLeft);
|
||||||
|
for (int j = 0; j < nLeft; j++) {
|
||||||
|
WordWithMeta[] words = new WordWithMeta[maxWords == 1 ? 1 : r.nextInt(1, maxWords)];
|
||||||
|
Arrays.setAll(words, idx -> {
|
||||||
|
long wordId = wordIds.seenWithP(1.0);
|
||||||
|
long wordMeta = wordMetas.nextUniqueAssociatedWithKey(wordId);
|
||||||
|
return wm(wordId, wordMeta);
|
||||||
|
});
|
||||||
|
|
||||||
|
long docId = docIdsLeft.nextUnique();
|
||||||
|
long docMeta = docMetas.nextUniqueAssociatedWithKey(docId);
|
||||||
|
leftSequence.add(new EntryDataWithWordMeta(docId, docMeta, words));
|
||||||
|
}
|
||||||
|
|
||||||
|
List<EntryDataWithWordMeta> rightSequence = new ArrayList<>(nLeft);
|
||||||
|
for (int j = 0; j < nRight; j++) {
|
||||||
|
WordWithMeta[] words = new WordWithMeta[maxWords == 1 ? 1 : r.nextInt(1, maxWords)];
|
||||||
|
Arrays.setAll(words, idx -> {
|
||||||
|
long wordId = wordIds.seenWithP(1.0);
|
||||||
|
long wordMeta = wordMetas.nextUniqueAssociatedWithKey(wordId);
|
||||||
|
return wm(wordId, wordMeta);
|
||||||
|
});
|
||||||
|
|
||||||
|
long docId = docIdsRight.seenWithP(docIdsLeft, 0.1);
|
||||||
|
long docMeta = docMetas.nextUniqueAssociatedWithKey(docId);
|
||||||
|
rightSequence.add(new EntryDataWithWordMeta(docId, docMeta, words));
|
||||||
|
}
|
||||||
|
|
||||||
|
var mergedLR = runMergeScenario(
|
||||||
|
leftSequence,
|
||||||
|
rightSequence
|
||||||
|
);
|
||||||
|
var mergedRL = runMergeScenario(
|
||||||
|
rightSequence,
|
||||||
|
leftSequence
|
||||||
|
);
|
||||||
|
|
||||||
|
var actualLR = getData(mergedLR);
|
||||||
|
var actualRL = getData(mergedRL);
|
||||||
|
|
||||||
|
var expected = simulateMerge(leftSequence, rightSequence);
|
||||||
|
|
||||||
|
assertEquals(actualLR, actualRL);
|
||||||
|
|
||||||
|
if (!expected.equals(actualLR)) {
|
||||||
|
System.out.println("*fail*");
|
||||||
|
System.out.println(leftSequence);
|
||||||
|
System.out.println(rightSequence);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
System.out.println("*pass*");
|
||||||
|
}
|
||||||
|
|
||||||
|
assertEquals(expected, actualLR);
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public List<TestSegmentData> simulateMerge(
|
||||||
|
Collection<EntryDataWithWordMeta> leftInputs,
|
||||||
|
Collection<EntryDataWithWordMeta> rightInputs
|
||||||
|
) {
|
||||||
|
TreeMap<Long, List<DocWithMeta>> wordToDocs = new TreeMap<>();
|
||||||
|
|
||||||
|
for (var entry : leftInputs) {
|
||||||
|
for (var wm : entry.wordIds()) {
|
||||||
|
wordToDocs.computeIfAbsent(wm.wordId(), w -> new ArrayList<>()).add(
|
||||||
|
new DocWithMeta(entry.docId(), wm.meta())
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (var entry : rightInputs) {
|
||||||
|
for (var wm : entry.wordIds()) {
|
||||||
|
wordToDocs.computeIfAbsent(wm.wordId(), w -> new ArrayList<>()).add(
|
||||||
|
new DocWithMeta(entry.docId(), wm.meta())
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
List<TestSegmentData> ret = new ArrayList<>();
|
||||||
|
int[] start = new int[1];
|
||||||
|
wordToDocs.forEach((wordId, docsList) -> {
|
||||||
|
docsList.sort(Comparator.naturalOrder());
|
||||||
|
var iter = docsList.iterator();
|
||||||
|
DocWithMeta prevVal = null;
|
||||||
|
DocWithMeta currentVal;
|
||||||
|
while (iter.hasNext()) {
|
||||||
|
currentVal = iter.next();
|
||||||
|
if (prevVal != null) {
|
||||||
|
if (currentVal.docId == prevVal.docId) {
|
||||||
|
iter.remove();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
prevVal = currentVal;
|
||||||
|
|
||||||
|
}
|
||||||
|
long[] data = new long[docsList.size()*2];
|
||||||
|
for (int i = 0; i < docsList.size(); i++) {
|
||||||
|
data[2*i] = docsList.get(i).docId;
|
||||||
|
data[2*i + 1] = docsList.get(i).meta;
|
||||||
|
}
|
||||||
|
ret.add(new TestSegmentData(wordId, start[0], start[0] + data.length, data));
|
||||||
|
|
||||||
|
start[0] += data.length;
|
||||||
|
});
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
record DocWithMeta(long docId, long meta) implements Comparable<DocWithMeta> {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int compareTo(DocWithMeta o) {
|
||||||
|
return Long.compare(docId, o.docId);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
class IdSequence {
|
||||||
|
Set<Long> seen = new HashSet<>();
|
||||||
|
Map<Long, Long> associatedValues = new HashMap<>();
|
||||||
|
private Random random = new Random();
|
||||||
|
|
||||||
|
/** Return alreadySeen() with probability p,
|
||||||
|
* else nextUnique()
|
||||||
|
*/
|
||||||
|
public long seenWithP(double p) {
|
||||||
|
if (isEmpty() || random.nextDouble() > p)
|
||||||
|
return nextUnique();
|
||||||
|
|
||||||
|
return alreadySeenSameSequence();
|
||||||
|
}
|
||||||
|
|
||||||
|
public long seenWithP(IdSequence other, double p) {
|
||||||
|
if (isEmpty() || random.nextDouble() > p)
|
||||||
|
return nextUnique();
|
||||||
|
|
||||||
|
return alreadySeenOtherSequence(other);
|
||||||
|
}
|
||||||
|
|
||||||
|
public long nextUnique() {
|
||||||
|
for (;;) {
|
||||||
|
long val = random.nextLong();
|
||||||
|
if (seen.add(val)) {
|
||||||
|
return val;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public long nextUniqueAssociatedWithKey(long key) {
|
||||||
|
return associatedValues.computeIfAbsent(key, k -> nextUnique());
|
||||||
|
}
|
||||||
|
|
||||||
|
public long alreadySeenSameSequence() {
|
||||||
|
long[] values = seen.stream().mapToLong(Long::longValue).toArray();
|
||||||
|
int idx = random.nextInt(0, values.length);
|
||||||
|
return values[idx];
|
||||||
|
}
|
||||||
|
|
||||||
|
public long alreadySeenOtherSequence(IdSequence other) {
|
||||||
|
List<Long> values = new ArrayList<>(other.seen);
|
||||||
|
Collections.shuffle(values);
|
||||||
|
for (Long maybe : values) {
|
||||||
|
if (seen.add(maybe))
|
||||||
|
return maybe;
|
||||||
|
}
|
||||||
|
return nextUnique();
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean isEmpty() {
|
||||||
|
return seen.isEmpty();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
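The merge tests above compare a real merge against simulateMerge, whose contract can be restated compactly: concatenate each word's postings from both inputs, sort them by document id, and keep only the first entry for any duplicated id. The sketch below restates that invariant in plain Java under a hypothetical class name; it is a model of the expected behaviour, not the merge implementation itself.

```java
import java.util.*;

// Plain-Java restatement of the invariant checked by the fuzz test above.
public class MergeInvariantSketch {
    record Posting(long docId, long meta) {}

    static Map<Long, List<Posting>> merge(Map<Long, List<Posting>> left,
                                          Map<Long, List<Posting>> right) {
        Map<Long, List<Posting>> out = new TreeMap<>();
        for (var side : List.of(left, right)) {
            side.forEach((word, postings) ->
                    out.computeIfAbsent(word, w -> new ArrayList<>()).addAll(postings));
        }
        for (var postings : out.values()) {
            postings.sort(Comparator.comparingLong(Posting::docId));

            // drop consecutive duplicates by docId, keeping the first occurrence
            var iter = postings.iterator();
            Posting prev = null;
            while (iter.hasNext()) {
                Posting cur = iter.next();
                if (prev != null && prev.docId() == cur.docId()) iter.remove();
                else prev = cur;
            }
        }
        return out;
    }

    public static void main(String[] args) {
        var left  = Map.of(100L, List.of(new Posting(1, 5), new Posting(2, 5)));
        var right = Map.of(100L, List.of(new Posting(3, 5), new Posting(1, 7)));

        System.out.println(merge(left, right)); // word 100 -> docs 1, 2, 3
    }
}
```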
@ -0,0 +1,234 @@
|
|||||||
|
package nu.marginalia.index.construction;
|
||||||
|
|
||||||
|
import nu.marginalia.array.LongArray;
|
||||||
|
import nu.marginalia.array.algo.SortingContext;
|
||||||
|
import org.junit.jupiter.api.AfterEach;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import static nu.marginalia.index.construction.TestJournalFactory.*;
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
|
class ReversePreindexWordSegmentsTest {
|
||||||
|
Path countsFile;
|
||||||
|
Path wordsIdFile;
|
||||||
|
Path docsFile;
|
||||||
|
Path tempDir;
|
||||||
|
|
||||||
|
TestJournalFactory journalFactory;
|
||||||
|
SortingContext sortingContext;
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
public void setUp() throws IOException {
|
||||||
|
journalFactory = new TestJournalFactory();
|
||||||
|
|
||||||
|
countsFile = Files.createTempFile("counts", ".dat");
|
||||||
|
wordsIdFile = Files.createTempFile("words", ".dat");
|
||||||
|
docsFile = Files.createTempFile("docs", ".dat");
|
||||||
|
tempDir = Files.createTempDirectory("sort");
|
||||||
|
sortingContext = new SortingContext(Path.of("invalid"), 1<<20);
|
||||||
|
}
|
||||||
|
|
||||||
|
@AfterEach
|
||||||
|
public void tearDown() throws IOException {
|
||||||
|
journalFactory.clear();
|
||||||
|
|
||||||
|
Files.deleteIfExists(countsFile);
|
||||||
|
Files.deleteIfExists(wordsIdFile);
|
||||||
|
List<Path> contents = new ArrayList<>();
|
||||||
|
Files.list(tempDir).forEach(contents::add);
|
||||||
|
for (var tempFile : contents) {
|
||||||
|
Files.delete(tempFile);
|
||||||
|
}
|
||||||
|
Files.delete(tempDir);
|
||||||
|
}
|
||||||
|
@Test
|
||||||
|
public void testWordSegmentsLongWordId() throws IOException {
|
||||||
|
var reader = journalFactory.createReader(
|
||||||
|
new EntryData(-0xF00BA3L, 0, 1L<<33)
|
||||||
|
);
|
||||||
|
|
||||||
|
var segments = ReversePreindexWordSegments.construct(reader, sortingContext, wordsIdFile, countsFile);
|
||||||
|
var iter = segments.iterator(1);
|
||||||
|
|
||||||
|
List<TestSegmentData> expected = List.of(
|
||||||
|
new TestSegmentData(1L<<33, 0, 1)
|
||||||
|
);
|
||||||
|
|
||||||
|
List<TestSegmentData> actual = new ArrayList<>();
|
||||||
|
|
||||||
|
while (iter.next()) {
|
||||||
|
actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset));
|
||||||
|
}
|
||||||
|
|
||||||
|
assertEquals(expected, actual);
|
||||||
|
}
|
||||||
|
@Test
|
||||||
|
public void testWordSegmentsRepeatedWordId() throws IOException {
|
||||||
|
var reader = journalFactory.createReader(
|
||||||
|
new EntryData(-0xF00BA3L, 0, 5, 5)
|
||||||
|
);
|
||||||
|
|
||||||
|
var segments = ReversePreindexWordSegments.construct(reader, sortingContext, wordsIdFile, countsFile);
|
||||||
|
var iter = segments.iterator(1);
|
||||||
|
|
||||||
|
List<TestSegmentData> expected = List.of(
|
||||||
|
new TestSegmentData(5, 0, 2)
|
||||||
|
);
|
||||||
|
|
||||||
|
List<TestSegmentData> actual = new ArrayList<>();
|
||||||
|
|
||||||
|
while (iter.next()) {
|
||||||
|
actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset));
|
||||||
|
}
|
||||||
|
|
||||||
|
assertEquals(expected, actual);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testWordSegments1() throws IOException {
|
||||||
|
var reader = journalFactory.createReader(
|
||||||
|
new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33)
|
||||||
|
);
|
||||||
|
|
||||||
|
var segments = ReversePreindexWordSegments.construct(reader, sortingContext, wordsIdFile, countsFile);
|
||||||
|
var iter = segments.iterator(1);
|
||||||
|
|
||||||
|
List<TestSegmentData> expected = List.of(
|
||||||
|
new TestSegmentData(-100, 0, 1),
|
||||||
|
new TestSegmentData(10, 1, 2),
|
||||||
|
new TestSegmentData(33, 2, 3),
|
||||||
|
new TestSegmentData(40, 3, 4)
|
||||||
|
);
|
||||||
|
|
||||||
|
List<TestSegmentData> actual = new ArrayList<>();
|
||||||
|
|
||||||
|
while (iter.next()) {
|
||||||
|
actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset));
|
||||||
|
}
|
||||||
|
|
||||||
|
assertEquals(expected, actual);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testWordSegments2() throws IOException {
|
||||||
|
var reader = journalFactory.createReader(
|
||||||
|
new EntryData(-0xF00BA3L, 0, 10, 40, -100, 33),
|
||||||
|
new EntryData(0xF00BA4L, 0, 15, 30, -100, 33)
|
||||||
|
);
|
||||||
|
|
||||||
|
var segments = ReversePreindexWordSegments.construct(reader, sortingContext, wordsIdFile, countsFile);
|
||||||
|
var iter = segments.iterator(1);
|
||||||
|
|
||||||
|
List<TestSegmentData> expected = List.of(
|
||||||
|
new TestSegmentData(-100, 0, 2),
|
||||||
|
new TestSegmentData(10, 2, 3),
|
||||||
|
new TestSegmentData(15, 3, 4),
|
||||||
|
new TestSegmentData(30, 4, 5),
|
||||||
|
new TestSegmentData(33, 5, 7),
|
||||||
|
new TestSegmentData(40, 7, 8)
|
||||||
|
);
|
||||||
|
|
||||||
|
List<TestSegmentData> actual = new ArrayList<>();
|
||||||
|
|
||||||
|
while (iter.next()) {
|
||||||
|
actual.add(new TestSegmentData(iter.wordId, iter.startOffset, iter.endOffset));
|
||||||
|
}
|
||||||
|
|
||||||
|
assertEquals(expected, actual);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testWordSegments_ReadIterator() {
|
||||||
|
LongArray wordsArray = LongArray.allocate(4);
|
||||||
|
LongArray countsArray = LongArray.allocate(4);
|
||||||
|
wordsArray.set(0, -1, -2, -3, -4);
|
||||||
|
countsArray.set(0, 2, 1, 3, 5);
|
||||||
|
var segments = new ReversePreindexWordSegments(wordsArray, countsArray, null, null);
|
||||||
|
|
||||||
|
var ritr = segments.iterator(1);
|
||||||
|
assertTrue(ritr.hasMorePositions());
|
||||||
|
assertTrue(ritr.next());
|
||||||
|
assertTrue(ritr.isPositionBeforeEnd());
|
||||||
|
assertEquals(-1, ritr.wordId);
|
||||||
|
assertEquals(0, ritr.idx());
|
||||||
|
assertEquals(0, ritr.startOffset);
|
||||||
|
assertEquals(2, ritr.endOffset);
|
||||||
|
|
||||||
|
assertTrue(ritr.hasMorePositions());
|
||||||
|
assertTrue(ritr.next());
|
||||||
|
assertTrue(ritr.isPositionBeforeEnd());
|
||||||
|
assertEquals(-2, ritr.wordId);
|
||||||
|
assertEquals(1, ritr.idx());
|
||||||
|
assertEquals(2, ritr.startOffset);
|
||||||
|
assertEquals(3, ritr.endOffset);
|
||||||
|
|
||||||
|
assertTrue(ritr.hasMorePositions());
|
||||||
|
assertTrue(ritr.next());
|
||||||
|
assertTrue(ritr.isPositionBeforeEnd());
|
||||||
|
assertEquals(-3, ritr.wordId);
|
||||||
|
assertEquals(2, ritr.idx());
|
||||||
|
assertEquals(3, ritr.startOffset);
|
||||||
|
assertEquals(6, ritr.endOffset);
|
||||||
|
|
||||||
|
assertTrue(ritr.hasMorePositions());
|
||||||
|
assertTrue(ritr.next());
|
||||||
|
assertTrue(ritr.isPositionBeforeEnd());
|
||||||
|
assertEquals(-4, ritr.wordId);
|
||||||
|
assertEquals(3, ritr.idx());
|
||||||
|
assertEquals(6, ritr.startOffset);
|
||||||
|
assertEquals(11, ritr.endOffset);
|
||||||
|
|
||||||
|
assertFalse(ritr.hasMorePositions());
|
||||||
|
assertFalse(ritr.next());
|
||||||
|
assertFalse(ritr.isPositionBeforeEnd());
|
||||||
|
|
||||||
|
assertEquals(Long.MIN_VALUE, ritr.wordId);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testWordSegments_ConstructionIterator() {
|
||||||
|
LongArray wordsArray = LongArray.allocate(4);
|
||||||
|
LongArray countsArray = LongArray.allocate(4);
|
||||||
|
wordsArray.set(0, -1, -2, -3, -4);
|
||||||
|
var segments = new ReversePreindexWordSegments(wordsArray, countsArray, null, null);
|
||||||
|
|
||||||
|
var citr = segments.constructionIterator(1);
|
||||||
|
assertEquals(-1, citr.wordId);
|
||||||
|
assertEquals(0, citr.idx());
|
||||||
|
assertTrue(citr.canPutMore());
|
||||||
|
assertTrue(citr.putNext(1));
|
||||||
|
assertEquals(1, countsArray.get(0));
|
||||||
|
|
||||||
|
assertEquals(-2, citr.wordId);
|
||||||
|
assertEquals(1, citr.idx());
|
||||||
|
assertTrue(citr.canPutMore());
|
||||||
|
assertTrue(citr.putNext(2));
|
||||||
|
assertEquals(2, countsArray.get(1));
|
||||||
|
|
||||||
|
assertEquals(-3, citr.wordId);
|
||||||
|
assertEquals(2, citr.idx());
|
||||||
|
assertTrue(citr.canPutMore());
|
||||||
|
assertTrue(citr.putNext(3));
|
||||||
|
assertEquals(3, countsArray.get(2));
|
||||||
|
|
||||||
|
assertEquals(-4, citr.wordId);
|
||||||
|
assertEquals(3, citr.idx());
|
||||||
|
assertTrue(citr.canPutMore());
|
||||||
|
assertFalse(citr.putNext(4));
|
||||||
|
assertEquals(4, countsArray.get(3));
|
||||||
|
|
||||||
|
assertEquals(4, citr.idx());
|
||||||
|
assertFalse(citr.canPutMore());
|
||||||
|
assertEquals(Long.MIN_VALUE, citr.wordId);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
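The iterator tests above pin down the segment arithmetic: boundaries are an exclusive prefix sum over the per-word counts, so the counts {2, 1, 3, 5} used in testWordSegments_ReadIterator produce the ranges [0,2), [2,3), [3,6), [6,11). A minimal sketch of that arithmetic, under a hypothetical class name:

```java
// Running prefix sum over per-word counts, as asserted by the read-iterator test.
public class SegmentOffsetsSketch {
    public static void main(String[] args) {
        long[] wordIds = { -1, -2, -3, -4 };
        long[] counts  = {  2,  1,  3,  5 };

        long start = 0;
        for (int i = 0; i < wordIds.length; i++) {
            long end = start + counts[i];
            System.out.printf("word %d -> [%d, %d)%n", wordIds[i], start, end);
            start = end;
        }
    }
}
```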
@ -0,0 +1,93 @@
package nu.marginalia.index.construction;

import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;
import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class TestJournalFactory {
    Path tempDir = Files.createTempDirectory("journal");

    public TestJournalFactory() throws IOException {}

    public void clear() throws IOException {
        List<Path> toDelete = new ArrayList<>();
        try (var dirStream = Files.list(tempDir)) {
            dirStream.forEach(toDelete::add);
        }
        for (var tempFile : toDelete) {
            Files.delete(tempFile);
        }
        Files.delete(tempDir);
    }

    public record EntryData(long docId, long docMeta, long... wordIds) {
        @Override
        public String toString() {
            return "EntryData{" +
                    "docId=" + docId +
                    ", docMeta=" + docMeta +
                    ", wordIds=" + Arrays.toString(wordIds) +
                    '}';
        }
    }
    public record EntryDataWithWordMeta(long docId, long docMeta, WordWithMeta... wordIds) {
        @Override
        public String toString() {
            return "EntryDataWithWordMeta{" +
                    "docId=" + docId +
                    ", docMeta=" + docMeta +
                    ", wordIds=" + Arrays.toString(wordIds) +
                    '}';
        }
    }
    public record WordWithMeta(long wordId, long meta) {}

    public static WordWithMeta wm(long wordId, long meta) {
        return new WordWithMeta(wordId, meta);
    }

    IndexJournalReader createReader(EntryData... entries) throws IOException {
        Path jf = Files.createTempFile(tempDir, "journal", ".dat");

        var writer = new IndexJournalWriterSingleFileImpl(jf);
        for (var entry : entries) {
            long[] data = new long[entry.wordIds.length * 2];
            for (int i = 0; i < entry.wordIds.length; i++)
                data[i*2] = entry.wordIds[i];

            writer.put(new IndexJournalEntryHeader(entries.length, 0, entry.docId, entry.docMeta),
                    new IndexJournalEntryData(data));
        }
        writer.close();
        var ret = new IndexJournalReaderSingleCompressedFile(jf);
        return ret;
    }

    public IndexJournalReader createReader(EntryDataWithWordMeta... entries) throws IOException {
        Path jf = Files.createTempFile(tempDir, "journal", ".dat");

        var writer = new IndexJournalWriterSingleFileImpl(jf);
        for (var entry : entries) {
            long[] data = new long[entry.wordIds.length * 2];
            for (int i = 0; i < entry.wordIds.length; i++) {
                data[i * 2] = entry.wordIds[i].wordId;
                data[i * 2 + 1] = entry.wordIds[i].meta;
            }

            writer.put(new IndexJournalEntryHeader(entries.length, 0, entry.docId, entry.docMeta),
                    new IndexJournalEntryData(data));
        }
        writer.close();
        var ret = new IndexJournalReaderSingleCompressedFile(jf);
        return ret;
    }
}
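As both createReader overloads above show, each entry's words are handed to the journal writer as one interleaved long array of (wordId, meta) pairs, with the metadata slot left at zero for plain EntryData. A stand-alone sketch of just that packing step (hypothetical class name, no Marginalia dependencies):

```java
import java.util.Arrays;

// Interleave word ids and word metadata into a single long[] before writing,
// mirroring the loop bodies of the createReader methods above.
public class JournalPackingSketch {
    public static void main(String[] args) {
        long[] wordIds   = { 50, 51 };
        long[] wordMetas = { 5, 7 };

        long[] packed = new long[wordIds.length * 2];
        for (int i = 0; i < wordIds.length; i++) {
            packed[2 * i]     = wordIds[i];
            packed[2 * i + 1] = wordMetas[i];
        }

        System.out.println(Arrays.toString(packed)); // [50, 5, 51, 7]
    }
}
```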
@ -0,0 +1,41 @@
package nu.marginalia.index.construction;

import java.util.Arrays;

record TestSegmentData(long wordId, long start, long end, long[] data) {
    public TestSegmentData(long wordId, long start, long end) {
        this(wordId, start, end, null);
    }

    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
        if (o == null || getClass() != o.getClass()) return false;

        TestSegmentData that = (TestSegmentData) o;

        if (wordId != that.wordId) return false;
        if (start != that.start) return false;
        if (end != that.end) return false;
        return Arrays.equals(data, that.data);
    }

    @Override
    public int hashCode() {
        int result = (int) (wordId ^ (wordId >>> 32));
        result = 31 * result + (int) (start ^ (start >>> 32));
        result = 31 * result + (int) (end ^ (end >>> 32));
        result = 31 * result + Arrays.hashCode(data);
        return result;
    }

    @Override
    public String toString() {
        return "TestSegmentData{" +
                "wordId=" + wordId +
                ", start=" + start +
                ", end=" + end +
                ", data=" + Arrays.toString(data) +
                '}';
    }
}
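A note on why TestSegmentData hand-rolls equals and hashCode: the equals a record generates compares an array component by reference, so two instances holding equal long[] contents would otherwise never compare equal, and the assertEquals calls in the tests above would fail spuriously. A small demonstration of the default behaviour (hypothetical class name):

```java
import java.util.Arrays;

// Records compare array components by reference, not by content.
public class RecordArrayEqualitySketch {
    record WithDefaultEquals(long[] data) {}

    public static void main(String[] args) {
        var a = new WithDefaultEquals(new long[] { 1, 2 });
        var b = new WithDefaultEquals(new long[] { 1, 2 });

        System.out.println(a.equals(b));                       // false: reference comparison
        System.out.println(Arrays.equals(a.data(), b.data())); // true: content comparison
    }
}
```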
@ -1,146 +0,0 @@
|
|||||||
package nu.marginalia.index.reverse;
|
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
|
||||||
import nu.marginalia.array.buffer.LongQueryBuffer;
|
|
||||||
import nu.marginalia.index.full.ReverseIndexFullConverter;
|
|
||||||
import nu.marginalia.index.full.ReverseIndexFullReader;
|
|
||||||
import nu.marginalia.index.journal.model.IndexJournalEntry;
|
|
||||||
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;
|
|
||||||
import nu.marginalia.index.journal.writer.IndexJournalWriterImpl;
|
|
||||||
import nu.marginalia.index.journal.writer.IndexJournalWriter;
|
|
||||||
import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode;
|
|
||||||
import nu.marginalia.model.id.UrlIdCodec;
|
|
||||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
|
||||||
import nu.marginalia.process.control.ProcessTaskHeartbeat;
|
|
||||||
import nu.marginalia.ranking.DomainRankings;
|
|
||||||
import nu.marginalia.lexicon.KeywordLexicon;
|
|
||||||
import nu.marginalia.lexicon.journal.KeywordLexiconJournal;
|
|
||||||
import nu.marginalia.model.idx.DocumentMetadata;
|
|
||||||
import nu.marginalia.test.TestUtil;
|
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
import org.mockito.Mockito;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
import java.util.stream.IntStream;
|
|
||||||
import java.util.stream.LongStream;
|
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertArrayEquals;
|
|
||||||
import static org.mockito.Mockito.when;
|
|
||||||
|
|
||||||
class ReverseIndexFullConverterTest {
|
|
||||||
KeywordLexicon keywordLexicon;
|
|
||||||
|
|
||||||
Path indexFile;
|
|
||||||
Path wordsFile1;
|
|
||||||
Path urlsFile1;
|
|
||||||
Path dictionaryFile;
|
|
||||||
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
|
|
||||||
@BeforeEach
|
|
||||||
@SneakyThrows
|
|
||||||
void setUp() {
|
|
||||||
dictionaryFile = Files.createTempFile("tmp", ".dict");
|
|
||||||
dictionaryFile.toFile().deleteOnExit();
|
|
||||||
|
|
||||||
keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile(), KeywordLexiconJournalMode.READ_WRITE));
|
|
||||||
keywordLexicon.getOrInsert("0");
|
|
||||||
|
|
||||||
indexFile = Files.createTempFile("tmp", ".idx");
|
|
||||||
indexFile.toFile().deleteOnExit();
|
|
||||||
|
|
||||||
|
|
||||||
wordsFile1 = Files.createTempFile("words1", ".idx");
|
|
||||||
urlsFile1 = Files.createTempFile("urls1", ".idx");
|
|
||||||
}
|
|
||||||
|
|
||||||
public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) {
|
|
||||||
int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
|
|
||||||
|
|
||||||
var entryBuilder = IndexJournalEntry.builder(id, DocumentMetadata.defaultValue());
|
|
||||||
|
|
||||||
for (int i = 0; i < factors.length; i++) {
|
|
||||||
entryBuilder.add(keywordLexicon.getOrInsert(Integer.toString(factors[i])), -factors[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
writer.put(entryBuilder.build());
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void testReverseIndex() throws IOException {
|
|
||||||
var writer = new IndexJournalWriterImpl(keywordLexicon, indexFile);
|
|
||||||
|
|
||||||
for (int i = 1; i < 512; i++) {
|
|
||||||
createEntry(writer, keywordLexicon, i);
|
|
||||||
}
|
|
||||||
|
|
||||||
writer.close();
|
|
||||||
|
|
||||||
|
|
||||||
Path tmpDir = Path.of("/tmp");
|
|
||||||
Path dataDir = Files.createTempDirectory(getClass().getSimpleName());
|
|
||||||
|
|
||||||
var wordsFile = dataDir.resolve("urls.dat");
|
|
||||||
var docsFile = dataDir.resolve("docs.dat");
|
|
||||||
var journalReader = new IndexJournalReaderSingleCompressedFile(indexFile);
|
|
||||||
|
|
||||||
// RIP fairies
|
|
||||||
var processHeartbeat = Mockito.mock(ProcessHeartbeat.class);
|
|
||||||
when(processHeartbeat.createProcessTaskHeartbeat(Mockito.any(), Mockito.any()))
|
|
||||||
.thenReturn(Mockito.mock(ProcessTaskHeartbeat.class));
|
|
||||||
|
|
||||||
new ReverseIndexFullConverter(
|
|
||||||
processHeartbeat,
|
|
||||||
tmpDir, journalReader, new DomainRankings(), wordsFile, docsFile)
|
|
||||||
.convert();
|
|
||||||
|
|
||||||
var reverseIndexReader = new ReverseIndexFullReader(wordsFile, docsFile);
|
|
||||||
|
|
||||||
System.out.println(reverseIndexReader.numDocuments(keywordLexicon.getReadOnly("1")));
|
|
||||||
System.out.println(reverseIndexReader.numDocuments(keywordLexicon.getReadOnly("2")));
|
|
||||||
System.out.println(reverseIndexReader.numDocuments(keywordLexicon.getReadOnly("3")));
|
|
||||||
|
|
||||||
System.out.println(reverseIndexReader.isWordInDoc(keywordLexicon.getReadOnly("1"), 1));
|
|
||||||
System.out.println(reverseIndexReader.isWordInDoc(keywordLexicon.getReadOnly("2"), 1));
|
|
||||||
System.out.println(reverseIndexReader.isWordInDoc(keywordLexicon.getReadOnly("1"), 2));
|
|
||||||
System.out.println(reverseIndexReader.isWordInDoc(keywordLexicon.getReadOnly("2"), 2));
|
|
||||||
System.out.println(reverseIndexReader.isWordInDoc(keywordLexicon.getReadOnly("1"), 3));
|
|
||||||
System.out.println(reverseIndexReader.isWordInDoc(keywordLexicon.getReadOnly("2"), 3));
|
|
||||||
|
|
||||||
var buffer = new LongQueryBuffer(32);
|
|
||||||
reverseIndexReader.documents(keywordLexicon.getReadOnly("1")).read(buffer);
|
|
||||||
assertArrayEquals(LongStream.range(1, 17).map(this::addMaxRank).toArray(), buffer.copyData());
|
|
||||||
System.out.println(buffer);
|
|
||||||
|
|
||||||
buffer.reset();
|
|
||||||
reverseIndexReader.documents(keywordLexicon.getReadOnly("2")).read(buffer);
|
|
||||||
assertArrayEquals(LongStream.range(1, 17).map(v -> v*2).map(this::addMaxRank).toArray(), buffer.copyData());
|
|
||||||
System.out.println(buffer);
|
|
||||||
|
|
||||||
buffer.reset();
|
|
||||||
reverseIndexReader.documents(keywordLexicon.getReadOnly("3")).read(buffer);
|
|
||||||
assertArrayEquals(LongStream.range(1, 17).map(v -> v*3).map(this::addMaxRank).toArray(), buffer.copyData());
|
|
||||||
System.out.println(buffer);
|
|
||||||
|
|
||||||
buffer.reset();
|
|
||||||
var es = reverseIndexReader.documents(keywordLexicon.getReadOnly("7"));
|
|
||||||
do {
|
|
||||||
buffer.reset();
|
|
||||||
es.read(buffer);
|
|
||||||
System.out.println(buffer);
|
|
||||||
} while (es.hasMore());
|
|
||||||
|
|
||||||
|
|
||||||
TestUtil.clearTempDir(dataDir);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Add a max domain rank component to the input, when interpreted as an ID
|
|
||||||
private long addMaxRank(long in) {
|
|
||||||
return UrlIdCodec.addRank(1f, in);
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,178 +0,0 @@
|
|||||||
package nu.marginalia.index.reverse;
|
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
|
||||||
import nu.marginalia.array.buffer.LongQueryBuffer;
|
|
||||||
import nu.marginalia.index.full.ReverseIndexFullConverter;
|
|
||||||
import nu.marginalia.index.full.ReverseIndexFullReader;
|
|
||||||
import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
|
||||||
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
|
|
||||||
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;
|
|
||||||
import nu.marginalia.index.journal.writer.IndexJournalWriterImpl;
|
|
||||||
import nu.marginalia.index.journal.writer.IndexJournalWriter;
|
|
||||||
import nu.marginalia.index.priority.ReverseIndexPriorityParameters;
|
|
||||||
import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode;
|
|
||||||
import nu.marginalia.model.id.UrlIdCodec;
|
|
||||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
|
||||||
import nu.marginalia.process.control.ProcessTaskHeartbeat;
|
|
||||||
import nu.marginalia.ranking.DomainRankings;
|
|
||||||
import nu.marginalia.lexicon.KeywordLexicon;
|
|
||||||
import nu.marginalia.lexicon.journal.KeywordLexiconJournal;
|
|
||||||
import nu.marginalia.test.TestUtil;
|
|
||||||
import org.junit.jupiter.api.AfterEach;
|
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
import org.mockito.Mockito;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.stream.IntStream;
|
|
||||||
import java.util.stream.LongStream;
|
|
||||||
|
|
||||||
import static org.mockito.Mockito.when;
|
|
||||||
|
|
||||||
class ReverseIndexFullConverterTest2 {
|
|
||||||
|
|
||||||
KeywordLexicon keywordLexicon;
|
|
||||||
IndexJournalWriter writer;
|
|
||||||
|
|
||||||
Path indexFile;
|
|
||||||
Path wordsFile1;
|
|
||||||
Path urlsFile1;
|
|
||||||
Path dictionaryFile;
|
|
||||||
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
|
|
||||||
Path dataDir;
|
|
||||||
private Path wordsFile;
|
|
||||||
private Path docsFile;
|
|
||||||
|
|
||||||
int workSetSize = 8192;
|
|
||||||
int workSetStart = 8000;
|
|
||||||
|
|
||||||
@BeforeEach
|
|
||||||
@SneakyThrows
|
|
||||||
void setUp() {
|
|
||||||
dictionaryFile = Files.createTempFile("tmp", ".dict");
|
|
||||||
dictionaryFile.toFile().deleteOnExit();
|
|
||||||
|
|
||||||
keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile(), KeywordLexiconJournalMode.READ_WRITE));
|
|
||||||
keywordLexicon.getOrInsert("0");
|
|
||||||
|
|
||||||
indexFile = Files.createTempFile("tmp", ".idx");
|
|
||||||
indexFile.toFile().deleteOnExit();
|
|
||||||
writer = new IndexJournalWriterImpl(keywordLexicon, indexFile);
|
|
||||||
|
|
||||||
wordsFile1 = Files.createTempFile("words1", ".idx");
|
|
||||||
urlsFile1 = Files.createTempFile("urls1", ".idx");
|
|
||||||
|
|
||||||
dataDir = Files.createTempDirectory(getClass().getSimpleName());
|
|
||||||
|
|
||||||
for (int i = 1; i < workSetSize; i++) {
|
|
||||||
if (i < workSetStart) {
|
|
||||||
keywordLexicon.getOrInsert(Integer.toString(i));
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
createEntry(writer, keywordLexicon, i);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
keywordLexicon.commitToDisk();
|
|
||||||
Thread.sleep(1000);
|
|
||||||
writer.close();
|
|
||||||
|
|
||||||
var reader = new IndexJournalReaderSingleCompressedFile(indexFile);
|
|
||||||
|
|
||||||
wordsFile = dataDir.resolve("words.dat");
|
|
||||||
docsFile = dataDir.resolve("docs.dat");
|
|
||||||
}
|
|
||||||
|
|
||||||
@AfterEach
|
|
||||||
public void tearDown() {
|
|
||||||
TestUtil.clearTempDir(dataDir);
|
|
||||||
}
|
|
||||||
|
|
||||||
public int[] getFactorsI(int id) {
|
|
||||||
return IntStream.rangeClosed(1, id-1).toArray();
|
|
||||||
}
|
|
||||||
public long[] getFactorsL(int id) {
|
|
||||||
return LongStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
|
|
||||||
}
|
|
||||||
|
|
||||||
long createId(int url, int domain) {
|
|
||||||
return UrlIdCodec.encodeId(domain, url);
|
|
||||||
}
|
|
||||||
public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) {
|
|
||||||
int[] factors = getFactorsI(id);
|
|
||||||
var header = new IndexJournalEntryHeader(factors.length, 0, createId(id, id/20), id % 5);
|
|
||||||
|
|
||||||
long[] data = new long[factors.length*2];
|
|
||||||
for (int i = 0; i < factors.length; i++) {
|
|
||||||
data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i]));
|
|
||||||
data[2*i + 1] = (i % 21 != 0) ? 0 : -factors[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
writer.put(header, new IndexJournalEntryData(data));
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void testRev2() throws IOException {
|
|
||||||
|
|
||||||
Path tmpDir = Path.of("/tmp");
|
|
||||||
|
|
||||||
var processHeartbeat = Mockito.mock(ProcessHeartbeat.class);
|
|
||||||
when(processHeartbeat.createProcessTaskHeartbeat(Mockito.any(), Mockito.any()))
|
|
||||||
.thenReturn(Mockito.mock(ProcessTaskHeartbeat.class));
|
|
||||||
|
|
||||||
new ReverseIndexFullConverter(processHeartbeat, tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile), new DomainRankings(), wordsFile, docsFile).convert();
|
|
||||||
|
|
||||||
var reverseReader = new ReverseIndexFullReader(wordsFile, docsFile);
|
|
||||||
|
|
||||||
for (int i = workSetStart; i < workSetSize; i++) {
|
|
||||||
|
|
||||||
var es = reverseReader.documents(i);
|
|
||||||
LongQueryBuffer lqb = new LongQueryBuffer(100);
|
|
||||||
while (es.hasMore()) {
|
|
||||||
lqb.reset();
|
|
||||||
es.read(lqb);
|
|
||||||
System.out.println(Arrays.toString(Arrays.copyOf(lqb.data, lqb.end)));
|
|
||||||
}
|
|
||||||
System.out.println("--");
|
|
||||||
}
|
|
||||||
|
|
||||||
TestUtil.clearTempDir(dataDir);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void testRevP() throws IOException {
|
|
||||||
|
|
||||||
Path tmpDir = Path.of("/tmp");
|
|
||||||
|
|
||||||
var processHeartbeat = Mockito.mock(ProcessHeartbeat.class);
|
|
||||||
when(processHeartbeat.createProcessTaskHeartbeat(Mockito.any(), Mockito.any()))
|
|
||||||
.thenReturn(Mockito.mock(ProcessTaskHeartbeat.class));
|
|
||||||
|
|
||||||
new ReverseIndexFullConverter(processHeartbeat, tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile, null, ReverseIndexPriorityParameters::filterPriorityRecord), new DomainRankings(), wordsFile, docsFile).convert();
|
|
||||||
|
|
||||||
var reverseReader = new ReverseIndexFullReader(wordsFile, docsFile);
|
|
||||||
|
|
||||||
for (int i = workSetStart; i < workSetSize; i++) {
|
|
||||||
|
|
||||||
var es = reverseReader.documents(i);
|
|
||||||
LongQueryBuffer lqb = new LongQueryBuffer(100);
|
|
||||||
while (es.hasMore()) {
|
|
||||||
lqb.reset();
|
|
||||||
es.read(lqb);
|
|
||||||
System.out.println(Arrays.toString(Arrays.copyOf(lqb.data, lqb.end)));
|
|
||||||
}
|
|
||||||
System.out.println("--");
|
|
||||||
}
|
|
||||||
|
|
||||||
TestUtil.clearTempDir(dataDir);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,179 +0,0 @@
|
|||||||
package nu.marginalia.index.reverse;
|
|
||||||
|
|
||||||
import lombok.SneakyThrows;
|
|
||||||
import nu.marginalia.array.buffer.LongQueryBuffer;
|
|
||||||
import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
|
||||||
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
|
|
||||||
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;
|
|
||||||
import nu.marginalia.index.journal.writer.IndexJournalWriter;
|
|
||||||
import nu.marginalia.index.journal.writer.IndexJournalWriterImpl;
|
|
||||||
import nu.marginalia.index.priority.ReverseIndexPriorityReader;
|
|
||||||
import nu.marginalia.index.priority.ReverseIndexPriorityConverter;
|
|
||||||
import nu.marginalia.index.priority.ReverseIndexPriorityParameters;
|
|
||||||
import nu.marginalia.lexicon.KeywordLexicon;
|
|
||||||
import nu.marginalia.lexicon.journal.KeywordLexiconJournal;
|
|
||||||
import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode;
|
|
||||||
import nu.marginalia.model.id.UrlIdCodec;
|
|
||||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
|
||||||
import nu.marginalia.process.control.ProcessTaskHeartbeat;
|
|
||||||
import nu.marginalia.ranking.DomainRankings;
|
|
||||||
import nu.marginalia.test.TestUtil;
|
|
||||||
import org.junit.jupiter.api.AfterEach;
|
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
import org.mockito.Mockito;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.stream.IntStream;
|
|
||||||
import java.util.stream.LongStream;
|
|
||||||
|
|
||||||
import static org.mockito.Mockito.when;
|
|
||||||
|
|
||||||
class ReverseIndexPriorityConverterTest2 {
|
|
||||||
|
|
||||||
KeywordLexicon keywordLexicon;
|
|
||||||
IndexJournalWriter writer;
|
|
||||||
|
|
||||||
Path indexFile;
|
|
||||||
Path wordsFile1;
|
|
||||||
Path urlsFile1;
|
|
||||||
Path dictionaryFile;
|
|
||||||
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
|
|
||||||
Path dataDir;
|
|
||||||
private Path wordsFile;
|
|
||||||
private Path docsFile;
|
|
||||||
|
|
||||||
int workSetSize = 8192;
|
|
||||||
int workSetStart = 8000;
|
|
||||||
|
|
||||||
@BeforeEach
|
|
||||||
@SneakyThrows
|
|
||||||
void setUp() {
|
|
||||||
dictionaryFile = Files.createTempFile("tmp", ".dict");
|
|
||||||
dictionaryFile.toFile().deleteOnExit();
|
|
||||||
|
|
||||||
keywordLexicon = new KeywordLexicon(new KeywordLexiconJournal(dictionaryFile.toFile(), KeywordLexiconJournalMode.READ_WRITE));
|
|
||||||
keywordLexicon.getOrInsert("0");
|
|
||||||
|
|
||||||
indexFile = Files.createTempFile("tmp", ".idx");
|
|
||||||
indexFile.toFile().deleteOnExit();
|
|
||||||
writer = new IndexJournalWriterImpl(keywordLexicon, indexFile);
|
|
||||||
|
|
||||||
wordsFile1 = Files.createTempFile("words1", ".idx");
|
|
||||||
urlsFile1 = Files.createTempFile("urls1", ".idx");
|
|
||||||
|
|
||||||
dataDir = Files.createTempDirectory(getClass().getSimpleName());
|
|
||||||
|
|
||||||
for (int i = 1; i < workSetSize; i++) {
|
|
||||||
if (i < workSetStart) {
|
|
||||||
keywordLexicon.getOrInsert(Integer.toString(i));
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
createEntry(writer, keywordLexicon, i);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
keywordLexicon.commitToDisk();
|
|
||||||
Thread.sleep(1000);
|
|
||||||
writer.close();
|
|
||||||
|
|
||||||
var reader = new IndexJournalReaderSingleCompressedFile(indexFile);
|
|
||||||
|
|
||||||
wordsFile = dataDir.resolve("words.dat");
|
|
||||||
docsFile = dataDir.resolve("docs.dat");
|
|
||||||
}
|
|
||||||
|
|
||||||
@AfterEach
|
|
||||||
public void tearDown() {
|
|
||||||
TestUtil.clearTempDir(dataDir);
|
|
||||||
}
|
|
||||||
|
|
||||||
public int[] getFactorsI(int id) {
|
|
||||||
return IntStream.rangeClosed(1, id-1).toArray();
|
|
||||||
}
|
|
||||||
public long[] getFactorsL(int id) {
|
|
||||||
return LongStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
|
|
||||||
}
|
|
||||||
|
|
||||||
long createId(int url, int domain) {
|
|
||||||
return UrlIdCodec.encodeId(domain, url);
|
|
||||||
}
|
|
||||||
public void createEntry(IndexJournalWriter writer, KeywordLexicon keywordLexicon, int id) {
|
|
||||||
int[] factors = getFactorsI(id);
|
|
||||||
var header = new IndexJournalEntryHeader(factors.length, 0, createId(id, id/20), id % 5);
|
|
||||||
|
|
||||||
long[] data = new long[factors.length*2];
|
|
||||||
for (int i = 0; i < factors.length; i++) {
|
|
||||||
data[2*i] = keywordLexicon.getOrInsert(Integer.toString(factors[i]));
|
|
||||||
data[2*i + 1] = (i % 21 != 0) ? 0 : -factors[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
writer.put(header, new IndexJournalEntryData(data));
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void testRev2() throws IOException {
|
|
||||||
|
|
||||||
Path tmpDir = Path.of("/tmp");
|
|
||||||
|
|
||||||
var processHeartbeat = Mockito.mock(ProcessHeartbeat.class);
|
|
||||||
when(processHeartbeat.createProcessTaskHeartbeat(Mockito.any(), Mockito.any()))
|
|
||||||
.thenReturn(Mockito.mock(ProcessTaskHeartbeat.class));
|
|
||||||
|
|
||||||
new ReverseIndexPriorityConverter(processHeartbeat, tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile), new DomainRankings(), wordsFile, docsFile).convert();
|
|
||||||
|
|
||||||
var reverseReader = new ReverseIndexPriorityReader(wordsFile, docsFile);
|
|
||||||
|
|
||||||
for (int i = workSetStart; i < workSetSize; i++) {
|
|
||||||
|
|
||||||
var es = reverseReader.priorityDocuments(i);
|
|
||||||
LongQueryBuffer lqb = new LongQueryBuffer(100);
|
|
||||||
while (es.hasMore()) {
|
|
||||||
lqb.reset();
|
|
||||||
es.read(lqb);
|
|
||||||
System.out.println(Arrays.toString(Arrays.copyOf(lqb.data, lqb.end)));
|
|
||||||
}
|
|
||||||
System.out.println("--");
|
|
||||||
}
|
|
||||||
|
|
||||||
TestUtil.clearTempDir(dataDir);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
@Test
|
|
||||||
void testRevP() throws IOException {
|
|
||||||
|
|
||||||
Path tmpDir = Path.of("/tmp");
|
|
||||||
|
|
||||||
|
|
||||||
var processHeartbeat = Mockito.mock(ProcessHeartbeat.class);
|
|
||||||
when(processHeartbeat.createProcessTaskHeartbeat(Mockito.any(), Mockito.any()))
|
|
||||||
.thenReturn(Mockito.mock(ProcessTaskHeartbeat.class));
|
|
||||||
|
|
||||||
new ReverseIndexPriorityConverter(processHeartbeat, tmpDir, new IndexJournalReaderSingleCompressedFile(indexFile, null, ReverseIndexPriorityParameters::filterPriorityRecord), new DomainRankings(), wordsFile, docsFile).convert();
|
|
||||||
|
|
||||||
var reverseReader = new ReverseIndexPriorityReader(wordsFile, docsFile);
|
|
||||||
|
|
||||||
for (int i = workSetStart; i < workSetSize; i++) {
|
|
||||||
|
|
||||||
var es = reverseReader.priorityDocuments(i);
|
|
||||||
LongQueryBuffer lqb = new LongQueryBuffer(100);
|
|
||||||
while (es.hasMore()) {
|
|
||||||
lqb.reset();
|
|
||||||
es.read(lqb);
|
|
||||||
System.out.println(Arrays.toString(Arrays.copyOf(lqb.data, lqb.end)));
|
|
||||||
}
|
|
||||||
System.out.println("--");
|
|
||||||
}
|
|
||||||
|
|
||||||
TestUtil.clearTempDir(dataDir);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -1,40 +0,0 @@
plugins {
    id 'java'
    id "io.freefair.lombok" version "8.2.2"

    id 'jvm-test-suite'
}

java {
    toolchain {
        languageVersion.set(JavaLanguageVersion.of(20))
    }
}

dependencies {

    implementation project(':code:libraries:next-prime')
    implementation libs.lombok
    annotationProcessor libs.lombok
    implementation libs.bundles.slf4j

    implementation libs.prometheus
    implementation libs.guava
    implementation libs.fastutil
    implementation project(':third-party:commons-codec')

    testImplementation libs.bundles.slf4j.test
    testImplementation libs.bundles.junit
    testImplementation libs.mockito
}

test {
    useJUnitPlatform()
}

task fastTests(type: Test) {
    useJUnitPlatform {
        excludeTags "slow"
    }
}
@ -1,19 +0,0 @@
# Lexicon

The lexicon contains a mapping from words to identifiers.

Index construction is simpler if the domain of word identifiers is dense, that is, there are no gaps between ids: if there are 100 words, they're indexed 0-99 and not 5, 23, 107, 9999, 819235 etc. The lexicon exists to create such a mapping.

This lexicon is populated from a journal. The actual word data isn't mapped, but rather a 64 bit hash. As a result of the <a href="https://en.wikipedia.org/wiki/Birthday_problem">birthday paradox</a>, collisions will be rare up until about 2<sup>32</sup> words.
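
For illustration, a minimal sketch of the dense-id assignment described above. The class name and the stand-in hash are hypothetical; the real lexicon hashes keywords with MurmurHash3_128 and keeps the mapping in a DictionaryMap.

```java
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

// Illustrative only: assigns dense int ids to 64 bit keyword hashes,
// in the spirit of what the lexicon provides.
class DenseIdSketch {
    private final Map<Long, Integer> idsByHash = new HashMap<>();

    int getOrInsert(String word) {
        long hash = hash64(word.getBytes(StandardCharsets.UTF_8));
        // ids are handed out as 0, 1, 2, ... in insertion order, so the id domain stays dense
        return idsByHash.computeIfAbsent(hash, h -> idsByHash.size());
    }

    // stand-in hash; not the murmur hash the real code uses
    private long hash64(byte[] bytes) {
        long h = 1125899906842597L;
        for (byte b : bytes) h = 31 * h + b;
        return h;
    }
}
```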

The lexicon is constructed by [processes/loading-process](../../processes/loading-process) and read when
[services-core/index-service](../../services-core/index-service) interprets queries.

## Central Classes

* [KeywordLexicon](src/main/java/nu/marginalia/lexicon/KeywordLexicon.java)
* [KeywordLexiconJournal](src/main/java/nu/marginalia/lexicon/journal/KeywordLexiconJournal.java)
* [DictionaryMap](src/main/java/nu/marginalia/dict/DictionaryMap.java) comes in two versions
* * [OnHeapDictionaryMap](src/main/java/nu/marginalia/dict/OnHeapDictionaryMap.java) - basically just a fastutil Long2IntOpenHashMap
* * [OffHeapDictionaryHashMap](src/main/java/nu/marginalia/dict/OffHeapDictionaryHashMap.java) - a heavily modified trove TLongIntHashMap that uses off heap memory
@ -1,42 +0,0 @@
|
|||||||
package nu.marginalia.dict;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
|
|
||||||
public class DictionaryData {
|
|
||||||
private final int bankSize;
|
|
||||||
|
|
||||||
private final ArrayList<DictionaryDataBank> banks = new ArrayList<>(100);
|
|
||||||
|
|
||||||
public DictionaryData(int bankSize) {
|
|
||||||
this.bankSize = bankSize;
|
|
||||||
banks.add(new DictionaryDataBank(0, bankSize));
|
|
||||||
}
|
|
||||||
|
|
||||||
public int add(long key) {
|
|
||||||
var activeBank = banks.get(banks.size()-1);
|
|
||||||
int rb = activeBank.add(key);
|
|
||||||
|
|
||||||
if (rb == -1) {
|
|
||||||
int end = activeBank.getEnd();
|
|
||||||
var newBank = new DictionaryDataBank(end, bankSize);
|
|
||||||
rb = newBank.add(key);
|
|
||||||
|
|
||||||
banks.add(newBank);
|
|
||||||
}
|
|
||||||
|
|
||||||
return rb;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public long getKey(int offset) {
|
|
||||||
return banks.get(offset/ bankSize).getKey(offset);
|
|
||||||
}
|
|
||||||
public boolean keyEquals(int offset, long otherKey) {
|
|
||||||
return banks.get(offset/ bankSize).keyEquals(offset, otherKey);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void clear() {
|
|
||||||
banks.clear();
|
|
||||||
banks.add(new DictionaryDataBank(0, bankSize));
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,63 +0,0 @@
|
|||||||
package nu.marginalia.dict;
|
|
||||||
|
|
||||||
import java.nio.ByteBuffer;
|
|
||||||
import java.nio.LongBuffer;
|
|
||||||
|
|
||||||
class DictionaryDataBank {
|
|
||||||
|
|
||||||
private final int start_idx;
|
|
||||||
|
|
||||||
// Humongous long-lived arrays seem to sometimes yield considerable memory overhead and
|
|
||||||
// can make the GC behave poorly. Using off-heap memory seems preferred when their
|
|
||||||
// lifetime is "forever"
|
|
||||||
|
|
||||||
private final LongBuffer keys;
|
|
||||||
|
|
||||||
private int size;
|
|
||||||
private final int capacity;
|
|
||||||
|
|
||||||
|
|
||||||
public DictionaryDataBank(int start_idx, int sz) {
|
|
||||||
this.start_idx = start_idx;
|
|
||||||
this.capacity = sz;
|
|
||||||
|
|
||||||
keys = ByteBuffer.allocateDirect(8 * capacity).asLongBuffer();
|
|
||||||
size = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getStart() {
|
|
||||||
return start_idx;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getEnd() {
|
|
||||||
return start_idx + size;
|
|
||||||
}
|
|
||||||
|
|
||||||
public long getKey(int idx) {
|
|
||||||
if (idx < start_idx || idx - start_idx >= size) {
|
|
||||||
throw new IndexOutOfBoundsException(idx);
|
|
||||||
}
|
|
||||||
return keys.get(idx - start_idx);
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean keyEquals(int idx, long other) {
|
|
||||||
if (idx < start_idx || idx - start_idx >= size) {
|
|
||||||
throw new IndexOutOfBoundsException(idx);
|
|
||||||
}
|
|
||||||
|
|
||||||
return keys.get(idx - start_idx) == other;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int add(long newKey) {
|
|
||||||
if (size >= capacity)
|
|
||||||
return -1;
|
|
||||||
|
|
||||||
keys.put(size, newKey);
|
|
||||||
|
|
||||||
return start_idx + size++;
|
|
||||||
}
|
|
||||||
|
|
||||||
public int getSize() {
|
|
||||||
return size;
|
|
||||||
}
|
|
||||||
}
|
|
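
The comment about humongous long-lived arrays above is the rationale for the bank design; the essential move is just a direct (off-heap) ByteBuffer viewed as longs. A minimal sketch, independent of the project code:

```java
import java.nio.ByteBuffer;
import java.nio.LongBuffer;

class OffHeapLongsSketch {
    public static void main(String[] args) {
        int capacity = 1_000_000;
        // Direct buffers live outside the Java heap, so a long-lived slab of keys
        // does not inflate GC workloads the way a huge long[] would.
        LongBuffer keys = ByteBuffer.allocateDirect(8 * capacity).asLongBuffer();
        keys.put(0, 123_456_789L);
        System.out.println(keys.get(0)); // 123456789
    }
}
```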
@ -1,27 +0,0 @@
|
|||||||
package nu.marginalia.dict;
|
|
||||||
|
|
||||||
/** Backing store for the KeywordLexicon, available in on and off-heap versions.
|
|
||||||
* <p>
|
|
||||||
* The off-heap version is necessary when loading a lexicon that is too large to fit in RAM, due
|
|
||||||
* to Java's 2GB limit on the size of a single array. It is slower and less optimized than the on-heap version.
|
|
||||||
* <p>
|
|
||||||
* The off-heap version is on the precipice of being deprecated and its use is discouraged.
|
|
||||||
*/
|
|
||||||
public interface DictionaryMap {
|
|
||||||
int NO_VALUE = Integer.MIN_VALUE;
|
|
||||||
|
|
||||||
static DictionaryMap create() {
|
|
||||||
// Default to on-heap version
|
|
||||||
// TODO: Make this configurable
|
|
||||||
|
|
||||||
return new OnHeapDictionaryMap();
|
|
||||||
}
|
|
||||||
|
|
||||||
void clear();
|
|
||||||
|
|
||||||
int size();
|
|
||||||
|
|
||||||
int put(long key);
|
|
||||||
|
|
||||||
int get(long key);
|
|
||||||
}
|
|
@ -1,172 +0,0 @@
|
|||||||
package nu.marginalia.dict;
|
|
||||||
|
|
||||||
import nu.marginalia.util.NextPrimeUtil;
|
|
||||||
|
|
||||||
import java.nio.ByteBuffer;
|
|
||||||
import java.nio.IntBuffer;
|
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
|
||||||
|
|
||||||
import static java.lang.Math.round;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Spiritually influenced by GNU Trove's hash maps
|
|
||||||
* LGPL 2.1
|
|
||||||
*/
|
|
||||||
public class OffHeapDictionaryHashMap implements DictionaryMap {
|
|
||||||
|
|
||||||
private final int bufferCount;
|
|
||||||
|
|
||||||
private final IntBuffer[] buffers;
|
|
||||||
private final DictionaryData dictionaryData;
|
|
||||||
|
|
||||||
private final long hashTableSize;
|
|
||||||
private final int bufferSizeBytes;
|
|
||||||
private final int intsPerBuffer;
|
|
||||||
private final long maxProbeLength;
|
|
||||||
|
|
||||||
private final AtomicInteger sz = new AtomicInteger(0);
|
|
||||||
|
|
||||||
public OffHeapDictionaryHashMap(long sizeMemory) {
|
|
||||||
final int intSize = 4;
|
|
||||||
|
|
||||||
bufferCount = 1 + (int) ((intSize*sizeMemory) / (1<<30));
|
|
||||||
buffers = new IntBuffer[bufferCount];
|
|
||||||
|
|
||||||
// Actually use a prime size for Donald Knuth reasons
|
|
||||||
hashTableSize = NextPrimeUtil.nextPrime(sizeMemory, -1);
|
|
||||||
|
|
||||||
intsPerBuffer = 1 + (int)(sizeMemory/ bufferCount);
|
|
||||||
bufferSizeBytes = intSize*intsPerBuffer;
|
|
||||||
maxProbeLength = sizeMemory/10;
|
|
||||||
|
|
||||||
if (((long) bufferCount * intsPerBuffer) < sizeMemory) {
|
|
||||||
throw new Error("Buffer memory is less than requested memory; this data structure is not safe to use");
|
|
||||||
}
|
|
||||||
|
|
||||||
dictionaryData = new DictionaryData((int)Math.min(1<<27, Math.max(32L, sizeMemory/4)));
|
|
||||||
|
|
||||||
initializeBuffers();
|
|
||||||
}
|
|
||||||
|
|
||||||
private void initializeBuffers() {
|
|
||||||
for (int b = 0; b < bufferCount; b++) {
|
|
||||||
buffers[b] = ByteBuffer.allocateDirect(bufferSizeBytes).asIntBuffer();
|
|
||||||
|
|
||||||
for (int i = 0; i < intsPerBuffer; i++) {
|
|
||||||
buffers[b].put(i, NO_VALUE);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void clear() {
|
|
||||||
dictionaryData.clear();
|
|
||||||
initializeBuffers();
|
|
||||||
sz.set(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int size() {
|
|
||||||
return sz.get();
|
|
||||||
}
|
|
||||||
|
|
||||||
private int getCell(long idx) {
|
|
||||||
int buffer = (int)(idx / intsPerBuffer);
|
|
||||||
int bufferIdx = (int)(idx % intsPerBuffer);
|
|
||||||
return buffers[buffer].get(bufferIdx);
|
|
||||||
}
|
|
||||||
private void setCell(long idx, int val) {
|
|
||||||
int buffer = (int)(idx / intsPerBuffer);
|
|
||||||
int bufferIdx = (int)(idx % intsPerBuffer);
|
|
||||||
|
|
||||||
buffers[buffer].put(bufferIdx, val);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int put(long key) {
|
|
||||||
|
|
||||||
long hash = key & 0x7FFF_FFFF_FFFF_FFFFL;
|
|
||||||
|
|
||||||
long idx = hash % hashTableSize;
|
|
||||||
|
|
||||||
if (getCell(idx) == NO_VALUE) {
|
|
||||||
return setValue(key, idx);
|
|
||||||
}
|
|
||||||
|
|
||||||
return putRehash(key, idx, hash);
|
|
||||||
}
|
|
||||||
|
|
||||||
private int putRehash(long key, long idx, long hash) {
|
|
||||||
final long pStride = 1 + (hash % (hashTableSize - 2));
|
|
||||||
|
|
||||||
for (long j = 1; j < maxProbeLength; j++) {
|
|
||||||
idx = idx - pStride;
|
|
||||||
|
|
||||||
if (idx < 0) {
|
|
||||||
idx += hashTableSize;
|
|
||||||
}
|
|
||||||
|
|
||||||
final int val = getCell(idx);
|
|
||||||
|
|
||||||
if (val == NO_VALUE) {
|
|
||||||
return setValue(key, idx);
|
|
||||||
}
|
|
||||||
else if (dictionaryData.keyEquals(val, key)) {
|
|
||||||
return val;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
throw new IllegalStateException("DictionaryHashMap full @ size " + size() + "/" + hashTableSize + ", " + round((100.0*size()) / hashTableSize) + "%");
|
|
||||||
}
|
|
||||||
|
|
||||||
private int setValue(long key, long cell) {
|
|
||||||
sz.incrementAndGet();
|
|
||||||
|
|
||||||
int di = dictionaryData.add(key);
|
|
||||||
setCell(cell, di);
|
|
||||||
return di;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int get(long key) {
|
|
||||||
final long hash = key & 0x7FFF_FFFF_FFFF_FFFFL;
|
|
||||||
final long cell = hash % hashTableSize;
|
|
||||||
|
|
||||||
if (getCell(cell) == NO_VALUE) {
|
|
||||||
return NO_VALUE;
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
int val = getCell(cell);
|
|
||||||
|
|
||||||
if (dictionaryData.keyEquals(val, key)) {
|
|
||||||
return val;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return getRehash(key, cell, hash);
|
|
||||||
}
|
|
||||||
|
|
||||||
private int getRehash(long key, long idx, long hash) {
|
|
||||||
final long pStride = 1 + (hash % (hashTableSize - 2));
|
|
||||||
|
|
||||||
for (long j = 1; j < maxProbeLength; j++) {
|
|
||||||
idx = idx - pStride;
|
|
||||||
|
|
||||||
if (idx < 0) {
|
|
||||||
idx += hashTableSize;
|
|
||||||
}
|
|
||||||
|
|
||||||
final var val = getCell(idx);
|
|
||||||
|
|
||||||
if (val == NO_VALUE) {
|
|
||||||
return NO_VALUE;
|
|
||||||
}
|
|
||||||
else if (dictionaryData.keyEquals(val, key)) {
|
|
||||||
return val;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
throw new IllegalStateException("DictionaryHashMap full @ size " + size() + "/" + hashTableSize + ", " + round((100.0*size()) / hashTableSize) + "%");
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
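
The "prime size for Donald Knuth reasons" comment above ties into the double-hashing probe in putRehash/getRehash: with a prime table size, every stride in 1..size-1 is coprime to the size, so the probe sequence visits all slots before repeating. A small standalone check of that property (illustrative, not project code):

```java
import java.util.BitSet;

public class ProbeCoverageDemo {
    public static void main(String[] args) {
        int tableSize = 13; // prime, as NextPrimeUtil.nextPrime() would guarantee
        for (int stride = 1; stride < tableSize; stride++) {
            BitSet visited = new BitSet(tableSize);
            int idx = 0;
            for (int step = 0; step < tableSize; step++) {
                visited.set(idx);
                idx = Math.floorMod(idx - stride, tableSize); // probe downwards with wrap-around
            }
            // with a prime size, every stride covers the whole table
            System.out.println("stride " + stride + " covers " + visited.cardinality() + "/" + tableSize);
        }
    }
}
```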
@ -1,61 +0,0 @@
|
|||||||
package nu.marginalia.dict;
|
|
||||||
|
|
||||||
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
|
|
||||||
|
|
||||||
public class OnHeapDictionaryMap implements DictionaryMap {
|
|
||||||
/* Use three different hash tables to get around the limitations of Java's array sizes.
|
|
||||||
*
|
|
||||||
* Each map fits 0.75 * 2^30 entries (~800mn); the three maps together fit a bit over 2^31 entries.
|
|
||||||
* We're happy with 2^31.
|
|
||||||
*
|
|
||||||
* We'll assign each term to one of the three maps based on its value modulo 3. We'll pray each
|
|
||||||
* night that Long2IntOpenHashMap's hash function is good enough to cope with this. The keys we are
|
|
||||||
* inserting are 64 bit hashes already, so odds are the rest of the bits have very good entropy.
|
|
||||||
*/
|
|
||||||
private static final int DEFAULT_SIZE = Integer.getInteger("lexiconSizeHint", 100_000)/3;
|
|
||||||
private final Long2IntOpenHashMap[] entries = new Long2IntOpenHashMap[3];
|
|
||||||
|
|
||||||
public OnHeapDictionaryMap() {
|
|
||||||
for (int i = 0; i < entries.length; i++) {
|
|
||||||
entries[i] = new Long2IntOpenHashMap(DEFAULT_SIZE, 0.75f);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void clear() {
|
|
||||||
for (var map : entries) {
|
|
||||||
map.clear();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int size() {
|
|
||||||
int totalSize = 0;
|
|
||||||
for (var map : entries) {
|
|
||||||
totalSize += map.size();
|
|
||||||
}
|
|
||||||
return totalSize;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int put(long key) {
|
|
||||||
int shardIdx = (int) Long.remainderUnsigned(key, 3);
|
|
||||||
var shard = entries[shardIdx];
|
|
||||||
int size = size();
|
|
||||||
|
|
||||||
if (size == Integer.MAX_VALUE)
|
|
||||||
throw new IllegalStateException("DictionaryMap is full");
|
|
||||||
|
|
||||||
shard.putIfAbsent(key, size);
|
|
||||||
|
|
||||||
return get(key);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int get(long key) {
|
|
||||||
int shardIdx = (int) Long.remainderUnsigned(key, 3);
|
|
||||||
var shard = entries[shardIdx];
|
|
||||||
|
|
||||||
return shard.getOrDefault(key, NO_VALUE);
|
|
||||||
}
|
|
||||||
}
|
|
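
As the sharding comment in OnHeapDictionaryMap spells out, shard selection is just an unsigned remainder of the 64 bit key. A tiny standalone check (illustrative only):

```java
public class ShardSelectionDemo {
    public static void main(String[] args) {
        long[] keys = { 42L, -7L, Long.MIN_VALUE };
        for (long key : keys) {
            // remainderUnsigned treats the key as an unsigned 64 bit value,
            // so negative hashes still land in shard 0, 1 or 2
            int shard = (int) Long.remainderUnsigned(key, 3);
            System.out.println(key + " -> shard " + shard);
        }
    }
}
```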
@ -1,170 +0,0 @@
|
|||||||
package nu.marginalia.lexicon;
|
|
||||||
|
|
||||||
import io.prometheus.client.Gauge;
|
|
||||||
import lombok.SneakyThrows;
|
|
||||||
import nu.marginalia.dict.DictionaryMap;
|
|
||||||
import nu.marginalia.hash.MurmurHash3_128;
|
|
||||||
import nu.marginalia.lexicon.journal.KeywordLexiconJournal;
|
|
||||||
import nu.marginalia.lexicon.journal.KeywordLexiconJournalFingerprint;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.charset.StandardCharsets;
|
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
|
||||||
import java.util.concurrent.locks.Lock;
|
|
||||||
import java.util.concurrent.locks.ReadWriteLock;
|
|
||||||
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
|
||||||
|
|
||||||
/** The keyword lexicon is used to map keywords to unique numeric IDs.
|
|
||||||
* This class is used to both construct the lexicon, and to read from it.
|
|
||||||
* <p>
|
|
||||||
* Readers will want to use the KeywordLexiconReadOnlyView wrapper, as it
|
|
||||||
* only exposes read-only methods and hides the mutating methods.
|
|
||||||
* <p>
|
|
||||||
* Between instances, the lexicon is stored in a journal file, with entries kept exactly in the
|
|
||||||
* order they were received by the writer. The journal file is then replayed
|
|
||||||
* on startup to reconstruct the lexicon, giving each term an ID according to
|
|
||||||
* the order in which they are loaded. It is therefore important that the journal file
|
|
||||||
* is not tampered with, as this will cause the lexicon to be corrupted.
|
|
||||||
* */
|
|
||||||
|
|
||||||
public class KeywordLexicon implements AutoCloseable {
|
|
||||||
private final DictionaryMap reverseIndex;
|
|
||||||
|
|
||||||
private final ReadWriteLock memoryLock = new ReentrantReadWriteLock();
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
|
|
||||||
private static final AtomicInteger instances = new AtomicInteger();
|
|
||||||
|
|
||||||
private static final Gauge request_time_metrics
|
|
||||||
= Gauge.build("wmsa_edge_index_dictionary_size", "Dictionary Size")
|
|
||||||
.register();
|
|
||||||
private final KeywordLexiconJournal journal;
|
|
||||||
|
|
||||||
private volatile KeywordLexiconJournalFingerprint fingerprint = null;
|
|
||||||
|
|
||||||
private final MurmurHash3_128 hasher = new MurmurHash3_128();
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
public KeywordLexicon(KeywordLexiconJournal keywordLexiconJournal) {
|
|
||||||
|
|
||||||
journal = keywordLexiconJournal;
|
|
||||||
reverseIndex = DictionaryMap.create();
|
|
||||||
|
|
||||||
logger.info("Creating dictionary writer");
|
|
||||||
|
|
||||||
if (!instances.compareAndSet(0, 1)) {
|
|
||||||
logger.error("MULTIPLE LEXICON INSTANCES!");
|
|
||||||
}
|
|
||||||
|
|
||||||
reload();
|
|
||||||
|
|
||||||
logger.info("Done creating dictionary writer");
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean needsReload() throws IOException {
|
|
||||||
var newFingerprint = journal.journalFingerprint();
|
|
||||||
return !newFingerprint.equals(fingerprint);
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Reload the lexicon from the journal */
|
|
||||||
public void reload() throws IOException {
|
|
||||||
var lock = memoryLock.writeLock();
|
|
||||||
lock.lock();
|
|
||||||
try {
|
|
||||||
reverseIndex.clear();
|
|
||||||
journal.loadFile(bytes -> reverseIndex.put(hasher.hash(bytes)));
|
|
||||||
fingerprint = journal.journalFingerprint();
|
|
||||||
}
|
|
||||||
finally {
|
|
||||||
lock.unlock();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Get method that inserts the word into the lexicon if it is not present */
|
|
||||||
public int getOrInsert(String macroWord) {
|
|
||||||
return getOrInsert(macroWord.getBytes(StandardCharsets.UTF_8));
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Get method that inserts the word into the lexicon if it is not present */
|
|
||||||
@SneakyThrows
|
|
||||||
private int getOrInsert(byte[] bytes) {
|
|
||||||
if (bytes.length >= Byte.MAX_VALUE) {
|
|
||||||
logger.warn("getOrInsert({}), illegal length {}", new String(bytes), bytes.length);
|
|
||||||
return DictionaryMap.NO_VALUE;
|
|
||||||
}
|
|
||||||
|
|
||||||
final long key = hasher.hash(bytes);
|
|
||||||
|
|
||||||
int idx = getReadOnly(key);
|
|
||||||
|
|
||||||
if (idx < 0) {
|
|
||||||
idx = insertNew(key, bytes);
|
|
||||||
}
|
|
||||||
|
|
||||||
return idx;
|
|
||||||
}
|
|
||||||
|
|
||||||
private int insertNew(long key, byte[] bytes) throws InterruptedException {
|
|
||||||
Lock lock = memoryLock.writeLock();
|
|
||||||
int idx;
|
|
||||||
try {
|
|
||||||
lock.lock();
|
|
||||||
|
|
||||||
// Check again to prevent race condition
|
|
||||||
if ((idx = reverseIndex.get(key)) >= 0)
|
|
||||||
return idx;
|
|
||||||
|
|
||||||
journal.enqueue(bytes);
|
|
||||||
idx = reverseIndex.put(key);
|
|
||||||
request_time_metrics.set(reverseIndex.size());
|
|
||||||
|
|
||||||
return idx;
|
|
||||||
}
|
|
||||||
finally {
|
|
||||||
lock.unlock();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Get method that does not modify the lexicon if the word is not present */
|
|
||||||
public int getReadOnly(String word) {
|
|
||||||
final byte[] bytes = word.getBytes(StandardCharsets.UTF_8);
|
|
||||||
return getReadOnly(hasher.hash(bytes));
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Get method that does not modify the lexicon if the word is not present */
|
|
||||||
public int getReadOnly(long hashedKey) {
|
|
||||||
Lock lock = memoryLock.readLock();
|
|
||||||
try {
|
|
||||||
lock.lock();
|
|
||||||
return reverseIndex.get(hashedKey);
|
|
||||||
}
|
|
||||||
finally {
|
|
||||||
lock.unlock();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public long size() {
|
|
||||||
Lock lock = memoryLock.readLock();
|
|
||||||
try {
|
|
||||||
lock.lock();
|
|
||||||
return reverseIndex.size();
|
|
||||||
}
|
|
||||||
finally {
|
|
||||||
lock.unlock();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public void close() throws Exception {
|
|
||||||
logger.warn("Closing Lexicon");
|
|
||||||
|
|
||||||
journal.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
public void commitToDisk() {
|
|
||||||
journal.commitToDisk();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
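
To make the removed API concrete, this is roughly how the lexicon was driven before this commit; it mirrors KeywordLexiconTest further down in the diff. The file path is a placeholder, the real services resolve it via FileStorageService.

```java
import nu.marginalia.lexicon.KeywordLexicon;
import nu.marginalia.lexicon.journal.KeywordLexiconJournal;
import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode;

import java.io.File;

class LexiconUsageSketch {
    public static void main(String[] args) throws Exception {
        var journal = new KeywordLexiconJournal(new File("/tmp/lexicon-journal.dat"), // placeholder path
                                                KeywordLexiconJournalMode.READ_WRITE);
        try (var lexicon = new KeywordLexicon(journal)) {
            int a  = lexicon.getOrInsert("aaa");   // inserts, returns the new dense id
            int a2 = lexicon.getOrInsert("aaa");   // same word, same id
            System.out.println(a == a2);           // true

            int miss = lexicon.getReadOnly("bbb"); // read-only lookup never inserts
            System.out.println(miss);              // DictionaryMap.NO_VALUE (Integer.MIN_VALUE)

            lexicon.commitToDisk();                // flush queued entries to the journal file
        }
    }
}
```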
@ -1,42 +0,0 @@
|
|||||||
package nu.marginalia.lexicon;
|
|
||||||
|
|
||||||
import com.google.common.cache.Cache;
|
|
||||||
import com.google.common.cache.CacheBuilder;
|
|
||||||
import lombok.SneakyThrows;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.concurrent.TimeUnit;
|
|
||||||
|
|
||||||
/** A read-only view of a keyword lexicon.
|
|
||||||
*
|
|
||||||
* @see KeywordLexicon
|
|
||||||
* */
|
|
||||||
public class KeywordLexiconReadOnlyView {
|
|
||||||
private final KeywordLexicon writer;
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
private final Cache<String, Integer> cache = CacheBuilder.newBuilder().maximumSize(10_000).expireAfterAccess(60, TimeUnit.SECONDS).build();
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
public KeywordLexiconReadOnlyView(KeywordLexicon writer) {
|
|
||||||
this.writer = writer;
|
|
||||||
}
|
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
public int get(String word) {
|
|
||||||
return cache.get(word, () -> writer.getReadOnly(word));
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean suggestReload() throws IOException {
|
|
||||||
if (writer.needsReload()) {
|
|
||||||
logger.info("Reloading lexicon");
|
|
||||||
writer.reload();
|
|
||||||
cache.invalidateAll();
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
logger.info("Foregoing lexicon reload");
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,114 +0,0 @@
|
|||||||
package nu.marginalia.lexicon.journal;
|
|
||||||
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
import java.nio.file.attribute.BasicFileAttributes;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.function.Consumer;
|
|
||||||
|
|
||||||
/** The journal for the keyword lexicon.
|
|
||||||
* It's used both for writing the lexicon and for reconstructing it for reading later.
|
|
||||||
*/
|
|
||||||
public class KeywordLexiconJournal {
|
|
||||||
|
|
||||||
private static final boolean noCommit = Boolean.getBoolean("DictionaryJournal.noCommit");
|
|
||||||
|
|
||||||
private final KeywordLexiconJournalCommitQueue commitQueue;
|
|
||||||
private KeywordLexiconJournalFile journalFile;
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
|
|
||||||
private final Thread commitToDiskThread;
|
|
||||||
|
|
||||||
private volatile boolean running = true;
|
|
||||||
private final Path journalFilePath;
|
|
||||||
|
|
||||||
/** Create a new journal.
|
|
||||||
*
|
|
||||||
* @param file The file to use for the journal.
|
|
||||||
* @param mode The mode to use for the journal. If READ_ONLY, the journal will be read-only and refuse
|
|
||||||
* to accept new entries.
|
|
||||||
*/
|
|
||||||
public KeywordLexiconJournal(File file, KeywordLexiconJournalMode mode) throws IOException {
|
|
||||||
journalFilePath = file.toPath();
|
|
||||||
|
|
||||||
if (mode == KeywordLexiconJournalMode.READ_WRITE) {
|
|
||||||
commitQueue = new KeywordLexiconJournalCommitQueue();
|
|
||||||
journalFile = new KeywordLexiconJournalFile(file);
|
|
||||||
|
|
||||||
commitToDiskThread = new Thread(this::commitToDiskRunner, "CommitToDiskThread");
|
|
||||||
commitToDiskThread.start();
|
|
||||||
|
|
||||||
Runtime.getRuntime().addShutdownHook(new Thread(this::commitToDisk));
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
journalFile = new KeywordLexiconJournalFile(file);
|
|
||||||
|
|
||||||
commitQueue = null;
|
|
||||||
commitToDiskThread = null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void enqueue(byte[] word) throws InterruptedException {
|
|
||||||
if (null == commitQueue)
|
|
||||||
throw new UnsupportedOperationException("Lexicon journal is read-only");
|
|
||||||
|
|
||||||
commitQueue.enqueue(word);
|
|
||||||
}
|
|
||||||
|
|
||||||
public KeywordLexiconJournalFingerprint journalFingerprint() throws IOException {
|
|
||||||
var attributes = Files.readAttributes(journalFilePath, BasicFileAttributes.class);
|
|
||||||
|
|
||||||
long cTime = attributes.creationTime().toMillis();
|
|
||||||
long mTime = attributes.lastModifiedTime().toMillis();
|
|
||||||
long size = attributes.size();
|
|
||||||
|
|
||||||
return new KeywordLexiconJournalFingerprint(cTime, mTime, size);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void commitToDiskRunner() {
|
|
||||||
if (noCommit) return;
|
|
||||||
|
|
||||||
while (running) {
|
|
||||||
try {
|
|
||||||
Thread.sleep(1000);
|
|
||||||
} catch (InterruptedException e) {
|
|
||||||
e.printStackTrace();
|
|
||||||
}
|
|
||||||
commitToDisk();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void commitToDisk() {
|
|
||||||
List<byte[]> entries = commitQueue.getQueuedEntries();
|
|
||||||
|
|
||||||
journalFile.writeEntriesToJournal(entries);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void close() throws Exception {
|
|
||||||
logger.info("Closing Journal");
|
|
||||||
running = false;
|
|
||||||
|
|
||||||
if (commitToDiskThread != null) {
|
|
||||||
commitToDiskThread.join();
|
|
||||||
commitToDisk();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (journalFile != null) {
|
|
||||||
journalFile.close();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void loadFile(Consumer<byte[]> loadJournalEntry) throws IOException {
|
|
||||||
if (journalFile != null) {
|
|
||||||
journalFile.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
journalFile = new KeywordLexiconJournalFile(journalFilePath.toFile());
|
|
||||||
journalFile.loadFile(loadJournalEntry);
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,48 +0,0 @@
|
|||||||
package nu.marginalia.lexicon.journal;
|
|
||||||
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.List;
|
|
||||||
|
|
||||||
/** An in-memory queue for lexicon journal entries used to improve the performance of
|
|
||||||
* large bursts of insert operations.
|
|
||||||
*/
|
|
||||||
class KeywordLexiconJournalCommitQueue {
|
|
||||||
private final ArrayList<byte[]> commitQueue = new ArrayList<>(10_000);
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
private static final long BACK_PRESSURE_LIMIT = 25_000;
|
|
||||||
|
|
||||||
public synchronized void enqueue(byte[] word) throws InterruptedException {
|
|
||||||
for (int queueSize = commitQueue.size();
|
|
||||||
queueSize >= BACK_PRESSURE_LIMIT;
|
|
||||||
queueSize = commitQueue.size())
|
|
||||||
{
|
|
||||||
wait();
|
|
||||||
}
|
|
||||||
|
|
||||||
commitQueue.add(word);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
public synchronized List<byte[]> getQueuedEntries() {
|
|
||||||
List<byte[]> data;
|
|
||||||
if (commitQueue.isEmpty()) {
|
|
||||||
return Collections.emptyList();
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
data = new ArrayList<>(commitQueue);
|
|
||||||
commitQueue.clear();
|
|
||||||
}
|
|
||||||
|
|
||||||
notifyAll();
|
|
||||||
|
|
||||||
if (data.size() > BACK_PRESSURE_LIMIT) {
|
|
||||||
logger.warn("Lexicon Journal Backpressure: {}", data.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
return data;
|
|
||||||
}
|
|
||||||
}
|
|
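
The commit queue above is a plain synchronized batch-and-drain with wait/notify back pressure. A stripped-down, generic sketch of the same pattern (not project code):

```java
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

class BatchQueue<T> {
    private static final int LIMIT = 25_000; // same back-pressure threshold as above
    private final ArrayList<T> queue = new ArrayList<>();

    public synchronized void enqueue(T item) throws InterruptedException {
        while (queue.size() >= LIMIT) {
            wait(); // block producers until the writer thread drains the queue
        }
        queue.add(item);
    }

    public synchronized List<T> drain() {
        if (queue.isEmpty()) return Collections.emptyList();
        List<T> out = new ArrayList<>(queue);
        queue.clear();
        notifyAll(); // wake any producers blocked on the limit
        return out;
    }
}
```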
@ -1,162 +0,0 @@
|
|||||||
package nu.marginalia.lexicon.journal;
|
|
||||||
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.RandomAccessFile;
|
|
||||||
import java.nio.ByteBuffer;
|
|
||||||
import java.nio.channels.FileChannel;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.concurrent.locks.Lock;
|
|
||||||
import java.util.concurrent.locks.ReadWriteLock;
|
|
||||||
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
|
||||||
import java.util.function.Consumer;
|
|
||||||
|
|
||||||
public class KeywordLexiconJournalFile implements AutoCloseable {
|
|
||||||
private final RandomAccessFile journalFileRAF;
|
|
||||||
private final File journalFile;
|
|
||||||
private final Logger logger = LoggerFactory.getLogger(getClass());
|
|
||||||
|
|
||||||
private final ReadWriteLock diskLock = new ReentrantReadWriteLock();
|
|
||||||
|
|
||||||
public KeywordLexiconJournalFile(File journalFile) throws IOException {
|
|
||||||
this.journalFileRAF = new RandomAccessFile(journalFile, "rw");
|
|
||||||
this.journalFile = journalFile;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void rewind() throws IOException {
|
|
||||||
journalFileRAF.seek(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
public void loadFile(Consumer<byte[]> acceptEntry) throws IOException {
|
|
||||||
if (!journalFile.exists()) {
|
|
||||||
logger.info("File {} does not exist, can't load", journalFile);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.info("Reading {}", journalFile);
|
|
||||||
|
|
||||||
long pos;
|
|
||||||
if (journalFileRAF.length() < 8) {
|
|
||||||
pos = 8;
|
|
||||||
journalFileRAF.writeLong(pos);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
pos = journalFileRAF.readLong();
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.info("Length {} ({})", pos, journalFileRAF.length());
|
|
||||||
if (pos == 8) {
|
|
||||||
logger.info("Empty DB");
|
|
||||||
}
|
|
||||||
|
|
||||||
ByteBuffer buffer = ByteBuffer.allocateDirect(8192);
|
|
||||||
|
|
||||||
var channel = journalFileRAF.getChannel();
|
|
||||||
|
|
||||||
long cp = channel.position();
|
|
||||||
try {
|
|
||||||
buffer.limit(0);
|
|
||||||
long loaded = 0;
|
|
||||||
|
|
||||||
while (cp < pos || buffer.hasRemaining()) {
|
|
||||||
if (buffer.limit() - buffer.position() < 4) {
|
|
||||||
buffer.compact();
|
|
||||||
|
|
||||||
long rb = channel.read(buffer);
|
|
||||||
if (rb <= 0) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
cp += rb;
|
|
||||||
buffer.flip();
|
|
||||||
}
|
|
||||||
|
|
||||||
int len = buffer.get() & 0xFF;
|
|
||||||
if (len > Byte.MAX_VALUE) {
|
|
||||||
logger.warn("Found keyword with impossible length {} near {}, likely corruption", len, cp);
|
|
||||||
}
|
|
||||||
while (buffer.limit() - buffer.position() < len) {
|
|
||||||
buffer.compact();
|
|
||||||
int rb = channel.read(buffer);
|
|
||||||
if (rb <= 0) break;
|
|
||||||
cp += rb;
|
|
||||||
buffer.flip();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (buffer.limit() < len) {
|
|
||||||
logger.warn("Partial write at end-of-file!");
|
|
||||||
|
|
||||||
if (cp >= pos) {
|
|
||||||
logger.info("... but it's ok");
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
byte[] data = new byte[len];
|
|
||||||
buffer.get(data);
|
|
||||||
if ((++loaded % 10_000_000) == 0L) {
|
|
||||||
logger.info("Loaded {} million items", loaded/1_000_000);
|
|
||||||
}
|
|
||||||
|
|
||||||
acceptEntry.accept(data);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
catch (Exception ex) {
|
|
||||||
logger.error("IO Exception", ex);
|
|
||||||
}
|
|
||||||
|
|
||||||
journalFileRAF.seek(pos);
|
|
||||||
}
|
|
||||||
|
|
||||||
private final ByteBuffer writeBuffer = ByteBuffer.allocateDirect(4096);
|
|
||||||
|
|
||||||
public void writeEntriesToJournal(List<byte[]> data) {
|
|
||||||
if (data.isEmpty())
|
|
||||||
return;
|
|
||||||
|
|
||||||
final FileChannel channel = journalFileRAF.getChannel();
|
|
||||||
|
|
||||||
if (!channel.isOpen()) {
|
|
||||||
throw new IllegalStateException("commitToDisk() with closed channel! Cannot commit!");
|
|
||||||
}
|
|
||||||
|
|
||||||
Lock writeLock = diskLock.writeLock();
|
|
||||||
try {
|
|
||||||
writeLock.lock();
|
|
||||||
|
|
||||||
long start = System.currentTimeMillis();
|
|
||||||
int ct = data.size();
|
|
||||||
|
|
||||||
for (byte[] itemBytes : data) {
|
|
||||||
writeBuffer.clear();
|
|
||||||
writeBuffer.put((byte) itemBytes.length);
|
|
||||||
writeBuffer.put(itemBytes);
|
|
||||||
writeBuffer.flip();
|
|
||||||
|
|
||||||
while (writeBuffer.position() < writeBuffer.limit())
|
|
||||||
channel.write(writeBuffer, channel.size());
|
|
||||||
}
|
|
||||||
|
|
||||||
writeBuffer.clear();
|
|
||||||
writeBuffer.putLong(channel.size());
|
|
||||||
writeBuffer.flip();
|
|
||||||
channel.write(writeBuffer, 0);
|
|
||||||
|
|
||||||
channel.force(false);
|
|
||||||
|
|
||||||
logger.debug("Comitted {} items in {} ms", ct, System.currentTimeMillis() - start);
|
|
||||||
}
|
|
||||||
catch (Exception ex) {
|
|
||||||
logger.error("Error during dictionary commit!!!", ex);
|
|
||||||
}
|
|
||||||
finally {
|
|
||||||
writeLock.unlock();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void close() throws IOException {
|
|
||||||
journalFileRAF.close();
|
|
||||||
}
|
|
||||||
}
|
|
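
As loadFile and writeEntriesToJournal above imply, the journal layout is an 8 byte header holding the committed end offset, followed by length-prefixed entries (one length byte, then the keyword bytes). A minimal standalone writer for that layout, assuming a fresh file at a placeholder path:

```java
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.charset.StandardCharsets;

class JournalWriteSketch {
    public static void main(String[] args) throws IOException {
        try (var raf = new RandomAccessFile("/tmp/journal-sketch.dat", "rw")) { // placeholder path, assumed empty
            raf.writeLong(8); // header: committed length, nothing written yet
            for (String word : new String[] { "aaa", "bbb" }) {
                byte[] bytes = word.getBytes(StandardCharsets.UTF_8);
                raf.seek(raf.length());
                raf.writeByte(bytes.length); // single length byte; words are capped below 127 bytes
                raf.write(bytes);
            }
            raf.seek(0);
            raf.writeLong(raf.length()); // commit: record the new end offset in the header
        }
    }
}
```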
@ -1,10 +0,0 @@
|
|||||||
package nu.marginalia.lexicon.journal;
|
|
||||||
|
|
||||||
/** Contains values used to assess whether the lexicon is in sync with the journal
|
|
||||||
* or if it has been replaced with a newer version and should be reloaded
|
|
||||||
* */
|
|
||||||
public record KeywordLexiconJournalFingerprint(long createdTime,
|
|
||||||
long mTime,
|
|
||||||
long sizeBytes)
|
|
||||||
{
|
|
||||||
}
|
|
@ -1,6 +0,0 @@
|
|||||||
package nu.marginalia.lexicon.journal;
|
|
||||||
|
|
||||||
public enum KeywordLexiconJournalMode {
|
|
||||||
READ_ONLY,
|
|
||||||
READ_WRITE
|
|
||||||
}
|
|
@ -1,78 +0,0 @@
|
|||||||
package nu.marginalia.lexicon;
|
|
||||||
|
|
||||||
import nu.marginalia.dict.OnHeapDictionaryMap;
|
|
||||||
import nu.marginalia.lexicon.journal.KeywordLexiconJournal;
|
|
||||||
import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode;
|
|
||||||
import org.junit.jupiter.api.AfterEach;
|
|
||||||
import org.junit.jupiter.api.Assertions;
|
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertNotEquals;
|
|
||||||
|
|
||||||
public class KeywordLexiconTest {
|
|
||||||
|
|
||||||
private Path journalFile;
|
|
||||||
private KeywordLexicon lexicon;
|
|
||||||
|
|
||||||
@BeforeEach
|
|
||||||
public void setUp() throws IOException {
|
|
||||||
journalFile = Files.createTempFile(getClass().getSimpleName(), ".dat");
|
|
||||||
|
|
||||||
var lexiconJournal = new KeywordLexiconJournal(journalFile.toFile(), KeywordLexiconJournalMode.READ_WRITE);
|
|
||||||
lexicon = new KeywordLexicon(lexiconJournal);
|
|
||||||
}
|
|
||||||
|
|
||||||
@AfterEach
|
|
||||||
public void tearDown() throws Exception {
|
|
||||||
lexicon.close();
|
|
||||||
Files.delete(journalFile);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testConsistentInserts() {
|
|
||||||
int a = lexicon.getOrInsert("aaa");
|
|
||||||
int b = lexicon.getOrInsert("bbb");
|
|
||||||
int a2 = lexicon.getOrInsert("aaa");
|
|
||||||
int c = lexicon.getOrInsert("ccc");
|
|
||||||
|
|
||||||
assertEquals(a, a2);
|
|
||||||
assertNotEquals(a, b);
|
|
||||||
assertNotEquals(a, c);
|
|
||||||
assertNotEquals(b, c);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testInsertReplay() {
|
|
||||||
int a = lexicon.getOrInsert("aaa");
|
|
||||||
int b = lexicon.getOrInsert("bbb");
|
|
||||||
int c = lexicon.getOrInsert("ccc");
|
|
||||||
|
|
||||||
assertEquals(a, lexicon.getReadOnly("aaa"));
|
|
||||||
assertEquals(b, lexicon.getReadOnly("bbb"));
|
|
||||||
assertEquals(c, lexicon.getReadOnly("ccc"));
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testReload() throws IOException {
|
|
||||||
int a = lexicon.getOrInsert("aaa");
|
|
||||||
int b = lexicon.getOrInsert("bbb");
|
|
||||||
int c = lexicon.getOrInsert("ccc");
|
|
||||||
lexicon.commitToDisk();
|
|
||||||
|
|
||||||
var lexiconJournal = new KeywordLexiconJournal(journalFile.toFile(), KeywordLexiconJournalMode.READ_WRITE);
|
|
||||||
try (var anotherLexicon = new KeywordLexicon(lexiconJournal)) {
|
|
||||||
assertEquals(a, anotherLexicon.getReadOnly("aaa"));
|
|
||||||
assertEquals(b, anotherLexicon.getReadOnly("bbb"));
|
|
||||||
assertEquals(c, anotherLexicon.getReadOnly("ccc"));
|
|
||||||
}
|
|
||||||
catch (Exception ex) {
|
|
||||||
Assertions.fail("???", ex);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
@ -9,8 +9,8 @@ import nu.marginalia.model.idx.DocumentFlags;
|
|||||||
import nu.marginalia.model.idx.DocumentMetadata;
|
import nu.marginalia.model.idx.DocumentMetadata;
|
||||||
import nu.marginalia.ranking.factors.*;
|
import nu.marginalia.ranking.factors.*;
|
||||||
|
|
||||||
import javax.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import javax.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
@ -369,7 +369,8 @@ public class TwoArrayOperations {
|
|||||||
}
|
}
|
||||||
|
|
||||||
while (aPos < aEnd) {
|
while (aPos < aEnd) {
|
||||||
long val = a.get(aPos+=stepSize);
|
long val = a.get(aPos);
|
||||||
|
aPos+=stepSize;
|
||||||
if (distinct == 0 || val != lastValue) {
|
if (distinct == 0 || val != lastValue) {
|
||||||
distinct++;
|
distinct++;
|
||||||
}
|
}
|
||||||
@ -377,7 +378,8 @@ public class TwoArrayOperations {
|
|||||||
}
|
}
|
||||||
|
|
||||||
while (bPos < bEnd) {
|
while (bPos < bEnd) {
|
||||||
long val = b.get(bPos+=stepSize);
|
long val = b.get(bPos);
|
||||||
|
bPos+=stepSize;
|
||||||
if (distinct == 0 || val != lastValue) {
|
if (distinct == 0 || val != lastValue) {
|
||||||
distinct++;
|
distinct++;
|
||||||
}
|
}
|
||||||
|
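
The two hunks above fix an order-of-operations bug: `a.get(aPos += stepSize)` advances the cursor before reading, so the first element of the range is skipped and the last read lands one step past the end. A tiny plain-array illustration of the corrected loop shape (not the project's LongArray API):

```java
public class CursorBugDemo {
    public static void main(String[] args) {
        long[] a = { 10, 20, 30 };
        int pos = 0, end = a.length, step = 1;

        // Buggy form: a[pos += step] skips a[0] and finally reads a[3], out of bounds.
        // Fixed form, as in the patch: read first, then advance.
        while (pos < end) {
            long val = a[pos];
            pos += step;
            System.out.println(val); // prints 10, 20, 30
        }
    }
}
```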
@ -5,6 +5,7 @@ import nu.marginalia.array.LongArray;
|
|||||||
import org.junit.jupiter.api.Assertions;
|
import org.junit.jupiter.api.Assertions;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import java.util.Arrays;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.LongStream;
|
import java.util.stream.LongStream;
|
||||||
@ -118,4 +119,31 @@ class TwoArrayOperationsTest {
|
|||||||
assertEquals(distinctSize, mergedSize);
|
assertEquals(distinctSize, mergedSize);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void mergeArrays2() {
|
||||||
|
LongArray left = LongArray.allocate(4);
|
||||||
|
LongArray right = LongArray.allocate(2);
|
||||||
|
LongArray out = LongArray.allocate(4);
|
||||||
|
left.set(0, 40, 3, 41, 4);
|
||||||
|
right.set(0, 40, 5);
|
||||||
|
|
||||||
|
System.out.println(Arrays.toString(longArrayToJavaArray(left)));
|
||||||
|
System.out.println(Arrays.toString(longArrayToJavaArray(right)));
|
||||||
|
System.out.println(Arrays.toString(longArrayToJavaArray(out)));
|
||||||
|
long numDistinct = TwoArrayOperations.countDistinctElementsN(2, left, right, 0, 4, 0, 2);
|
||||||
|
System.out.println(numDistinct);
|
||||||
|
System.out.println(numDistinct);
|
||||||
|
|
||||||
|
TwoArrayOperations.mergeArrays2(out, left, right, 0, 4, 0, 4, 0, 2);
|
||||||
|
|
||||||
|
System.out.println(Arrays.toString(longArrayToJavaArray(out)));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
long[] longArrayToJavaArray(LongArray longArray) {
|
||||||
|
long[] vals = new long[(int) longArray.size()];
|
||||||
|
longArray.get(0, vals);
|
||||||
|
return vals;
|
||||||
|
}
|
||||||
}
|
}
|
@ -16,7 +16,7 @@ import org.jsoup.nodes.Element;
|
|||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import javax.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
package nu.marginalia.mq;
|
package nu.marginalia.mq;
|
||||||
|
|
||||||
|
import com.google.inject.Inject;
|
||||||
|
import com.google.inject.Singleton;
|
||||||
import nu.marginalia.mq.inbox.MqAsynchronousInbox;
|
import nu.marginalia.mq.inbox.MqAsynchronousInbox;
|
||||||
import nu.marginalia.mq.inbox.MqInboxIf;
|
import nu.marginalia.mq.inbox.MqInboxIf;
|
||||||
import nu.marginalia.mq.inbox.MqSingleShotInbox;
|
import nu.marginalia.mq.inbox.MqSingleShotInbox;
|
||||||
@ -7,8 +9,6 @@ import nu.marginalia.mq.inbox.MqSynchronousInbox;
|
|||||||
import nu.marginalia.mq.outbox.MqOutbox;
|
import nu.marginalia.mq.outbox.MqOutbox;
|
||||||
import nu.marginalia.mq.persistence.MqPersistence;
|
import nu.marginalia.mq.persistence.MqPersistence;
|
||||||
|
|
||||||
import javax.inject.Inject;
|
|
||||||
import javax.inject.Singleton;
|
|
||||||
import java.util.UUID;
|
import java.util.UUID;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
|
@ -10,8 +10,8 @@ import org.jetbrains.annotations.NotNull;
|
|||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import javax.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import javax.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
|
@ -8,8 +8,8 @@ import org.jsoup.nodes.Document;
|
|||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import javax.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import javax.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
|
||||||
|
@ -2,7 +2,7 @@ package nu.marginalia.converting.processor;
|
|||||||
|
|
||||||
import org.jsoup.nodes.Document;
|
import org.jsoup.nodes.Document;
|
||||||
|
|
||||||
import javax.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
public class MetaRobotsTag {
|
public class MetaRobotsTag {
|
||||||
|
@ -9,7 +9,7 @@ import nu.marginalia.model.EdgeUrl;
|
|||||||
import nu.marginalia.converting.processor.logic.links.CommonKeywordExtractor;
|
import nu.marginalia.converting.processor.logic.links.CommonKeywordExtractor;
|
||||||
import nu.marginalia.converting.processor.logic.links.TopKeywords;
|
import nu.marginalia.converting.processor.logic.links.TopKeywords;
|
||||||
|
|
||||||
import javax.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
@ -6,13 +6,12 @@ import com.google.inject.Inject;
|
|||||||
import nu.marginalia.db.storage.FileStorageService;
|
import nu.marginalia.db.storage.FileStorageService;
|
||||||
import nu.marginalia.db.storage.model.FileStorage;
|
import nu.marginalia.db.storage.model.FileStorage;
|
||||||
import nu.marginalia.db.storage.model.FileStorageType;
|
import nu.marginalia.db.storage.model.FileStorageType;
|
||||||
|
import nu.marginalia.index.construction.ReverseIndexConstructor;
|
||||||
import nu.marginalia.index.forward.ForwardIndexConverter;
|
import nu.marginalia.index.forward.ForwardIndexConverter;
|
||||||
import nu.marginalia.index.forward.ForwardIndexFileNames;
|
import nu.marginalia.index.forward.ForwardIndexFileNames;
|
||||||
import nu.marginalia.index.full.ReverseIndexFullConverter;
|
import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
||||||
import nu.marginalia.index.full.ReverseIndexFullFileNames;
|
import nu.marginalia.index.journal.reader.IndexJournalReadEntry;
|
||||||
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;
|
import nu.marginalia.index.journal.reader.IndexJournalReader;
|
||||||
import nu.marginalia.index.priority.ReverseIndexPrioFileNames;
|
|
||||||
import nu.marginalia.index.priority.ReverseIndexPriorityConverter;
|
|
||||||
import nu.marginalia.model.gson.GsonFactory;
|
import nu.marginalia.model.gson.GsonFactory;
|
||||||
import nu.marginalia.mq.MessageQueueFactory;
|
import nu.marginalia.mq.MessageQueueFactory;
|
||||||
import nu.marginalia.mq.MqMessage;
|
import nu.marginalia.mq.MqMessage;
|
||||||
@ -23,7 +22,6 @@ import nu.marginalia.mqapi.index.IndexName;
|
|||||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
import nu.marginalia.process.control.ProcessHeartbeat;
|
||||||
import nu.marginalia.ranking.DomainRankings;
|
import nu.marginalia.ranking.DomainRankings;
|
||||||
import nu.marginalia.service.module.DatabaseModule;
|
import nu.marginalia.service.module.DatabaseModule;
|
||||||
import nu.marginallia.index.journal.IndexJournalFileNames;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@ -97,52 +95,35 @@ public class IndexConstructorMain {
|
|||||||
FileStorage indexLive = fileStorageService.getStorageByType(FileStorageType.INDEX_LIVE);
|
FileStorage indexLive = fileStorageService.getStorageByType(FileStorageType.INDEX_LIVE);
|
||||||
FileStorage indexStaging = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING);
|
FileStorage indexStaging = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING);
|
||||||
|
|
||||||
Path inputFile = IndexJournalFileNames.resolve(indexStaging.asPath());
|
|
||||||
Path outputFileDocs = ReverseIndexFullFileNames.resolve(indexLive.asPath(), ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.NEXT);
|
Path outputFileDocs = ReverseIndexFullFileNames.resolve(indexLive.asPath(), ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.NEXT);
|
||||||
Path outputFileWords = ReverseIndexFullFileNames.resolve(indexLive.asPath(), ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.NEXT);
|
Path outputFileWords = ReverseIndexFullFileNames.resolve(indexLive.asPath(), ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.NEXT);
|
||||||
|
|
||||||
Path tmpDir = indexStaging.asPath().resolve("tmp");
|
Path tmpDir = indexStaging.asPath().resolve("tmp");
|
||||||
if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
|
if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
|
||||||
|
|
||||||
var journalReader = new IndexJournalReaderSingleCompressedFile(inputFile);
|
|
||||||
|
|
||||||
ReverseIndexFullConverter converter = new ReverseIndexFullConverter(
|
ReverseIndexConstructor.
|
||||||
heartbeat,
|
createReverseIndex(IndexJournalReader::singleFile,
|
||||||
tmpDir,
|
indexStaging.asPath(),
|
||||||
journalReader,
|
tmpDir,
|
||||||
domainRankings,
|
outputFileDocs,
|
||||||
outputFileWords,
|
outputFileWords);
|
||||||
outputFileDocs
|
|
||||||
);
|
|
||||||
|
|
||||||
converter.convert();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void createPrioReverseIndex() throws SQLException, IOException {
|
private void createPrioReverseIndex() throws SQLException, IOException {
|
||||||
|
|
||||||
FileStorage indexLive = fileStorageService.getStorageByType(FileStorageType.INDEX_LIVE);
|
FileStorage indexLive = fileStorageService.getStorageByType(FileStorageType.INDEX_LIVE);
|
||||||
FileStorage indexStaging = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING);
|
FileStorage indexStaging = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING);
|
||||||
|
|
||||||
Path inputFile = IndexJournalFileNames.resolve(indexStaging.asPath());
|
|
||||||
Path outputFileDocs = ReverseIndexPrioFileNames.resolve(indexLive.asPath(), ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.NEXT);
|
Path outputFileDocs = ReverseIndexPrioFileNames.resolve(indexLive.asPath(), ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.NEXT);
|
||||||
Path outputFileWords = ReverseIndexPrioFileNames.resolve(indexLive.asPath(), ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.NEXT);
|
Path outputFileWords = ReverseIndexPrioFileNames.resolve(indexLive.asPath(), ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.NEXT);
|
||||||
|
|
||||||
Path tmpDir = indexStaging.asPath().resolve("tmp");
|
Path tmpDir = indexStaging.asPath().resolve("tmp");
|
||||||
if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
|
if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
|
||||||
|
|
||||||
var journalReader = new IndexJournalReaderSingleCompressedFile(inputFile);
|
ReverseIndexConstructor.
|
||||||
|
createReverseIndex(IndexJournalReader::singleFileWithPriorityFilters,
|
||||||
ReverseIndexPriorityConverter converter = new ReverseIndexPriorityConverter(
|
indexStaging.asPath(), tmpDir, outputFileDocs, outputFileWords);
|
||||||
heartbeat,
|
|
||||||
tmpDir,
|
|
||||||
journalReader,
|
|
||||||
domainRankings,
|
|
||||||
outputFileWords,
|
|
||||||
outputFileDocs
|
|
||||||
);
|
|
||||||
|
|
||||||
converter.convert();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void createForwardIndex() throws SQLException, IOException {
|
private void createForwardIndex() throws SQLException, IOException {
|
||||||
@ -150,12 +131,11 @@ public class IndexConstructorMain {
|
|||||||
FileStorage indexLive = fileStorageService.getStorageByType(FileStorageType.INDEX_LIVE);
|
FileStorage indexLive = fileStorageService.getStorageByType(FileStorageType.INDEX_LIVE);
|
||||||
FileStorage indexStaging = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING);
|
FileStorage indexStaging = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING);
|
||||||
|
|
||||||
Path inputFile = IndexJournalFileNames.resolve(indexStaging.asPath());
|
|
||||||
Path outputFileDocsId = ForwardIndexFileNames.resolve(indexLive.asPath(), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT);
|
Path outputFileDocsId = ForwardIndexFileNames.resolve(indexLive.asPath(), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT);
|
||||||
Path outputFileDocsData = ForwardIndexFileNames.resolve(indexLive.asPath(), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT);
|
Path outputFileDocsData = ForwardIndexFileNames.resolve(indexLive.asPath(), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT);
|
||||||
|
|
||||||
ForwardIndexConverter converter = new ForwardIndexConverter(heartbeat,
|
ForwardIndexConverter converter = new ForwardIndexConverter(heartbeat,
|
||||||
inputFile.toFile(),
|
IndexJournalReader.paging(indexStaging.asPath()),
|
||||||
outputFileDocsId,
|
outputFileDocsId,
|
||||||
outputFileDocsData,
|
outputFileDocsData,
|
||||||
domainRankings
|
domainRankings
|
||||||
|
@ -28,7 +28,6 @@ dependencies {
|
|||||||
implementation project(':code:common:service-discovery')
|
implementation project(':code:common:service-discovery')
|
||||||
implementation project(':code:common:service-client')
|
implementation project(':code:common:service-client')
|
||||||
implementation project(':code:common:linkdb')
|
implementation project(':code:common:linkdb')
|
||||||
implementation project(':code:features-index:lexicon')
|
|
||||||
implementation project(':code:features-index:index-journal')
|
implementation project(':code:features-index:index-journal')
|
||||||
implementation project(':code:libraries:message-queue')
|
implementation project(':code:libraries:message-queue')
|
||||||
implementation project(':code:libraries:language-processing')
|
implementation project(':code:libraries:language-processing')
|
||||||
|
@ -1,56 +1,27 @@
|
|||||||
package nu.marginalia.loading.loader;
|
package nu.marginalia.loading.loader;
|
||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import lombok.SneakyThrows;
|
|
||||||
import nu.marginalia.keyword.model.DocumentKeywords;
|
import nu.marginalia.keyword.model.DocumentKeywords;
|
||||||
|
import nu.marginalia.model.EdgeUrl;
|
||||||
import nu.marginalia.model.id.UrlIdCodec;
|
import nu.marginalia.model.id.UrlIdCodec;
|
||||||
import nu.marginalia.model.idx.DocumentMetadata;
|
import nu.marginalia.model.idx.DocumentMetadata;
|
||||||
import nu.marginalia.model.EdgeUrl;
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.util.concurrent.LinkedBlockingQueue;
|
public class IndexLoadKeywords {
|
||||||
import java.util.concurrent.TimeUnit;
|
|
||||||
|
|
||||||
public class IndexLoadKeywords implements Runnable {
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(IndexLoadKeywords.class);
|
private static final Logger logger = LoggerFactory.getLogger(IndexLoadKeywords.class);
|
||||||
|
|
||||||
private final LinkedBlockingQueue<InsertTask> insertQueue = new LinkedBlockingQueue<>(32);
|
|
||||||
private final LoaderIndexJournalWriter journalWriter;
|
private final LoaderIndexJournalWriter journalWriter;
|
||||||
|
|
||||||
private record InsertTask(long combinedId,
|
|
||||||
int features,
|
|
||||||
DocumentMetadata metadata,
|
|
||||||
DocumentKeywords wordSet) {}
|
|
||||||
|
|
||||||
private final Thread runThread;
|
|
||||||
|
|
||||||
private volatile boolean canceled = false;
|
private volatile boolean canceled = false;
|
||||||
|
|
||||||
@Inject
|
@Inject
|
||||||
public IndexLoadKeywords(LoaderIndexJournalWriter journalWriter) {
|
public IndexLoadKeywords(LoaderIndexJournalWriter journalWriter) {
|
||||||
this.journalWriter = journalWriter;
|
this.journalWriter = journalWriter;
|
||||||
runThread = new Thread(this, getClass().getSimpleName());
|
|
||||||
runThread.start();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@SneakyThrows
|
|
||||||
public void run() {
|
|
||||||
while (!canceled) {
|
|
||||||
var data = insertQueue.poll(1, TimeUnit.SECONDS);
|
|
||||||
if (data != null) {
|
|
||||||
journalWriter.putWords(data.combinedId,
|
|
||||||
data.features,
|
|
||||||
data.metadata(),
|
|
||||||
data.wordSet);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public void close() throws Exception {
|
public void close() throws Exception {
|
||||||
if (!canceled) {
|
if (!canceled) {
|
||||||
canceled = true;
|
|
||||||
runThread.join();
|
|
||||||
journalWriter.close();
|
journalWriter.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -60,7 +31,7 @@ public class IndexLoadKeywords implements Runnable {
|
|||||||
EdgeUrl url,
|
EdgeUrl url,
|
||||||
int features,
|
int features,
|
||||||
DocumentMetadata metadata,
|
DocumentMetadata metadata,
|
||||||
DocumentKeywords words) throws InterruptedException {
|
DocumentKeywords words) {
|
||||||
long combinedId = UrlIdCodec.encodeId(loaderData.getTargetDomainId(), ordinal);
|
long combinedId = UrlIdCodec.encodeId(loaderData.getTargetDomainId(), ordinal);
|
||||||
|
|
||||||
if (combinedId <= 0) {
|
if (combinedId <= 0) {
|
||||||
@ -68,6 +39,9 @@ public class IndexLoadKeywords implements Runnable {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
insertQueue.put(new InsertTask(combinedId, features, metadata, words));
|
journalWriter.putWords(combinedId,
|
||||||
|
features,
|
||||||
|
metadata,
|
||||||
|
words);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -12,6 +12,7 @@ import nu.marginalia.converting.instruction.instructions.LoadProcessedDocumentWi
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.sql.SQLException;
 import java.util.ArrayList;
 import java.util.List;
 
@@ -33,14 +34,14 @@ public class Loader implements Interpreter, AutoCloseable {
 public final LoaderData data;
 
 public Loader(int sizeHint,
+OldDomains oldDomains,
 SqlLoadDomains sqlLoadDomains,
 SqlLoadDomainLinks sqlLoadDomainLinks,
 SqlLoadProcessedDomain sqlLoadProcessedDomain,
 LdbLoadProcessedDocument loadProcessedDocument,
 SqlLoadDomainMetadata sqlLoadDomainMetadata,
-IndexLoadKeywords indexLoadKeywords)
-{
-data = new LoaderData(sizeHint);
+IndexLoadKeywords indexLoadKeywords) {
+data = new LoaderData(oldDomains, sizeHint);
 
 this.sqlLoadDomains = sqlLoadDomains;
 this.sqlLoadDomainLinks = sqlLoadDomainLinks;
@@ -93,11 +94,7 @@ public class Loader implements Interpreter, AutoCloseable {
 }
 @Override
 public void loadKeywords(EdgeUrl url, int ordinal, int features, DocumentMetadata metadata, DocumentKeywords words) {
-try {
-indexLoadKeywords.load(data, ordinal, url, features, metadata, words);
-} catch (InterruptedException e) {
-throw new RuntimeException(e);
-}
+indexLoadKeywords.load(data, ordinal, url, features, metadata, words);
 }
 
 @Override
@@ -1,17 +1,16 @@
 package nu.marginalia.loading.loader;
 
-import gnu.trove.map.hash.TObjectIntHashMap;
 import nu.marginalia.model.EdgeDomain;
 
 public class LoaderData {
 
-private final TObjectIntHashMap<EdgeDomain> domainIds;
+private final OldDomains oldDomains;
 private EdgeDomain targetDomain;
 public final int sizeHint;
 private int targetDomainId = -1;
 
-public LoaderData(int sizeHint) {
-domainIds = new TObjectIntHashMap<>(10);
+public LoaderData(OldDomains oldDomains, int sizeHint) {
+this.oldDomains = oldDomains;
 this.sizeHint = sizeHint;
 }
 
@@ -21,17 +20,18 @@ public class LoaderData {
 public EdgeDomain getTargetDomain() {
 return targetDomain;
 }
 
 public int getTargetDomainId() {
 if (targetDomainId < 0)
-targetDomainId = domainIds.get(targetDomain);
+targetDomainId = oldDomains.getId(targetDomain);
 return targetDomainId;
 }
 
 public void addDomain(EdgeDomain domain, int id) {
-domainIds.put(domain, id);
+oldDomains.add(domain, id);
 }
 
 public int getDomainId(EdgeDomain domain) {
-return domainIds.get(domain);
+return oldDomains.getId(domain);
 }
 }
@@ -3,6 +3,7 @@ package nu.marginalia.loading.loader;
 import com.google.inject.Inject;
 
 public class LoaderFactory {
+private final OldDomains oldDomains;
 private final SqlLoadDomains sqlLoadDomains;
 private final SqlLoadDomainLinks sqlLoadDomainLinks;
 private final SqlLoadProcessedDomain sqlLoadProcessedDomain;
@@ -11,12 +12,14 @@ public class LoaderFactory {
 private final IndexLoadKeywords indexLoadKeywords;
 
 @Inject
-public LoaderFactory(SqlLoadDomains sqlLoadDomains,
+public LoaderFactory(OldDomains oldDomains,
+SqlLoadDomains sqlLoadDomains,
 SqlLoadDomainLinks sqlLoadDomainLinks,
 SqlLoadProcessedDomain sqlLoadProcessedDomain,
 LdbLoadProcessedDocument sqlLoadProcessedDocument,
 SqlLoadDomainMetadata sqlLoadDomainMetadata,
 IndexLoadKeywords indexLoadKeywords) {
+this.oldDomains = oldDomains;
 
 this.sqlLoadDomains = sqlLoadDomains;
 this.sqlLoadDomainLinks = sqlLoadDomainLinks;
@@ -27,6 +30,6 @@ public class LoaderFactory {
 }
 
 public Loader create(int sizeHint) {
-return new Loader(sizeHint, sqlLoadDomains, sqlLoadDomainLinks, sqlLoadProcessedDomain, sqlLoadProcessedDocument, sqlLoadDomainMetadata, indexLoadKeywords);
+return new Loader(sizeHint, oldDomains, sqlLoadDomains, sqlLoadDomainLinks, sqlLoadProcessedDomain, sqlLoadProcessedDocument, sqlLoadDomainMetadata, indexLoadKeywords);
 }
 }
@@ -5,15 +5,13 @@ import com.google.inject.Singleton;
 import lombok.SneakyThrows;
 import nu.marginalia.db.storage.FileStorageService;
 import nu.marginalia.db.storage.model.FileStorageType;
-import nu.marginalia.dict.OffHeapDictionaryHashMap;
+import nu.marginalia.hash.MurmurHash3_128;
 import nu.marginalia.index.journal.model.IndexJournalEntryData;
 import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
-import nu.marginalia.index.journal.writer.IndexJournalWriterImpl;
+import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl;
+import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
 import nu.marginalia.index.journal.writer.IndexJournalWriter;
 import nu.marginalia.keyword.model.DocumentKeywords;
-import nu.marginalia.lexicon.KeywordLexicon;
-import nu.marginalia.lexicon.journal.KeywordLexiconJournal;
-import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode;
 import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginallia.index.journal.IndexJournalFileNames;
 import org.slf4j.Logger;
@@ -21,40 +19,30 @@ import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.nio.file.Files;
-import java.nio.file.attribute.PosixFilePermissions;
 import java.sql.SQLException;
 import java.util.Arrays;
-import java.util.concurrent.*;
+import static nu.marginalia.index.journal.model.IndexJournalEntryData.MAX_LENGTH;
 
 @Singleton
 public class LoaderIndexJournalWriter {
 
-private final KeywordLexicon lexicon;
 private final IndexJournalWriter indexWriter;
 private static final Logger logger = LoggerFactory.getLogger(LoaderIndexJournalWriter.class);
 
 @Inject
 public LoaderIndexJournalWriter(FileStorageService fileStorageService) throws IOException, SQLException {
-var lexiconArea = fileStorageService.getStorageByType(FileStorageType.LEXICON_STAGING);
 var indexArea = fileStorageService.getStorageByType(FileStorageType.INDEX_STAGING);
 
-var lexiconPath = lexiconArea.asPath().resolve("dictionary.dat");
-var indexPath = IndexJournalFileNames.resolve(indexArea.asPath());
+var existingIndexFiles = IndexJournalFileNames.findJournalFiles(indexArea.asPath());
+for (var existingFile : existingIndexFiles) {
+Files.delete(existingFile);
+}
 
-Files.deleteIfExists(indexPath);
-Files.deleteIfExists(lexiconPath);
+indexWriter = new IndexJournalWriterPagingImpl(indexArea.asPath());
 
-Files.createFile(indexPath, PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
-Files.createFile(lexiconPath, PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rw-r--r--")));
 
-lexicon = new KeywordLexicon(new KeywordLexiconJournal(lexiconPath.toFile(), KeywordLexiconJournalMode.READ_WRITE));
-indexWriter = new IndexJournalWriterImpl(lexicon, indexPath);
 }
 
-private final LinkedBlockingQueue<Runnable> keywordInsertTaskQueue =
-new LinkedBlockingQueue<>(65536);
-private final ExecutorService keywordInsertionExecutor =
-new ThreadPoolExecutor(8, 16, 1, TimeUnit.MINUTES, keywordInsertTaskQueue);
+MurmurHash3_128 hasher = new MurmurHash3_128();
 
 @SneakyThrows
 public void putWords(long combinedId,
@@ -71,60 +59,32 @@ public class LoaderIndexJournalWriter {
 return;
 }
 
-// Due to the very bursty access patterns of this method, doing the actual insertions in separate threads
-// with a chonky work queue is a fairly decent improvement
-for (var chunk : KeywordListChunker.chopList(wordSet, IndexJournalEntryData.MAX_LENGTH)) {
-try {
-keywordInsertionExecutor.submit(() -> loadWords(combinedId, features, metadata, chunk));
-}
-catch (RejectedExecutionException ex) {
-loadWords(combinedId, features, metadata, chunk);
+String[] words = wordSet.keywords();
+long[] wordIds = new long[wordSet.size()];
+long[] meta = wordSet.metadata();
+Arrays.parallelSetAll(wordIds, i -> hasher.hashNearlyASCII(words[i]));
+long[] buffer = new long[MAX_LENGTH * 2];
+for (int start = 0; start < words.length; ) {
+int end = Math.min(start + MAX_LENGTH, words.length);
 
+for (int i = 0; i < end - start; i++) {
+buffer[2*i] = wordIds[i];
+buffer[2*i + 1] = meta[i];
 }
 
+var entry = new IndexJournalEntryData(end-start, buffer);
+var header = new IndexJournalEntryHeader(combinedId, features, metadata.encode());
 
+indexWriter.put(header, entry);
 
+start = end;
 }
 
 }
 
-private void loadWords(long combinedId,
-int features,
-DocumentMetadata metadata,
-DocumentKeywords wordSet) {
-if (null == metadata) {
-logger.warn("Null metadata for {}", combinedId);
-return;
-}
 
-var entry = new IndexJournalEntryData(getOrInsertWordIds(wordSet.keywords(), wordSet.metadata()));
-var header = new IndexJournalEntryHeader(combinedId, features, metadata.encode());
 
-indexWriter.put(header, entry);
-}
 
-private long[] getOrInsertWordIds(String[] words, long[] meta) {
-long[] ids = new long[words.length*2];
-int putIdx = 0;
 
-for (int i = 0; i < words.length; i++) {
-String word = words[i];
 
-long id = lexicon.getOrInsert(word);
-if (id != OffHeapDictionaryHashMap.NO_VALUE) {
-ids[putIdx++] = id;
-ids[putIdx++] = meta[i];
-}
-}
 
-if (putIdx != words.length*2) {
-ids = Arrays.copyOf(ids, putIdx);
-}
-return ids;
-}
 
 public void close() throws Exception {
-keywordInsertionExecutor.shutdown();
-while (!keywordInsertionExecutor.awaitTermination(1, TimeUnit.DAYS)) {
-// ...?
-}
 indexWriter.close();
-lexicon.close();
 }
 }
@@ -0,0 +1,41 @@
+package nu.marginalia.loading.loader;
+
+import com.google.inject.Inject;
+import com.zaxxer.hikari.HikariDataSource;
+import gnu.trove.map.hash.TObjectIntHashMap;
+import nu.marginalia.model.EdgeDomain;
+
+import java.sql.SQLException;
+
+import static java.sql.Statement.SUCCESS_NO_INFO;
+
+public class OldDomains {
+
+private final TObjectIntHashMap<EdgeDomain> knownDomains = new TObjectIntHashMap<>(100_000, 0.75f, -1);
+
+@Inject
+public OldDomains(HikariDataSource dataSource) {
+try (var conn = dataSource.getConnection()) {
+try (var stmt = conn.prepareStatement("""
+SELECT DOMAIN_NAME, ID FROM EC_DOMAIN
+"""))
+{
+var rs = stmt.executeQuery();
+while (rs.next()) {
+knownDomains.put(new EdgeDomain(rs.getString(1)), rs.getInt(2));
+}
+}
+}
+catch (SQLException ex) {
+throw new RuntimeException("Failed to set up loader", ex);
+}
+}
+
+public int getId(EdgeDomain domain) {
+return knownDomains.get(domain);
+}
+
+public void add(EdgeDomain domain, int id) {
+knownDomains.put(domain, id);
+}
+}
@@ -5,7 +5,7 @@ import nu.marginalia.model.EdgeDomain;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import javax.inject.Inject;
+import com.google.inject.Inject;
 import java.sql.SQLException;
 
 public class SqlLoadDomainMetadata {
@@ -6,6 +6,7 @@ import nu.marginalia.model.EdgeDomain;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.IOException;
 import java.sql.Connection;
 import java.sql.SQLException;
 
@@ -4,7 +4,7 @@ import lombok.AllArgsConstructor;
 import lombok.SneakyThrows;
 import lombok.ToString;
 
-import javax.inject.Singleton;
+import com.google.inject.Singleton;
 import java.math.RoundingMode;
 import java.text.DecimalFormat;
 import java.text.NumberFormat;
@@ -3,8 +3,8 @@ package nu.marginalia.assistant.eval;
 import com.opencsv.CSVReader;
 import lombok.SneakyThrows;
 
-import javax.inject.Inject;
-import javax.inject.Singleton;
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
 import java.io.InputStreamReader;
 import java.nio.charset.StandardCharsets;
 import java.text.DecimalFormat;
@@ -36,6 +36,7 @@ dependencies {
 implementation project(':code:api:index-api')
 implementation project(':code:api:process-mqapi')
 implementation project(':code:features-search:screenshots')
+implementation project(':code:features-index:index-journal')
 
 implementation libs.lombok
 annotationProcessor libs.lombok
@@ -296,15 +296,6 @@ public class ConvertAndLoadActor extends AbstractActorPrototype {
 """
 )
 public void switchOver(Long id) throws Exception {
-var live = storageService.getStorageByType(FileStorageType.LEXICON_LIVE);
-var staging = storageService.getStorageByType(FileStorageType.LEXICON_STAGING);
-var fromSource = staging.asPath().resolve("dictionary.dat");
-var liveDest = live.asPath().resolve("dictionary.dat");
-
-// Swap in new lexicon
-logger.info("Moving " + fromSource + " to " + liveDest);
-Files.move(fromSource, liveDest, StandardCopyOption.REPLACE_EXISTING, StandardCopyOption.ATOMIC_MOVE);
-
 // Notify services to switch over
 searchOutbox.sendNotice(SearchMqEndpoints.SWITCH_LINKDB, ":-)");
 indexOutbox.sendNotice(IndexMqEndpoints.INDEX_REINDEX, ":^D");
@@ -8,8 +8,8 @@ import org.slf4j.LoggerFactory;
 import org.slf4j.Marker;
 import org.slf4j.MarkerFactory;
 
-import javax.inject.Inject;
-import javax.inject.Singleton;
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
@@ -7,9 +7,10 @@ import nu.marginalia.db.storage.model.FileStorage;
 import nu.marginalia.db.storage.model.FileStorageBaseType;
 import nu.marginalia.db.storage.model.FileStorageId;
 import nu.marginalia.db.storage.model.FileStorageType;
+import nu.marginallia.index.journal.IndexJournalFileNames;
 import org.apache.commons.io.IOUtils;
 
-import javax.inject.Inject;
+import com.google.inject.Inject;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.sql.SQLException;
@@ -38,12 +39,10 @@ public class BackupService {
 
 var indexStagingStorage = storageService.getStorageByType(FileStorageType.INDEX_STAGING);
 var linkdbStagingStorage = storageService.getStorageByType(FileStorageType.LINKDB_STAGING);
-var lexiconStagingStorage = storageService.getStorageByType(FileStorageType.LEXICON_STAGING);
 
 backupFileCompressed("links.db", linkdbStagingStorage, backupStorage);
-backupFileCompressed("dictionary.dat", lexiconStagingStorage, backupStorage);
 // This file format is already compressed
-backupFileNoCompression("page-index.dat", indexStagingStorage, backupStorage);
+backupJournal(indexStagingStorage, backupStorage);
 }
 
 
@@ -53,29 +52,37 @@ public class BackupService {
 
 var indexStagingStorage = storageService.getStorageByType(FileStorageType.INDEX_STAGING);
 var linkdbStagingStorage = storageService.getStorageByType(FileStorageType.LINKDB_STAGING);
-var lexiconStagingStorage = storageService.getStorageByType(FileStorageType.LEXICON_STAGING);
 
 restoreBackupCompressed("links.db", linkdbStagingStorage, backupStorage);
-restoreBackupCompressed("dictionary.dat", lexiconStagingStorage, backupStorage);
-restoreBackupNoCompression("page-index.dat", indexStagingStorage, backupStorage);
+restoreJournal(indexStagingStorage, backupStorage);
 }
 
 
-private void backupFileNoCompression(String fileName, FileStorage inputStorage, FileStorage backupStorage) throws IOException
+private void backupJournal(FileStorage inputStorage, FileStorage backupStorage) throws IOException
 {
-try (var is = Files.newInputStream(inputStorage.asPath().resolve(fileName));
-var os = Files.newOutputStream(backupStorage.asPath().resolve(fileName))
-) {
-IOUtils.copyLarge(is, os);
+for (var source : IndexJournalFileNames.findJournalFiles(inputStorage.asPath())) {
+var dest = backupStorage.asPath().resolve(source.toFile().getName());
+try (var is = Files.newInputStream(source);
+var os = Files.newOutputStream(dest)
+) {
+IOUtils.copyLarge(is, os);
+}
 }
 
 }
 
-private void restoreBackupNoCompression(String fileName, FileStorage inputStorage, FileStorage backupStorage) throws IOException {
-try (var is = Files.newInputStream(backupStorage.asPath().resolve(fileName));
-var os = Files.newOutputStream(inputStorage.asPath().resolve(fileName))
-) {
-IOUtils.copyLarge(is, os);
+private void restoreJournal(FileStorage destStorage, FileStorage backupStorage) throws IOException {
+for (var source : IndexJournalFileNames.findJournalFiles(backupStorage.asPath())) {
+var dest = destStorage.asPath().resolve(source.toFile().getName());
+try (var is = Files.newInputStream(source);
+var os = Files.newOutputStream(dest)
+) {
+IOUtils.copyLarge(is, os);
+}
 }
 
 }
 
 private void backupFileCompressed(String fileName, FileStorage inputStorage, FileStorage backupStorage) throws IOException
@@ -35,13 +35,14 @@ dependencies {
 implementation project(':code:features-index:index-query')
 implementation project(':code:features-index:index-forward')
 implementation project(':code:features-index:index-reverse')
-implementation project(':code:features-index:lexicon')
-
 implementation project(':code:features-index:domain-ranking')
 implementation project(':code:features-search:result-ranking')
+implementation project(':third-party:commons-codec')
 
 
 implementation libs.lombok
+testImplementation project(path: ':code:services-core:control-service')
+testImplementation project(':code:common:process')
 annotationProcessor libs.lombok
 implementation libs.bundles.slf4j
 
@@ -8,10 +8,6 @@ import nu.marginalia.db.storage.FileStorageService;
 import nu.marginalia.db.storage.model.FileStorageType;
 import nu.marginalia.index.config.RankingSettings;
 import nu.marginalia.WmsaHome;
-import nu.marginalia.lexicon.KeywordLexicon;
-import nu.marginalia.lexicon.KeywordLexiconReadOnlyView;
-import nu.marginalia.lexicon.journal.KeywordLexiconJournal;
-import nu.marginalia.lexicon.journal.KeywordLexiconJournalMode;
 import nu.marginalia.service.control.ServiceEventLog;
 
 import java.nio.file.Path;
@@ -23,23 +19,6 @@ public class IndexModule extends AbstractModule {
 public void configure() {
 }
 
-@Provides
-@SneakyThrows
-@Singleton
-private KeywordLexiconReadOnlyView createLexicon(ServiceEventLog eventLog, FileStorageService fileStorageService) {
-try {
-eventLog.logEvent("INDEX-LEXICON-LOAD-BEGIN", "");
-
-var area = fileStorageService.getStorageByType(FileStorageType.LEXICON_LIVE);
-var path = area.asPath().resolve("dictionary.dat");
-
-return new KeywordLexiconReadOnlyView(new KeywordLexicon(new KeywordLexiconJournal(path.toFile(), KeywordLexiconJournalMode.READ_ONLY)));
-}
-finally {
-eventLog.logEvent("INDEX-LEXICON-LOAD-OK", "");
-}
-}
-
 @Provides
 public RankingSettings rankingSettings() {
 Path dir = WmsaHome.getHomePath().resolve("conf/ranking-settings.yaml");
|
|||||||
|
|
||||||
@MqRequest(endpoint = IndexMqEndpoints.INDEX_RELOAD_LEXICON)
|
@MqRequest(endpoint = IndexMqEndpoints.INDEX_RELOAD_LEXICON)
|
||||||
public String reloadLexicon(String message) throws Exception {
|
public String reloadLexicon(String message) throws Exception {
|
||||||
|
throw new UnsupportedOperationException();
|
||||||
if (!opsService.reloadLexicon()) {
|
|
||||||
throw new IllegalStateException("Ops lock busy");
|
|
||||||
}
|
|
||||||
|
|
||||||
return "ok";
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@@ -4,32 +4,18 @@ import com.google.inject.Inject;
 import com.google.inject.Singleton;
 import nu.marginalia.db.storage.FileStorageService;
 import nu.marginalia.db.storage.model.FileStorageType;
-import nu.marginalia.index.forward.ForwardIndexConverter;
 import nu.marginalia.index.forward.ForwardIndexFileNames;
 import nu.marginalia.index.forward.ForwardIndexReader;
-import nu.marginalia.index.full.ReverseIndexFullFileNames;
-import nu.marginalia.index.journal.reader.IndexJournalReaderSingleCompressedFile;
-import nu.marginalia.index.priority.ReverseIndexPrioFileNames;
-import nu.marginalia.index.priority.ReverseIndexPriorityConverter;
-import nu.marginalia.index.full.ReverseIndexFullConverter;
-import nu.marginalia.index.priority.ReverseIndexPriorityReader;
-import nu.marginalia.index.priority.ReverseIndexPriorityParameters;
-import nu.marginalia.index.full.ReverseIndexFullReader;
-import nu.marginalia.ranking.DomainRankings;
 import nu.marginalia.index.index.SearchIndexReader;
 import nu.marginalia.service.control.ServiceHeartbeat;
-import org.checkerframework.checker.units.qual.C;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.File;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.StandardCopyOption;
 import java.sql.SQLException;
-import java.util.concurrent.Callable;
-import java.util.stream.Stream;
 
 @Singleton
 public class IndexServicesFactory {
@@ -55,16 +41,16 @@ public class IndexServicesFactory {
 return searchSetsBase;
 }
 
-public ReverseIndexFullReader getReverseIndexReader() throws IOException {
+public ReverseIndexReader getReverseIndexReader() throws IOException {
 
-return new ReverseIndexFullReader(
+return new ReverseIndexReader(
 ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.CURRENT),
 ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.CURRENT)
 );
 }
 
-public ReverseIndexPriorityReader getReverseIndexPrioReader() throws IOException {
-return new ReverseIndexPriorityReader(
+public ReverseIndexReader getReverseIndexPrioReader() throws IOException {
+return new ReverseIndexReader(
 ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.CURRENT),
 ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.CURRENT)
 );
@@ -5,8 +5,8 @@ import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import javax.inject.Inject;
-import javax.inject.Singleton;
+import com.google.inject.Inject;
+import com.google.inject.Singleton;
 import java.sql.SQLException;
 
 @Singleton
@@ -107,8 +107,8 @@ public class SearchIndex {
 return Collections.emptyList();
 }
 
-final int[] orderedIncludes = terms.sortedDistinctIncludes(this::compareKeywords);
-final int[] orderedIncludesPrio = terms.sortedDistinctIncludes(this::compareKeywordsPrio);
+final long[] orderedIncludes = terms.sortedDistinctIncludes(this::compareKeywords);
+final long[] orderedIncludesPrio = terms.sortedDistinctIncludes(this::compareKeywordsPrio);
 
 List<IndexQueryBuilder> queryHeads = new ArrayList<>(10);
 List<IndexQuery> queries = new ArrayList<>(10);
@@ -146,11 +146,11 @@ public class SearchIndex {
 return Collections.emptyList();
 }
 
-for (int orderedInclude : orderedIncludes) {
+for (long orderedInclude : orderedIncludes) {
 query = query.alsoFull(orderedInclude);
 }
 
-for (int term : terms.excludes()) {
+for (long term : terms.excludes()) {
 query = query.notFull(term);
 }
 
@@ -166,14 +166,14 @@ public class SearchIndex {
 return queries;
 }
 
-private int compareKeywords(int a, int b) {
+private int compareKeywords(long a, long b) {
 return Long.compare(
 indexReader.numHits(a),
 indexReader.numHits(b)
 );
 }
 
-private int compareKeywordsPrio(int a, int b) {
+private int compareKeywordsPrio(long a, long b) {
 return Long.compare(
 indexReader.numHitsPrio(a),
 indexReader.numHitsPrio(b)
@@ -184,7 +184,7 @@ public class SearchIndex {
 * document identifiers provided; with metadata for termId. The input array
 * docs[] *must* be sorted.
 */
-public long[] getTermMetadata(int termId, long[] docs) {
+public long[] getTermMetadata(long termId, long[] docs) {
 return indexReader.getMetadata(termId, docs);
 }
 
@@ -199,10 +199,10 @@ public class SearchIndex {
 return indexReader.totalDocCount();
 }
 
-public int getTermFrequency(int id) {
+public int getTermFrequency(long id) {
 return (int) indexReader.numHits(id);
 }
-public int getTermFrequencyPrio(int id) {
+public int getTermFrequencyPrio(long id) {
 return (int) indexReader.numHitsPrio(id);
 }
 }
@@ -1,19 +1,15 @@
 package nu.marginalia.index.index;
 
-import gnu.trove.set.hash.TIntHashSet;
-import nu.marginalia.index.priority.ReverseIndexPriorityReader;
+import gnu.trove.set.hash.TLongHashSet;
+import nu.marginalia.index.ReverseIndexReader;
 import nu.marginalia.index.query.IndexQuery;
 import nu.marginalia.index.query.IndexQueryBuilder;
 import nu.marginalia.index.query.filter.QueryFilterStepIf;
-import nu.marginalia.index.full.ReverseIndexFullReader;
 
-import java.util.stream.Collectors;
-import java.util.stream.IntStream;
-
 public class SearchIndexQueryBuilder implements IndexQueryBuilder {
 private final IndexQuery query;
-private final ReverseIndexFullReader reverseIndexFullReader;
-private final ReverseIndexPriorityReader reverseIndexPrioReader;
+private final ReverseIndexReader reverseIndexFullReader;
+private final ReverseIndexReader reverseIndexPrioReader;
 
 /* Keep track of already added include terms to avoid redundant checks.
 *
@@ -21,11 +17,11 @@ public class SearchIndexQueryBuilder implements IndexQueryBuilder {
 * first check one index and then another for the same term. At the moment, that
 * makes no sense, but in the future, that might be a thing one might want to do.
 * */
-private final TIntHashSet alreadyConsideredTerms = new TIntHashSet();
+private final TLongHashSet alreadyConsideredTerms = new TLongHashSet();
 
-SearchIndexQueryBuilder(ReverseIndexFullReader reverseIndexFullReader,
-ReverseIndexPriorityReader reverseIndexPrioReader,
-IndexQuery query, int... sourceTerms)
+SearchIndexQueryBuilder(ReverseIndexReader reverseIndexFullReader,
+ReverseIndexReader reverseIndexPrioReader,
+IndexQuery query, long... sourceTerms)
 {
 this.query = query;
 this.reverseIndexFullReader = reverseIndexFullReader;
@@ -34,7 +30,7 @@ public class SearchIndexQueryBuilder implements IndexQueryBuilder {
 alreadyConsideredTerms.addAll(sourceTerms);
 }
 
-public IndexQueryBuilder alsoFull(int termId) {
+public IndexQueryBuilder alsoFull(long termId) {
 
 if (alreadyConsideredTerms.add(termId)) {
 query.addInclusionFilter(reverseIndexFullReader.also(termId));
@@ -43,7 +39,7 @@ public class SearchIndexQueryBuilder implements IndexQueryBuilder {
 return this;
 }
 
-public IndexQueryBuilder alsoPrio(int termId) {
+public IndexQueryBuilder alsoPrio(long termId) {
 
 if (alreadyConsideredTerms.add(termId)) {
 query.addInclusionFilter(reverseIndexPrioReader.also(termId));
@@ -52,7 +48,7 @@ public class SearchIndexQueryBuilder implements IndexQueryBuilder {
 return this;
 }
 
-public IndexQueryBuilder notFull(int termId) {
+public IndexQueryBuilder notFull(long termId) {
 
 query.addInclusionFilter(reverseIndexFullReader.not(termId));
 
@@ -1,11 +1,10 @@
 package nu.marginalia.index.index;
 
+import nu.marginalia.index.ReverseIndexReader;
 import nu.marginalia.index.forward.ForwardIndexReader;
 import nu.marginalia.index.forward.ParamMatchingQueryFilter;
 import nu.marginalia.index.query.*;
 import nu.marginalia.index.query.filter.QueryFilterStepIf;
-import nu.marginalia.index.priority.ReverseIndexPriorityReader;
-import nu.marginalia.index.full.ReverseIndexFullReader;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -16,25 +15,25 @@ public class SearchIndexReader {
 private final Logger logger = LoggerFactory.getLogger(getClass());
 
 private final ForwardIndexReader forwardIndexReader;
-private final ReverseIndexFullReader reverseIndexFullReader;
-private final ReverseIndexPriorityReader reverseIndexPriorityReader;
+private final ReverseIndexReader reverseIndexFullReader;
+private final ReverseIndexReader reverseIndexPriorityReader;
 
 public SearchIndexReader(ForwardIndexReader forwardIndexReader,
-ReverseIndexFullReader reverseIndexFullReader,
-ReverseIndexPriorityReader reverseIndexPriorityReader) {
+ReverseIndexReader reverseIndexFullReader,
+ReverseIndexReader reverseIndexPriorityReader) {
 this.forwardIndexReader = forwardIndexReader;
 this.reverseIndexFullReader = reverseIndexFullReader;
 this.reverseIndexPriorityReader = reverseIndexPriorityReader;
 }
 
-public IndexQueryBuilder findPriorityWord(IndexQueryPriority priority, int wordId, int fetchSizeMultiplier) {
-var sources = List.of(reverseIndexPriorityReader.priorityDocuments(wordId));
+public IndexQueryBuilder findPriorityWord(IndexQueryPriority priority, long wordId, int fetchSizeMultiplier) {
+var sources = List.of(reverseIndexPriorityReader.documents(wordId));
 
 return new SearchIndexQueryBuilder(reverseIndexFullReader, reverseIndexPriorityReader,
 new IndexQuery(sources, priority, fetchSizeMultiplier), wordId);
 }
 
-public IndexQueryBuilder findFullWord(IndexQueryPriority priority, int wordId, int fetchSizeMultiplier) {
+public IndexQueryBuilder findFullWord(IndexQueryPriority priority, long wordId, int fetchSizeMultiplier) {
 var sources = List.of(reverseIndexFullReader.documents(wordId));
 
 return new SearchIndexQueryBuilder(reverseIndexFullReader, reverseIndexPriorityReader,
@@ -45,14 +44,14 @@ public class SearchIndexReader {
 return new ParamMatchingQueryFilter(params, forwardIndexReader);
 }
 
-public long numHits(int word) {
+public long numHits(long word) {
 return reverseIndexFullReader.numDocuments(word);
 }
-public long numHitsPrio(int word) {
+public long numHitsPrio(long word) {
 return reverseIndexPriorityReader.numDocuments(word);
 }
 
-public long[] getMetadata(int wordId, long[] docIds) {
+public long[] getMetadata(long wordId, long[] docIds) {
 return reverseIndexFullReader.getTermMeta(wordId, docIds);
 }
 
@@ -1,35 +1,35 @@
 package nu.marginalia.index.index;
 
-import it.unimi.dsi.fastutil.ints.IntArrayList;
-import it.unimi.dsi.fastutil.ints.IntComparator;
-import it.unimi.dsi.fastutil.ints.IntList;
-import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
+import it.unimi.dsi.fastutil.longs.LongArrayList;
+import it.unimi.dsi.fastutil.longs.LongComparator;
+import it.unimi.dsi.fastutil.longs.LongList;
+import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
 
 import java.util.Collections;
 import java.util.List;
 
 public record SearchIndexSearchTerms(
-IntList includes,
-IntList excludes,
-IntList priority,
-List<IntList> coherences
+LongList includes,
+LongList excludes,
+LongList priority,
+List<LongList> coherences
 )
 {
 public SearchIndexSearchTerms() {
-this(IntList.of(), IntList.of(), IntList.of(), Collections.emptyList());
+this(LongList.of(), LongList.of(), LongList.of(), Collections.emptyList());
 }
 
 public boolean isEmpty() {
 return includes.isEmpty();
 }
 
-public int[] sortedDistinctIncludes(IntComparator comparator) {
+public long[] sortedDistinctIncludes(LongComparator comparator) {
 if (includes.isEmpty())
-return includes.toIntArray();
+return includes.toLongArray();
 
-IntList list = new IntArrayList(new IntOpenHashSet(includes));
+LongList list = new LongArrayList(new LongOpenHashSet(includes));
 list.sort(comparator);
-return list.toIntArray();
+return list.toLongArray();
 }
 
 public int size() {
Some files were not shown because too many files have changed in this diff.