(index) Integrate positions data with indexes WIP

This change integrates the new positions data with the forward and reverse indexes. The ranking code is still only partially rewritten.

parent 9f982a0c3d
commit 36160988e2
@@ -5,8 +5,8 @@ import java.util.stream.IntStream;

/** A compiled index service query */
public class CompiledQueryInt {
private final CqExpression root;
private final CqDataInt data;
public final CqExpression root;
public final CqDataInt data;

public CompiledQueryInt(CqExpression root, CqDataInt data) {
this.root = root;
@@ -26,7 +26,7 @@ public class CompiledQueryInt {
return IntStream.range(0, data.size());
}

public long at(int index) {
public int at(int index) {
return data.get(index);
}
@@ -61,7 +61,8 @@ public class CompiledQueryParser {

String[] cqData = new String[wordIds.size()];
wordIds.forEach((w, i) -> cqData[i] = w);
return new CompiledQuery<>(root, new CqData<>(cqData));

return root.newQuery(cqData);

}
@@ -8,6 +8,18 @@ import java.util.stream.Stream;
*
*/
public sealed interface CqExpression {
/** Create a new query for the provided data using this expression as the root */
default <T> CompiledQuery<T> newQuery(T[] data) {
return new CompiledQuery<>(this, data);
}
/** Create a new query for the provided data using this expression as the root */
default CompiledQueryInt newQuery(int[] data) {
return new CompiledQueryInt(this, new CqDataInt(data));
}
/** Create a new query for the provided data using this expression as the root */
default CompiledQueryLong newQuery(long[] data) {
return new CompiledQueryLong(this, new CqDataLong(data));
}

Stream<Word> stream();
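The newQuery overloads above let one expression tree be paired with parallel per-term data arrays of different types. A usage sketch following the pattern used later in IndexResultValuationContext; the query object and arrays here are hypothetical:

// Given an existing CompiledQuery<String> compiledQuery, derive typed views over
// per-term data arrays that share the same expression structure (illustrative only):
long[] wordFlags = new long[compiledQuery.size()];
int[] positionCounts = new int[compiledQuery.size()];

CompiledQueryLong flagsQuery = compiledQuery.root.newQuery(wordFlags);
CompiledQueryInt countsQuery = compiledQuery.root.newQuery(positionCounts);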
@@ -2,6 +2,7 @@ package nu.marginalia.api.searchquery.model.compiled.aggregate;

import it.unimi.dsi.fastutil.longs.LongSet;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;

import java.util.ArrayList;
@@ -36,7 +37,10 @@ public class CompiledQueryAggregates {
public static <T> int intMaxMinAggregate(CompiledQuery<T> query, ToIntFunction<T> operator) {
return query.root.visit(new CqIntMaxMinOperator(query, operator));
}

/** Apply the operator to each leaf node, then return the highest minimum value found along any path */
public static <T> int intMaxMinAggregate(CompiledQueryInt query, IntUnaryOperator operator) {
return query.root.visit(new CqIntMaxMinOperator(query, operator));
}
/** Apply the operator to each leaf node, then return the highest minimum value found along any path */
public static int intMaxMinAggregate(CompiledQueryLong query, LongToIntFunction operator) {
return query.root.visit(new CqIntMaxMinOperator(query, operator));
@@ -1,6 +1,7 @@
package nu.marginalia.api.searchquery.model.compiled.aggregate;

import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;

@@ -21,7 +22,9 @@ public class CqIntMaxMinOperator implements CqExpression.IntVisitor {
public CqIntMaxMinOperator(CompiledQueryLong query, LongToIntFunction operator) {
this.operator = idx -> operator.applyAsInt(query.at(idx));
}

public CqIntMaxMinOperator(CompiledQueryInt query, IntUnaryOperator operator) {
this.operator = idx -> operator.applyAsInt(query.at(idx));
}
@Override
public int onAnd(List<? extends CqExpression> parts) {
int value = parts.getFirst().visit(this);
@@ -36,6 +36,10 @@ public class SearchQuery {
@Deprecated // why does this exist?
private double value = 0;

public static SearchQueryBuilder builder(String compiledQuery) {
return new SearchQueryBuilder(compiledQuery);
}

public SearchQuery() {
this.compiledQuery = "";
this.searchTermsInclude = new ArrayList<>();
@@ -81,5 +85,45 @@ public class SearchQuery {
return sb.toString();
}

public static class SearchQueryBuilder {
private final String compiledQuery;
private List<String> searchTermsInclude = new ArrayList<>();
private List<String> searchTermsExclude = new ArrayList<>();
private List<String> searchTermsAdvice = new ArrayList<>();
private List<String> searchTermsPriority = new ArrayList<>();
private List<List<String>> searchTermCoherences = new ArrayList<>();

private SearchQueryBuilder(String compiledQuery) {
this.compiledQuery = compiledQuery;
}

public SearchQueryBuilder include(String... terms) {
searchTermsInclude.addAll(List.of(terms));
return this;
}

public SearchQueryBuilder exclude(String... terms) {
searchTermsExclude.addAll(List.of(terms));
return this;
}

public SearchQueryBuilder advice(String... terms) {
searchTermsAdvice.addAll(List.of(terms));
return this;
}

public SearchQueryBuilder priority(String... terms) {
searchTermsPriority.addAll(List.of(terms));
return this;
}

public SearchQueryBuilder coherences(String... coherences) {
searchTermCoherences.add(List.of(coherences));
return this;
}

public SearchQuery build() {
return new SearchQuery(compiledQuery, searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchTermCoherences);
}
}
}
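A usage sketch for the new builder; the terms are made up for illustration:

SearchQuery query = SearchQuery.builder("tree bonsai")
.include("tree", "bonsai")
.exclude("plastic")
.priority("garden")
.coherences("bonsai", "tree")
.build();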
@@ -32,13 +32,11 @@ public class SearchResultItem implements Comparable<SearchResultItem> {

public SearchResultItem(long combinedId,
long encodedDocMetadata,
int htmlFeatures,
boolean hasPrioTerm) {
int htmlFeatures) {
this.combinedId = combinedId;
this.encodedDocMetadata = encodedDocMetadata;
this.keywordScores = new ArrayList<>();
this.htmlFeatures = htmlFeatures;
this.hasPrioTerm = hasPrioTerm;
}
@@ -83,8 +83,10 @@ public class ForwardIndexConverter {
int ranking = domainRankings.getRanking(domainId);
long meta = DocumentMetadata.encodeRank(pointer.documentMeta(), ranking);

long features = pointer.documentFeatures() | ((long) pointer.documentSize() << 32L);

docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta);
docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, pointer.documentFeatures());
docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, features);
}

progress.progress(TaskSteps.FORCE);
@@ -82,9 +82,19 @@ public class ForwardIndexReader {
long offset = idxForDoc(docId);
if (offset < 0) return 0;

return (int) data.get(ENTRY_SIZE * offset + FEATURES_OFFSET);
return (int) (data.get(ENTRY_SIZE * offset + FEATURES_OFFSET) & 0xFFFF_FFFFL);
}

public int getDocumentSize(long docId) {
assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";

long offset = idxForDoc(docId);
if (offset < 0) return 0;

return (int) (data.get(ENTRY_SIZE * offset + FEATURES_OFFSET) >>> 32L);
}

private int idxForDoc(long docId) {
assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
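The converter above packs the document features into the low 32 bits and the document size into the high 32 bits of a single long; the reader then unpacks them with the mask and shift shown. A self-contained sketch of the same packing (values are arbitrary):

public class FeatureSizePackingDemo {
    // features occupy the low 32 bits, document size the high 32 bits
    static long pack(int features, int documentSize) {
        return (features & 0xFFFF_FFFFL) | ((long) documentSize << 32);
    }
    static int features(long packed) { return (int) (packed & 0xFFFF_FFFFL); }
    static int documentSize(long packed) { return (int) (packed >>> 32); }

    public static void main(String[] args) {
        long packed = pack(0b1011, 5000);
        System.out.println(features(packed) + " " + documentSize(packed)); // 11 5000
    }
}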
@@ -79,6 +79,7 @@ class ForwardIndexConverterTest {
writer.put(
new IndexJournalEntryHeader(createId(id, id/20),
id%3,
15,
(id % 5)),
new IndexJournalEntryData(
new String[]{},
@@ -17,14 +17,17 @@ import nu.marginalia.model.idx.DocumentMetadata;
*/
public record IndexJournalEntryHeader(int entrySize,
int documentFeatures,
int documentSize,
long combinedId,
long documentMeta) {

public IndexJournalEntryHeader(long combinedId,
int documentFeatures,
int documentSize,
long documentMeta) {
this(-1,
documentFeatures,
documentSize,
combinedId,
documentMeta);
}
@@ -28,12 +28,17 @@ public class IndexJournalReadEntry implements Iterable<IndexJournalEntryTermData
public static IndexJournalReadEntry read(DataInputStream inputStream) throws IOException {

final long sizeBlock = inputStream.readLong();
final int entrySize = (int) (sizeBlock >>> 48L);
final int docSize = (int) ((sizeBlock >>> 32L) & 0xFFFFL);
final int docFeatures = (int) (sizeBlock & 0xFFFF_FFFFL);
final long docId = inputStream.readLong();
final long meta = inputStream.readLong();

var header = new IndexJournalEntryHeader(
(int) (sizeBlock >>> 32L),
(int) (sizeBlock & 0xFFFF_FFFFL),
entrySize,
docFeatures,
docSize,
docId,
meta);

@@ -57,6 +62,10 @@ public class IndexJournalReadEntry implements Iterable<IndexJournalEntryTermData
return header.documentFeatures();
}

public int documentSize() {
return header.documentSize();
}

public int domainId() {
return UrlIdCodec.getDomainId(docId());
}
@@ -88,7 +97,7 @@ class TermDataIterator implements Iterator<IndexJournalEntryTermData> {
public IndexJournalEntryTermData next() {
// read the metadata for the term
long termId = buffer.getLong();
long meta = buffer.getLong();
long meta = buffer.getShort();

// read the size of the sequence data
int size = buffer.get() & 0xFF;
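The document header now begins with a single long whose high 16 bits hold the record size, the next 16 bits the document size, and the low 32 bits the document features, which the read() method above decodes. A self-contained sketch of that layout with made-up values:

public class SizeBlockDemo {
    public static void main(String[] args) {
        // Equivalent to writing: short recordSize, short documentSize, int documentFeatures (big-endian)
        long sizeBlock = ((long) 123 << 48) | ((long) 4567 << 32) | 0xCAFEL;

        int entrySize = (int) (sizeBlock >>> 48);
        int docSize = (int) ((sizeBlock >>> 32) & 0xFFFFL);
        int docFeatures = (int) (sizeBlock & 0xFFFF_FFFFL);

        System.out.println(entrySize + " " + docSize + " " + Integer.toHexString(docFeatures)); // 123 4567 cafe
    }
}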
@@ -13,7 +13,7 @@ public interface IndexJournalReader {
int FILE_HEADER_SIZE_BYTES = 8 * FILE_HEADER_SIZE_LONGS;

int DOCUMENT_HEADER_SIZE_BYTES = 24;
int TERM_HEADER_SIZE_BYTES = 17;
int TERM_HEADER_SIZE_BYTES = 11;

/** Create a reader for a single file. */
static IndexJournalReader singleFile(Path fileName) throws IOException {
@@ -97,6 +97,9 @@ class SingleFileJournalPointer implements IndexJournalPointer {
@Override
public int documentFeatures() { return entry.documentFeatures(); }

@Override
public int documentSize() { return entry.documentSize(); }

/** Return an iterator over the terms in the current document.
* This iterator is not valid after calling nextDocument().
*/
@@ -42,6 +42,8 @@ public interface IndexJournalPointer extends Iterable<IndexJournalEntryTermData>
*/
int documentFeatures();

int documentSize();

/** Concatenate a number of journal pointers */
static IndexJournalPointer concatenate(IndexJournalPointer... pointers) {
if (pointers.length == 1)
@@ -94,6 +96,11 @@ class JoiningJournalPointer implements IndexJournalPointer {
return pointers[pIndex].documentFeatures();
}

@Override
public int documentSize() {
return pointers[pIndex].documentSize();
}

@NotNull
@Override
public Iterator<IndexJournalEntryTermData> iterator() {
@@ -146,6 +153,12 @@ class FilteringJournalPointer implements IndexJournalPointer {
return base.documentFeatures();
}

@Override
public int documentSize() {
return base.documentSize();
}

@NotNull
@Override
public Iterator<IndexJournalEntryTermData> iterator() {
@@ -2,7 +2,6 @@ package nu.marginalia.index.journal.writer;

import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.model.IndexJournalEntryTermData;

import java.io.IOException;
@@ -81,12 +81,6 @@ public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{
public int put(IndexJournalEntryHeader header,
IndexJournalEntryData data)
{
if (dataBuffer.capacity() - dataBuffer.position() < 3*8) {
dataBuffer.flip();
compressingStream.compress(dataBuffer);
dataBuffer.clear();
}

final long[] keywords = data.termIds();
final long[] metadata = data.metadata();
final var positions = data.positions();
@@ -94,16 +88,30 @@ public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{
int recordSize = 0; // document header size is 3 longs
for (int i = 0; i < keywords.length; i++) {
// term header size is 2 longs
recordSize += IndexJournalReader.TERM_HEADER_SIZE_BYTES + positions[i].size();
recordSize += IndexJournalReader.TERM_HEADER_SIZE_BYTES + positions[i].bufferSize();
}

dataBuffer.putInt(recordSize);
if (recordSize > Short.MAX_VALUE) {
// This should never happen, but if it does, we should log it and deal with it in a way that doesn't corrupt the file
// (32 KB is *a lot* of data for a single document, larger than the uncompressed HTML of most documents)
logger.error("Omitting entry: Record size {} exceeds maximum representable size of {}", recordSize, Short.MAX_VALUE);
return 0;
}

if (dataBuffer.capacity() - dataBuffer.position() < 3*8) {
dataBuffer.flip();
compressingStream.compress(dataBuffer);
dataBuffer.clear();
}

dataBuffer.putShort((short) recordSize);
dataBuffer.putShort((short) Math.clamp(0, header.documentSize(), Short.MAX_VALUE));
dataBuffer.putInt(header.documentFeatures());
dataBuffer.putLong(header.combinedId());
dataBuffer.putLong(header.documentMeta());

for (int i = 0; i < keywords.length; i++) {
int requiredSize = IndexJournalReader.TERM_HEADER_SIZE_BYTES + positions[i].size();
int requiredSize = IndexJournalReader.TERM_HEADER_SIZE_BYTES + positions[i].bufferSize();

if (dataBuffer.capacity() - dataBuffer.position() < requiredSize) {
dataBuffer.flip();
@@ -112,8 +120,8 @@ public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{
}

dataBuffer.putLong(keywords[i]);
dataBuffer.putLong(metadata[i]);
dataBuffer.put((byte) positions[i].size());
dataBuffer.putShort((short) metadata[i]);
dataBuffer.put((byte) positions[i].bufferSize());
dataBuffer.put(positions[i].buffer());
}
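The byte layout written above is what the new DOCUMENT_HEADER_SIZE_BYTES = 24 and TERM_HEADER_SIZE_BYTES = 11 constants describe: 2 + 2 + 4 + 8 + 8 bytes of document header, then 8 + 2 + 1 bytes of term header followed by the gamma-coded position buffer. A self-contained sketch with made-up values:

import java.nio.ByteBuffer;

public class JournalRecordLayoutDemo {
    public static void main(String[] args) {
        ByteBuffer buf = ByteBuffer.allocate(64);

        // document header: 24 bytes
        buf.putShort((short) 14);            // record size
        buf.putShort((short) 5000);          // document size
        buf.putInt(0xCAFE);                  // document features
        buf.putLong(123_456L);               // combined document id
        buf.putLong(42L);                    // document metadata

        // one term record: 11 byte header + position data
        byte[] positionData = {0x01, 0x02, 0x03};   // stand-in for a GammaCodedSequence buffer
        buf.putLong(0xABCDL);                // term id
        buf.putShort((short) 7);             // term metadata
        buf.put((byte) positionData.length); // size of the position buffer
        buf.put(positionData);

        System.out.println("bytes written: " + buf.position()); // 38
    }
}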
@@ -1,6 +1,8 @@
package nu.marginalia.index.journal;

import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.longs.LongList;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
@@ -8,6 +10,11 @@ import nu.marginalia.index.journal.model.IndexJournalEntryTermData;
import nu.marginalia.index.journal.reader.IndexJournalReaderPagingImpl;
import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.sequence.GammaCodedSequence;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions;
@@ -18,8 +25,9 @@ import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.List;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

import static org.junit.jupiter.api.Assertions.*;
@@ -52,7 +60,7 @@ public class IndexJournalWriterTest {
public void testSingleFile() {
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
// Write two documents with two terms each
writer.put(new IndexJournalEntryHeader(11, 22, 33),
writer.put(new IndexJournalEntryHeader(11, 22, 10, 33),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{44, 55},
@@ -61,7 +69,7 @@ public class IndexJournalWriterTest {
gcs(2, 4, 6),
})
);
writer.put(new IndexJournalEntryHeader(12, 23, 34),
writer.put(new IndexJournalEntryHeader(12, 23, 11, 34),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{45, 56},
@@ -90,6 +98,7 @@ public class IndexJournalWriterTest {
assertEquals(11, ptr.documentId());
assertEquals(22, ptr.documentFeatures());
assertEquals(33, ptr.documentMeta());
assertEquals(10, ptr.documentSize());

iter = ptr.iterator();

@@ -116,6 +125,7 @@ public class IndexJournalWriterTest {
assertEquals(12, ptr.documentId());
assertEquals(23, ptr.documentFeatures());
assertEquals(34, ptr.documentMeta());
assertEquals(11, ptr.documentSize());

iter = ptr.iterator();
// Term 1
@@ -147,7 +157,7 @@ public class IndexJournalWriterTest {
@Test
public void testMultiFile() {
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
writer.put(new IndexJournalEntryHeader(11, 22, 33),
writer.put(new IndexJournalEntryHeader(11, 22, 10, 33),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{44, 55},
@@ -162,7 +172,7 @@ public class IndexJournalWriterTest {
}

try (var writer = new IndexJournalWriterSingleFileImpl(tempFile2)) {
writer.put(new IndexJournalEntryHeader(12, 23, 34),
writer.put(new IndexJournalEntryHeader(12, 23, 11, 34),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{45, 56},
@@ -191,6 +201,7 @@ public class IndexJournalWriterTest {
assertEquals(11, ptr.documentId());
assertEquals(22, ptr.documentFeatures());
assertEquals(33, ptr.documentMeta());
assertEquals(10, ptr.documentSize());

iter = ptr.iterator();

@@ -217,6 +228,7 @@ public class IndexJournalWriterTest {
assertEquals(12, ptr.documentId());
assertEquals(23, ptr.documentFeatures());
assertEquals(34, ptr.documentMeta());
assertEquals(11, ptr.documentSize());

iter = ptr.iterator();
// Term 1
@@ -249,7 +261,7 @@ public class IndexJournalWriterTest {
public void testSingleFileIterTwice() {
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
// Write two documents with two terms each
writer.put(new IndexJournalEntryHeader(11, 22, 33),
writer.put(new IndexJournalEntryHeader(11, 22, 10, 33),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{44, 55},
@@ -277,6 +289,7 @@ public class IndexJournalWriterTest {
assertTrue(ptr.nextDocument());
assertEquals(11, ptr.documentId());
assertEquals(22, ptr.documentFeatures());
assertEquals(10, ptr.documentSize());
assertEquals(33, ptr.documentMeta());

iter = ptr.iterator();
@@ -307,7 +320,7 @@ public class IndexJournalWriterTest {
public void testFiltered() {
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
// Write two documents with two terms each
writer.put(new IndexJournalEntryHeader(11, 22, 33),
writer.put(new IndexJournalEntryHeader(11, 22, 10, 33),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{44, 55},
@@ -316,7 +329,7 @@ public class IndexJournalWriterTest {
gcs(2, 4, 6),
})
);
writer.put(new IndexJournalEntryHeader(12, 23, 34),
writer.put(new IndexJournalEntryHeader(12, 23, 11, 34),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{45, 56},
@@ -344,6 +357,7 @@ public class IndexJournalWriterTest {
assertEquals(12, ptr.documentId());
assertEquals(23, ptr.documentFeatures());
assertEquals(34, ptr.documentMeta());
assertEquals(11, ptr.documentSize());

iter = ptr.iterator();
// Term 1
@@ -364,4 +378,72 @@ public class IndexJournalWriterTest {
}
}

@Test
public void testIntegrationScenario() throws IOException {
Map<Long, Integer> wordMap = new HashMap<>();
for (int i = 0; i < 512; i++) {
wordMap.put(hasher.hashKeyword(Integer.toString(i)), i);
}
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
for (int idc = 1; idc < 512; idc++) {
int id = idc;
int[] factors = IntStream
.rangeClosed(1, id)
.filter(v -> (id % v) == 0)
.toArray();

System.out.println("id:" + id + " factors: " + Arrays.toString(factors));

long fullId = UrlIdCodec.encodeId((32 - (id % 32)), id);

var header = new IndexJournalEntryHeader(factors.length, 0, 100, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());

String[] keywords = IntStream.of(factors).mapToObj(Integer::toString).toArray(String[]::new);
long[] metadata = new long[factors.length];
for (int i = 0; i < factors.length; i++) {
metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
}
GammaCodedSequence[] positions = new GammaCodedSequence[factors.length];
ByteBuffer wa = ByteBuffer.allocate(16);
for (int i = 0; i < factors.length; i++) {
positions[i] = GammaCodedSequence.generate(wa, i + 1);
}

writer.put(header, new IndexJournalEntryData(keywords, metadata, positions));
}
}

try (var ptr = new IndexJournalReaderSingleFile(tempFile).newPointer()) {
while (ptr.nextDocument()) {
int ordinal = UrlIdCodec.getDocumentOrdinal(ptr.documentId());
System.out.println(ordinal);

var expectedFactors =
new LongArrayList(IntStream
.rangeClosed(1, ordinal)
.filter(v -> (ordinal % v) == 0)
.mapToObj(Integer::toString)
.mapToLong(hasher::hashKeyword)
.toArray());

LongList foundIds = new LongArrayList();

var iter = ptr.iterator();
while (iter.hasNext()) {
var termData = iter.next();
foundIds.add(termData.termId());
}

if (!expectedFactors.equals(foundIds)) {
System.out.println("Found: ");
System.out.println(foundIds.stream().map(fac -> wordMap.getOrDefault(fac, -1)).map(Objects::toString).collect(Collectors.joining(",")));
System.out.println("Expected: ");
System.out.println(expectedFactors.stream().map(fac -> wordMap.getOrDefault(fac, -1)).map(Objects::toString).collect(Collectors.joining(",")));
fail();
}
assertEquals(expectedFactors, foundIds);
}
}
}

}
@@ -3,6 +3,8 @@ package nu.marginalia.index;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.index.positions.TermData;
import nu.marginalia.index.positions.PositionsFileReader;
import nu.marginalia.index.query.EmptyEntrySource;
import nu.marginalia.index.query.EntrySource;
import nu.marginalia.index.query.ReverseIndexRejectFilter;
@@ -14,9 +16,9 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.concurrent.Executors;

public class ReverseIndexReader {
@@ -27,9 +29,16 @@ public class ReverseIndexReader {
private final BTreeReader wordsBTreeReader;
private final String name;

public ReverseIndexReader(String name, Path words, Path documents) throws IOException {
private final PositionsFileReader positionsFileReader;

public ReverseIndexReader(String name,
Path words,
Path documents,
PositionsFileReader positionsFileReader) throws IOException {
this.name = name;

this.positionsFileReader = positionsFileReader;

if (!Files.exists(words) || !Files.exists(documents)) {
this.words = null;
this.documents = null;
@@ -133,31 +142,29 @@ public class ReverseIndexReader {
offset);
}

public long[] getTermMeta(long termId, long[] docIds) {
public TermData[] getTermData(Arena arena,
long termId,
long[] docIds)
{
var ret = new TermData[docIds.length];

long offset = wordOffset(termId);

if (offset < 0) {
// This is likely a bug in the code, but we can't throw an exception here
logger.debug("Missing offset for word {}", termId);
return new long[docIds.length];
return ret;
}

assert isUniqueAndSorted(docIds) : "The input array docIds is assumed to be unique and sorted, was " + Arrays.toString(docIds);

var reader = createReaderNew(offset);
return reader.queryData(docIds, 1);
}

private boolean isUniqueAndSorted(long[] ids) {
if (ids.length == 0)
return true;
// Read the size and offset of the position data
var offsets = reader.queryData(docIds, 1);

for (int i = 1; i < ids.length; i++) {
if(ids[i] <= ids[i-1])
return false;
for (int i = 0; i < docIds.length; i++) {
ret[i] = positionsFileReader.getTermData(arena, offsets[i]);
}

return true;
return ret;
}

public void close() {
@@ -166,5 +173,14 @@ public class ReverseIndexReader {

if (words != null)
words.close();

if (positionsFileReader != null) {
try {
positionsFileReader.close();
} catch (IOException e) {
logger.error("Failed to close positions file reader", e);
}
}
}

}
@@ -1,5 +1,6 @@
package nu.marginalia.index.construction;

import nu.marginalia.index.positions.PositionCodec;
import nu.marginalia.sequence.GammaCodedSequence;

import java.io.IOException;
@@ -38,7 +39,7 @@ public class PositionsFileConstructor implements AutoCloseable {
/** Add a term to the positions file
* @param termMeta the term metadata
* @param positions the positions of the term
* @return the offset of the term in the file
* @return the offset of the term in the file, with the size of the data in the highest byte
*/
public long add(byte termMeta, GammaCodedSequence positions) throws IOException {
synchronized (file) {
@@ -53,12 +54,20 @@ public class PositionsFileConstructor implements AutoCloseable {
workBuffer.put(termMeta);
workBuffer.put(positionBuffer);

long ret = PositionCodec.encode(size, offset);

offset += size;
return offset;

return ret;
}
}

public void close() throws IOException {
while (workBuffer.position() < workBuffer.limit()) {
workBuffer.flip();
channel.write(workBuffer);
}

channel.force(false);
channel.close();
}
@@ -7,7 +7,6 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.atomic.AtomicInteger;
@@ -21,12 +21,14 @@ import java.util.concurrent.TimeUnit;
* the associated ReversePreindexWordSegments data
*/
public class ReversePreindexDocuments {
public final LongArray documents;

private static PositionsFileConstructor positionsFileConstructor;
final Path file;
public final LongArray documents;
private static final int RECORD_SIZE_LONGS = 2;
private static final Logger logger = LoggerFactory.getLogger(ReversePreindexDocuments.class);

public final Path file;

public ReversePreindexDocuments(LongArray documents, Path file) {
this.documents = documents;
this.file = file;
@@ -70,22 +72,25 @@ public class ReversePreindexDocuments {

long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize();

try (RandomFileAssembler assembly = RandomFileAssembler.create(workDir, fileSizeLongs)) {
try (var assembly = RandomFileAssembler.create(workDir, fileSizeLongs);
var pointer = reader.newPointer())
{

var offsetMap = segments.asMap(RECORD_SIZE_LONGS);
offsetMap.defaultReturnValue(0);

var pointer = reader.newPointer();
while (pointer.nextDocument()) {
long rankEncodedId = docIdRewriter.rewriteDocId(pointer.documentId());
for (var termData : pointer) {
long termId = termData.termId();

long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS);
long posOffset = positionsFileConstructor.add((byte) termData.metadata(), termData.positions());

// write position data to the positions file and get the offset
long encodedPosOffset = positionsFileConstructor.add((byte) termData.metadata(), termData.positions());

assembly.put(offset + 0, rankEncodedId);
assembly.put(offset + 1, posOffset);
assembly.put(offset + 1, encodedPosOffset);
}
}
@@ -0,0 +1,25 @@
package nu.marginalia.index.positions;

/** A utility class for encoding and decoding position data offsets,
* the data is encoded by using the highest 16 bits to store the size,
* and the remaining 48 bits to store the offset of the data.
* <p></p>
* This lets us address 256 TB of data, with up to 64 KB of position data for each term,
* which is ample headroom for both the size of the data and the number of positions.
* */
public class PositionCodec {

public static long encode(int length, long offset) {
assert decodeSize(offset) == 0 : "Offset must be less than 2^48";

return (long) length << 48 | offset;
}

public static int decodeSize(long sizeEncodedOffset) {
return (int) ((sizeEncodedOffset & 0xFFFF_0000_0000_0000L) >>> 48);
}
public static long decodeOffset(long sizeEncodedOffset) {
return sizeEncodedOffset & 0x0000_FFFF_FFFF_FFFFL;
}

}
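A self-contained round trip of the encoding above (size in the high 16 bits, offset in the low 48 bits); the numbers are arbitrary:

public class PositionCodecDemo {
    public static void main(String[] args) {
        int size = 17;                     // bytes of position data
        long offset = 1_234_567_890_123L;  // byte offset into the positions file, must be < 2^48

        long encoded = (long) size << 48 | offset;

        int decodedSize = (int) ((encoded & 0xFFFF_0000_0000_0000L) >>> 48);
        long decodedOffset = encoded & 0x0000_FFFF_FFFF_FFFFL;

        System.out.println(decodedSize + " @ " + decodedOffset); // 17 @ 1234567890123
    }
}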
@@ -0,0 +1,39 @@
package nu.marginalia.index.positions;

import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;

public class PositionsFileReader implements AutoCloseable {
private final FileChannel positions;

public PositionsFileReader(Path positionsFile) throws IOException {
this.positions = FileChannel.open(positionsFile, StandardOpenOption.READ);
}

/** Get the positions for a term in the index, as pointed out by the encoded offset;
* intermediate buffers are allocated from the provided arena allocator. */
public TermData getTermData(Arena arena, long sizeEncodedOffset) {
int length = PositionCodec.decodeSize(sizeEncodedOffset);
long offset = PositionCodec.decodeOffset(sizeEncodedOffset);

var segment = arena.allocate(length);
var buffer = segment.asByteBuffer();

try {
positions.read(buffer, offset);
} catch (IOException e) {
throw new RuntimeException(e);
}

return new TermData(buffer);
}

@Override
public void close() throws IOException {
positions.close();
}

}
@@ -0,0 +1,21 @@
package nu.marginalia.index.positions;

import nu.marginalia.sequence.GammaCodedSequence;

import java.nio.ByteBuffer;

public class TermData {
private final ByteBuffer buffer;

public TermData(ByteBuffer buffer) {
this.buffer = buffer;
}

public byte flags() {
return buffer.get(0);
}

public GammaCodedSequence positions() {
return new GammaCodedSequence(buffer, 1, buffer.capacity());
}
}
@@ -0,0 +1,63 @@
package nu.marginalia.index;

import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.index.positions.TermData;
import nu.marginalia.index.positions.PositionsFileReader;
import nu.marginalia.sequence.GammaCodedSequence;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;

import static org.junit.jupiter.api.Assertions.*;

class PositionsFileReaderTest {

Path file;

@BeforeEach
void setUp() throws IOException {
file = Files.createTempFile("positions", "dat");
}
@AfterEach
void tearDown() throws IOException {
Files.delete(file);
}

@Test
void getTermData() throws IOException {
ByteBuffer workArea = ByteBuffer.allocate(8192);
long key1, key2, key3;
try (PositionsFileConstructor constructor = new PositionsFileConstructor(file)) {
key1 = constructor.add((byte) 43, GammaCodedSequence.generate(workArea, 1, 2, 3));
key2 = constructor.add((byte) 51, GammaCodedSequence.generate(workArea, 2, 3, 5, 1000, 5000, 20241));
key3 = constructor.add((byte) 61, GammaCodedSequence.generate(workArea, 3, 5, 7));
}

System.out.println("key1: " + Long.toHexString(key1));
System.out.println("key2: " + Long.toHexString(key2));
System.out.println("key3: " + Long.toHexString(key3));

try (Arena arena = Arena.ofConfined();
PositionsFileReader reader = new PositionsFileReader(file))
{
TermData data1 = reader.getTermData(arena, key1);
assertEquals(43, data1.flags());
assertEquals(IntList.of(1, 2, 3), data1.positions().values());

TermData data2 = reader.getTermData(arena, key2);
assertEquals(51, data2.flags());
assertEquals(IntList.of(2, 3, 5, 1000, 5000, 20241), data2.positions().values());

TermData data3 = reader.getTermData(arena, key3);
assertEquals(61, data3.flags());
assertEquals(IntList.of(3, 5, 7), data3.positions().values());
}
}
}
@@ -1,17 +1,19 @@
package nu.marginalia.index;

import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.index.construction.ReversePreindex;
import nu.marginalia.index.construction.TestJournalFactory;
import nu.marginalia.index.construction.TestJournalFactory.EntryDataWithWordMeta;
import nu.marginalia.index.positions.PositionsFileReader;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;

import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
@@ -47,13 +49,18 @@ class ReverseIndexReaderTest {
public void testSimple() throws IOException {

var indexReader = createIndex(
new EntryDataWithWordMeta(100, 101, wm(50, 51))
new EntryDataWithWordMeta(100, 101, wm(50, 51, 1, 3, 5))
);

assertEquals(1, indexReader.numDocuments(50));

long[] meta = indexReader.getTermMeta(50, new long[] { 100 });
assertArrayEquals(new long[] { 51 }, meta);
var positions = indexReader.getTermData(Arena.global(), 50, new long[] { 100 });

assertEquals(1, positions.length);
assertNotNull(positions[0]);
assertEquals((byte) 51, positions[0].flags());
assertEquals(IntList.of(1, 3, 5), positions[0].positions().values());

assertArrayEquals(new long[] { 100 }, readEntries(indexReader, 50));
}

@@ -69,13 +76,8 @@ class ReverseIndexReaderTest {
assertEquals(2, indexReader.numDocuments(51));
assertEquals(1, indexReader.numDocuments(52));

assertArrayEquals(new long[] { 51 }, indexReader.getTermMeta(50, new long[] { 100 }));
assertArrayEquals(new long[] { 100 }, readEntries(indexReader, 50));

assertArrayEquals(new long[] { 52, 53 }, indexReader.getTermMeta(51, new long[] { 100, 101 }));
assertArrayEquals(new long[] { 100, 101 }, readEntries(indexReader, 51));

assertArrayEquals(new long[] { 54 }, indexReader.getTermMeta(52, new long[] { 101 }));
assertArrayEquals(new long[] { 101 }, readEntries(indexReader, 52));

}
@@ -91,18 +93,20 @@ class ReverseIndexReaderTest {

private ReverseIndexReader createIndex(EntryDataWithWordMeta... scenario) throws IOException {
var reader = journalFactory.createReader(scenario);
var preindex = ReversePreindex.constructPreindex(reader,
Mockito.mock(PositionsFileConstructor.class),
DocIdRewriter.identity(), tempDir);

Path posFile = tempDir.resolve("positions.dat");
Path docsFile = tempDir.resolve("docs.dat");
Path wordsFile = tempDir.resolve("words.dat");

preindex.finalizeIndex(docsFile, wordsFile);
preindex.delete();
try (var positionsFileConstructor = new PositionsFileConstructor(posFile)) {
var preindex = ReversePreindex.constructPreindex(reader,
positionsFileConstructor,
DocIdRewriter.identity(), tempDir);
preindex.finalizeIndex(docsFile, wordsFile);
preindex.delete();
}

return new ReverseIndexReader("test", wordsFile, docsFile);
return new ReverseIndexReader("test", wordsFile, docsFile, new PositionsFileReader(posFile));

}
}
@@ -155,15 +155,15 @@ class ReversePreindexDocsTest {
if (wordId != that.wordId) return false;
if (start != that.start) return false;
if (end != that.end) return false;
return Arrays.equals(data, that.data);
return data[0] == that.data[0]; //Arrays.equals(data, that.data);
}

@Override
public int hashCode() {
int result = (int) (wordId ^ (wordId >>> 32));
result = 31 * result + (int) (start ^ (start >>> 32));
result = 31 * result + (int) (end ^ (end >>> 32));
result = 31 * result + Arrays.hashCode(data);
int result = Long.hashCode(wordId);
result = 31 * result + Long.hashCode(start);
result = 31 * result + Long.hashCode(end);
result = 31 * result + Long.hashCode(data[0]);
return result;
}
@@ -79,9 +79,7 @@ class ReversePreindexFinalizeTest {
assertEquals(1, wordsHeader.numEntries());

assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0));
assertEquals(51, docsArray.get(docsHeader.dataOffsetLongs() + 1));
assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs()));
assertEquals(0, wordsArray.get(wordsHeader.dataOffsetLongs() + 1));
}

@@ -122,9 +120,7 @@ class ReversePreindexFinalizeTest {
long offset2 = wordsArray.get(wordsHeader.dataOffsetLongs() + 3);

assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs()));
assertEquals(0, wordsArray.get(wordsHeader.dataOffsetLongs() + 1));
assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs()));
assertEquals(0, wordsArray.get(wordsHeader.dataOffsetLongs() + 1));

BTreeHeader docsHeader;

@@ -133,13 +129,11 @@ class ReversePreindexFinalizeTest {
assertEquals(1, docsHeader.numEntries());

assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0));
assertEquals(51, docsArray.get(docsHeader.dataOffsetLongs() + 1));

docsHeader = new BTreeHeader(docsArray, offset2);
System.out.println(docsHeader);
assertEquals(1, docsHeader.numEntries());

assertEquals(101, docsArray.get(docsHeader.dataOffsetLongs() + 0));
assertEquals(52, docsArray.get(docsHeader.dataOffsetLongs() + 1));
}
}
@@ -8,11 +8,13 @@ import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
import nu.marginalia.sequence.GammaCodedSequence;

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;

public class TestJournalFactory {
Path tempDir = Files.createTempDirectory("journal");
@@ -50,10 +52,10 @@ public class TestJournalFactory {
'}';
}
}
public record WordWithMeta(long wordId, long meta) {}
public record WordWithMeta(long wordId, long meta, GammaCodedSequence gcs) {}

public static WordWithMeta wm(long wordId, long meta) {
return new WordWithMeta(wordId, meta);
public static WordWithMeta wm(long wordId, long meta, int... positions) {
return new WordWithMeta(wordId, meta, GammaCodedSequence.generate(ByteBuffer.allocate(128), positions));
}

IndexJournalReader createReader(EntryData... entries) throws IOException {
@@ -71,7 +73,7 @@ public class TestJournalFactory {
positions[i] = new GammaCodedSequence(new byte[1]);
}

writer.put(new IndexJournalEntryHeader(entries.length, 0, entry.docId, entry.docMeta),
writer.put(new IndexJournalEntryHeader(entries.length, 0, 15, entry.docId, entry.docMeta),
new IndexJournalEntryData(termIds, meta, positions));
}
writer.close();
@@ -91,10 +93,10 @@ public class TestJournalFactory {
for (int i = 0; i < entry.wordIds.length; i++) {
termIds[i] = entry.wordIds[i].wordId;
meta[i] = entry.wordIds[i].meta;
positions[i] = new GammaCodedSequence(new byte[1]);
positions[i] = Objects.requireNonNullElseGet(entry.wordIds[i].gcs, () -> new GammaCodedSequence(new byte[1]));
}

writer.put(new IndexJournalEntryHeader(entries.length, 0, entry.docId, entry.docMeta),
writer.put(new IndexJournalEntryHeader(entries.length, 0, 15, entry.docId, entry.docMeta),
new IndexJournalEntryData(termIds, meta, positions));
}
writer.close();
@@ -4,11 +4,10 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.IndexLocations;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.positions.PositionsFileReader;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.forward.ForwardIndexReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.nio.file.Files;
@@ -40,17 +39,18 @@ public class IndexFactory {
}

public ReverseIndexReader getReverseIndexReader() throws IOException {

return new ReverseIndexReader("full",
ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.CURRENT),
ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.CURRENT)
ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.CURRENT),
new PositionsFileReader(ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.CURRENT))
);
}

public ReverseIndexReader getReverseIndexPrioReader() throws IOException {
return new ReverseIndexReader("prio",
ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.CURRENT),
ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.CURRENT)
ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.CURRENT),
null
);
}
@@ -281,10 +281,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
awaitCompletion();

// Return the best results
return new SearchResultSet(
resultValuator.selectBestResults(parameters,
resultRankingContext,
resultHeap));
return new SearchResultSet(resultValuator.selectBestResults(parameters, resultHeap));
}

/** Wait for all tasks to complete */
@@ -14,12 +14,13 @@ import nu.marginalia.index.query.IndexQueryBuilder;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.results.model.ids.DocMetadataList;
import nu.marginalia.index.results.model.ids.TermMetadataList;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.lang.foreign.Arena;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Collections;
@@ -169,8 +170,11 @@ public class CombinedIndexReader {
}

/** Retrieves the term metadata for the specified word for the provided documents */
public DocMetadataList getMetadata(long wordId, CombinedDocIdList docIds) {
return new DocMetadataList(reverseIndexFullReader.getTermMeta(wordId, docIds.array()));
public TermMetadataList getTermMetadata(Arena arena,
long wordId,
CombinedDocIdList docIds)
{
return new TermMetadataList(reverseIndexFullReader.getTermData(arena, wordId, docIds.array()));
}

/** Retrieves the document metadata for the specified document */
@@ -186,8 +190,12 @@ public class CombinedIndexReader {
/** Retrieves the HTML features for the specified document */
public int getHtmlFeatures(long docId) {
return forwardIndexReader.getHtmlFeatures(docId);
}
/** Retrieves the document size for the specified document */
public int getDocumentSize(long docId) {
return forwardIndexReader.getDocumentSize(docId);
}

/** Close the indexes (this is not done immediately)
* */
public void close() throws InterruptedException {
@@ -10,12 +10,13 @@ import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchTermsUtil;
import nu.marginalia.index.results.model.QuerySearchTerms;
import nu.marginalia.index.results.model.TermCoherenceGroupList;
import nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.results.model.ids.TermMetadataList;
import nu.marginalia.index.results.model.ids.TermIdList;

import java.lang.foreign.Arena;

import static nu.marginalia.index.results.model.TermCoherenceGroupList.TermCoherenceGroup;
import static nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds.DocumentsWithMetadata;

public class IndexMetadataService {
private final StatefulIndex statefulIndex;
@@ -25,22 +26,19 @@ public class IndexMetadataService {
this.statefulIndex = index;
}

public TermMetadataForCombinedDocumentIds getTermMetadataForDocuments(CombinedDocIdList combinedIdsAll,
TermIdList termIdsList)
public Long2ObjectArrayMap<TermMetadataList>
getTermMetadataForDocuments(Arena arena, CombinedDocIdList combinedIdsAll, TermIdList termIdsList)
{
var currentIndex = statefulIndex.get();

Long2ObjectArrayMap<DocumentsWithMetadata> termdocToMeta =
Long2ObjectArrayMap<TermMetadataList> termdocToMeta =
new Long2ObjectArrayMap<>(termIdsList.size());

for (long termId : termIdsList.array()) {
var metadata = currentIndex.getMetadata(termId, combinedIdsAll);

termdocToMeta.put(termId,
new DocumentsWithMetadata(combinedIdsAll, metadata));
termdocToMeta.put(termId, currentIndex.getTermMetadata(arena, termId, combinedIdsAll));
}

return new TermMetadataForCombinedDocumentIds(termdocToMeta);
return termdocToMeta;
}

public QuerySearchTerms getSearchTerms(CompiledQuery<String> compiledQuery, SearchQuery searchQuery) {
@ -1,25 +1,22 @@
|
||||
package nu.marginalia.index.results;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.compiled.*;
|
||||
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
|
||||
import nu.marginalia.index.index.CombinedIndexReader;
|
||||
import nu.marginalia.index.index.StatefulIndex;
|
||||
import nu.marginalia.index.model.SearchParameters;
|
||||
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
||||
import nu.marginalia.index.model.QueryParams;
|
||||
import nu.marginalia.index.results.model.QuerySearchTerms;
|
||||
import nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
import nu.marginalia.model.idx.WordMetadata;
|
||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||
import nu.marginalia.ranking.results.ResultValuator;
|
||||
import nu.marginalia.sequence.GammaCodedSequence;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.util.List;
|
||||
|
||||
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.*;
|
||||
|
||||
/** This class is responsible for calculating the score of a search result.
|
||||
* It holds the data required to perform the scoring, as there is strong
|
||||
@ -28,94 +25,74 @@ public class IndexResultValuationContext {
|
||||
private final CombinedIndexReader index;
|
||||
private final QueryParams queryParams;
|
||||
|
||||
private final TermMetadataForCombinedDocumentIds termMetadataForCombinedDocumentIds;
|
||||
private final QuerySearchTerms searchTerms;
|
||||
|
||||
private final ResultRankingContext rankingContext;
|
||||
private final ResultValuator searchResultValuator;
|
||||
private final CompiledQuery<String> compiledQuery;
|
||||
private final CompiledQueryLong compiledQueryIds;
|
||||
|
||||
public IndexResultValuationContext(IndexMetadataService metadataService,
|
||||
ResultValuator searchResultValuator,
|
||||
CombinedDocIdList ids,
|
||||
public IndexResultValuationContext(ResultValuator searchResultValuator,
|
||||
StatefulIndex statefulIndex,
|
||||
ResultRankingContext rankingContext,
|
||||
SearchParameters params
|
||||
) {
|
||||
SearchParameters params)
|
||||
{
|
||||
this.index = statefulIndex.get();
|
||||
this.rankingContext = rankingContext;
|
||||
this.searchResultValuator = searchResultValuator;
|
||||
|
||||
this.queryParams = params.queryParams;
|
||||
this.compiledQuery = params.compiledQuery;
|
||||
this.compiledQueryIds = params.compiledQueryIds;
|
||||
|
||||
this.searchTerms = metadataService.getSearchTerms(params.compiledQuery, params.query);
|
||||
|
||||
this.termMetadataForCombinedDocumentIds = metadataService.getTermMetadataForDocuments(ids,
|
||||
searchTerms.termIdsAll);
|
||||
}
|
||||
|
||||
private final long flagsFilterMask =
|
||||
WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit();
|
||||
private final long flagsFilterMask = WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit();
|
||||
|
||||
@Nullable
|
||||
public SearchResultItem calculatePreliminaryScore(long combinedId) {
|
||||
public SearchResultItem calculatePreliminaryScore(long combinedId,
|
||||
QuerySearchTerms searchTerms,
|
||||
long[] wordFlags,
|
||||
GammaCodedSequence[] positions)
|
||||
{
|
||||
|
||||
|
||||
// FIXME: Reconsider coherence logic with the new position data
|
||||
// if (!searchTerms.coherences.test(termMetadataForCombinedDocumentIds, combinedId))
|
||||
// return null;
|
||||
|
||||
CompiledQuery<GammaCodedSequence> positionsQuery = compiledQuery.root.newQuery(positions);
|
||||
CompiledQueryLong wordFlagsQuery = compiledQuery.root.newQuery(wordFlags);
|
||||
int[] counts = new int[compiledQuery.size()];
|
||||
for (int i = 0; i < counts.length; i++) {
|
||||
if (positions[i] != null) {
|
||||
counts[i] = positions[i].valueCount();
|
||||
}
|
||||
}
|
||||
CompiledQueryInt positionsCountQuery = compiledQuery.root.newQuery(counts);
|
||||
|
||||
// If the document is not relevant to the query, abort early to reduce allocations and
|
||||
// avoid unnecessary calculations
|
||||
if (testRelevance(wordFlagsQuery, positionsCountQuery)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
|
||||
long docId = UrlIdCodec.removeRank(combinedId);
|
||||
|
||||
if (!searchTerms.coherences.test(termMetadataForCombinedDocumentIds, combinedId))
|
||||
return null;
|
||||
|
||||
long docMetadata = index.getDocumentMetadata(docId);
|
||||
int htmlFeatures = index.getHtmlFeatures(docId);
|
||||
|
||||
SearchResultItem searchResult = new SearchResultItem(docId,
|
||||
docMetadata,
|
||||
htmlFeatures,
|
||||
hasPrioTerm(combinedId));
|
||||
|
||||
long[] wordMetas = new long[compiledQuery.size()];
|
||||
SearchResultKeywordScore[] scores = new SearchResultKeywordScore[compiledQuery.size()];
|
||||
|
||||
for (int i = 0; i < wordMetas.length; i++) {
|
||||
final long termId = compiledQueryIds.at(i);
|
||||
final String term = compiledQuery.at(i);
|
||||
|
||||
wordMetas[i] = termMetadataForCombinedDocumentIds.getTermMetadata(termId, combinedId);
|
||||
scores[i] = new SearchResultKeywordScore(term, termId, wordMetas[i]);
|
||||
}
|
||||
|
||||
|
||||
// DANGER: IndexResultValuatorService assumes that searchResult.keywordScores has this specific order, as it needs
|
||||
// to be able to re-construct its own CompiledQuery<SearchResultKeywordScore> for re-ranking the results. This is
|
||||
// a very flimsy assumption.
|
||||
searchResult.keywordScores.addAll(List.of(scores));
|
||||
|
||||
CompiledQueryLong wordMetasQuery = new CompiledQueryLong(compiledQuery.root, new CqDataLong(wordMetas));
|
||||
|
||||
|
||||
boolean allSynthetic = CompiledQueryAggregates.booleanAggregate(wordMetasQuery, WordFlags.Synthetic::isPresent);
|
||||
int flagsCount = CompiledQueryAggregates.intMaxMinAggregate(wordMetasQuery, wordMeta -> Long.bitCount(wordMeta & flagsFilterMask));
|
||||
int positionsCount = CompiledQueryAggregates.intMaxMinAggregate(wordMetasQuery, wordMeta -> Long.bitCount(WordMetadata.decodePositions(wordMeta)));
|
||||
|
||||
if (!meetsQueryStrategyRequirements(wordMetasQuery, queryParams.queryStrategy())) {
|
||||
return null;
|
||||
}
|
||||
|
||||
if (flagsCount == 0 && !allSynthetic && positionsCount == 0)
|
||||
return null;
|
||||
int docSize = index.getDocumentSize(docId);
|
||||
|
||||
double score = searchResultValuator.calculateSearchResultValue(
|
||||
wordMetasQuery,
|
||||
wordFlagsQuery,
|
||||
positionsCountQuery,
|
||||
positionsQuery,
|
||||
docMetadata,
|
||||
htmlFeatures,
|
||||
5000, // use a dummy value here as it's not present in the index
|
||||
docSize,
|
||||
rankingContext,
|
||||
null);
|
||||
|
||||
if (searchResult.hasPrioTerm) {
|
||||
SearchResultItem searchResult = new SearchResultItem(docId,
|
||||
docMetadata,
|
||||
htmlFeatures);
|
||||
|
||||
if (hasPrioTerm(searchTerms, positions)) {
|
||||
score = 0.75 * score;
|
||||
}
|
||||
|
||||
@ -124,13 +101,32 @@ public class IndexResultValuationContext {
return searchResult;
}

private boolean hasPrioTerm(long combinedId) {
for (var term : searchTerms.termIdsPrio.array()) {
if (termMetadataForCombinedDocumentIds.hasTermMeta(term, combinedId)) {
private boolean testRelevance(CompiledQueryLong wordFlagsQuery, CompiledQueryInt countsQuery) {
boolean allSynthetic = booleanAggregate(wordFlagsQuery, WordFlags.Synthetic::isPresent);
int flagsCount = intMaxMinAggregate(wordFlagsQuery, flags -> Long.bitCount(flags & flagsFilterMask));
int positionsCount = intMaxMinAggregate(countsQuery, p -> p);

if (!meetsQueryStrategyRequirements(wordFlagsQuery, queryParams.queryStrategy())) {
return true;
}
if (flagsCount == 0 && !allSynthetic && positionsCount == 0) {
return true;
}

return false;
}
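// Editor's note (illustrative, not part of the patch): testRelevance() above leans on the
// "max-min" aggregate over the compiled query tree: for a query shaped like (a & b) | (c & d)
// it returns the best weakest-link value across the OR branches, so a document that scores
// zero on every branch can be rejected before any allocation-heavy ranking work.
// A minimal standalone sketch of that fold, with a nested int array standing in for the real
// CqExpression tree (the array shape is a made-up stand-in, not the actual API):
static int maxOfMins(int[][] orOfAndBranches) {
    int best = 0;
    for (int[] branch : orOfAndBranches) {
        int weakest = Integer.MAX_VALUE;
        for (int v : branch) weakest = Math.min(weakest, v);
        best = Math.max(best, weakest);
    }
    return best;
}
// maxOfMins(new int[][]{{3, 0}, {2, 5}}) == 2: the zero sinks the first branch,
// and the weakest term of the second branch carries the result.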
|
||||
private boolean hasPrioTerm(QuerySearchTerms searchTerms, GammaCodedSequence[] positions) {
var allTerms = searchTerms.termIdsAll;
var prioTerms = searchTerms.termIdsPrio;

for (int i = 0; i < allTerms.size(); i++) {
if (positions[i] != null && prioTerms.contains(allTerms.at(i))) {
return true;
}
}
return false;

return false;
}
|
||||
private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores,
|
||||
@ -142,7 +138,7 @@ public class IndexResultValuationContext {
|
||||
return true;
|
||||
}
|
||||
|
||||
return CompiledQueryAggregates.booleanAggregate(queryGraphScores,
|
||||
return booleanAggregate(queryGraphScores,
|
||||
docs -> meetsQueryStrategyRequirements(docs, queryParams.queryStrategy()));
|
||||
}
|
||||
|
||||
|
@ -7,8 +7,6 @@ import gnu.trove.list.array.TLongArrayList;
|
||||
import it.unimi.dsi.fastutil.longs.LongSet;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
|
||||
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
|
||||
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||
@ -21,12 +19,13 @@ import nu.marginalia.linkdb.docs.DocumentDbReader;
|
||||
import nu.marginalia.linkdb.model.DocdbUrlDetail;
|
||||
import nu.marginalia.model.idx.WordMetadata;
|
||||
import nu.marginalia.ranking.results.ResultValuator;
|
||||
import nu.marginalia.sequence.GammaCodedSequence;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.lang.foreign.Arena;
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
@Singleton
|
||||
public class IndexResultValuatorService {
|
||||
@ -53,35 +52,53 @@ public class IndexResultValuatorService {
|
||||
ResultRankingContext rankingContext,
|
||||
CombinedDocIdList resultIds)
|
||||
{
|
||||
final var evaluator = createValuationContext(params, rankingContext, resultIds);
|
||||
IndexResultValuationContext evaluator =
|
||||
new IndexResultValuationContext(resultValuator, statefulIndex, rankingContext, params);
|
||||
|
||||
List<SearchResultItem> results = new ArrayList<>(resultIds.size());
|
||||
|
||||
for (long id : resultIds.array()) {
|
||||
var score = evaluator.calculatePreliminaryScore(id);
|
||||
if (score != null) {
|
||||
results.add(score);
|
||||
try (var arena = Arena.ofConfined()) {
|
||||
// Batch-fetch the word metadata for the documents
|
||||
|
||||
var searchTerms = metadataService.getSearchTerms(params.compiledQuery, params.query);
|
||||
var termsForDocs = metadataService.getTermMetadataForDocuments(arena, resultIds, searchTerms.termIdsAll);
|
||||
|
||||
// Prepare data for the document. We do this outside of the calculation function to avoid
|
||||
// hash lookups in the inner loop, as it's very hot code and we don't want thrashing in there;
|
||||
// out here we can rely on implicit array ordering to match up the data.
|
||||
|
||||
var ra = resultIds.array();
|
||||
long[] flags = new long[searchTerms.termIdsAll.size()];
|
||||
GammaCodedSequence[] positions = new GammaCodedSequence[searchTerms.termIdsAll.size()];
|
||||
|
||||
for (int i = 0; i < ra.length; i++) {
|
||||
long id = ra[i];
|
||||
|
||||
// Prepare term-level data for the document
|
||||
for (int ti = 0; ti < flags.length; ti++) {
|
||||
long tid = searchTerms.termIdsAll.at(ti);
|
||||
var tfd = termsForDocs.get(tid);
|
||||
|
||||
assert tfd != null : "No term data for term " + ti;
|
||||
|
||||
flags[ti] = tfd.flag(i);
|
||||
positions[ti] = tfd.position(i);
|
||||
}
|
||||
|
||||
// Calculate the preliminary score
|
||||
|
||||
var score = evaluator.calculatePreliminaryScore(id, searchTerms, flags, positions);
|
||||
if (score != null) {
|
||||
results.add(score);
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
private IndexResultValuationContext createValuationContext(SearchParameters params,
|
||||
ResultRankingContext rankingContext,
|
||||
CombinedDocIdList resultIds)
|
||||
{
|
||||
return new IndexResultValuationContext(metadataService,
|
||||
resultValuator,
|
||||
resultIds,
|
||||
statefulIndex,
|
||||
rankingContext,
|
||||
params);
|
||||
}
|
||||
|
||||
|
||||
public List<DecoratedSearchResultItem> selectBestResults(SearchParameters params,
|
||||
ResultRankingContext rankingContext,
|
||||
Collection<SearchResultItem> results) throws SQLException {
|
||||
|
||||
var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain);
|
||||
@ -101,14 +118,13 @@ public class IndexResultValuatorService {
|
||||
item.resultsFromDomain = domainCountFilter.getCount(item);
|
||||
}
|
||||
|
||||
return decorateAndRerank(resultsList, params.compiledQuery, rankingContext);
|
||||
return decorateResults(resultsList, params.compiledQuery);
|
||||
}
|
||||
|
||||
/** Decorate the result items with additional information from the link database
|
||||
* and calculate an updated ranking with the additional information */
|
||||
public List<DecoratedSearchResultItem> decorateAndRerank(List<SearchResultItem> rawResults,
|
||||
CompiledQuery<String> compiledQuery,
|
||||
ResultRankingContext rankingContext)
|
||||
public List<DecoratedSearchResultItem> decorateResults(List<SearchResultItem> rawResults,
|
||||
CompiledQuery<String> compiledQuery)
|
||||
throws SQLException
|
||||
{
|
||||
TLongList idsList = new TLongArrayList(rawResults.size());
|
||||
@ -131,42 +147,18 @@ public class IndexResultValuatorService {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Reconstruct the CompiledQuery for re-valuation
//
// CAVEAT: This hinges on a very fragile assumption that IndexResultValuationContext puts them in the same
// order as the data for the CompiledQuery<String>.
|
||||
long[] wordMetas = new long[compiledQuery.size()];
|
||||
|
||||
for (int i = 0; i < compiledQuery.size(); i++) {
|
||||
var score = result.keywordScores.get(i);
|
||||
wordMetas[i] = score.encodedWordMetadata();
|
||||
}
|
||||
|
||||
CompiledQueryLong metaQuery = new CompiledQueryLong(compiledQuery.root, new CqDataLong(wordMetas));
|
||||
|
||||
resultItems.add(createCombinedItem(
|
||||
result,
|
||||
docData,
|
||||
metaQuery,
|
||||
rankingContext));
|
||||
docData));
|
||||
}
|
||||
return resultItems;
|
||||
}
|
||||
|
||||
private DecoratedSearchResultItem createCombinedItem(SearchResultItem result,
|
||||
DocdbUrlDetail docData,
|
||||
CompiledQueryLong wordMetas,
|
||||
ResultRankingContext rankingContext) {
|
||||
DocdbUrlDetail docData) {
|
||||
|
||||
ResultRankingDetailsExtractor detailsExtractor = new ResultRankingDetailsExtractor();
|
||||
Consumer<ResultRankingDetails> detailConsumer = rankingContext.params.exportDebugData ? detailsExtractor::set : null;
|
||||
|
||||
double score = resultValuator.calculateSearchResultValue(wordMetas,
|
||||
result.encodedDocMetadata,
|
||||
result.htmlFeatures,
|
||||
docData.wordsTotal(),
|
||||
rankingContext,
|
||||
detailConsumer);
|
||||
// Consumer<ResultRankingDetails> detailConsumer = rankingContext.params.exportDebugData ? detailsExtractor::set : null;
|
||||
|
||||
return new DecoratedSearchResultItem(
|
||||
result,
|
||||
@ -179,8 +171,8 @@ public class IndexResultValuatorService {
|
||||
docData.pubYear(),
|
||||
docData.dataHash(),
|
||||
docData.wordsTotal(),
|
||||
bestPositions(wordMetas),
|
||||
score,
|
||||
0L, //bestPositions(wordMetas),
|
||||
result.getScore(),
|
||||
detailsExtractor.get()
|
||||
);
|
||||
}
|
||||
|
@ -1,26 +1,38 @@
|
||||
package nu.marginalia.index.results.model;
|
||||
|
||||
import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
|
||||
import it.unimi.dsi.fastutil.longs.Long2ObjectArrayMap;
|
||||
import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap;
|
||||
import nu.marginalia.index.positions.TermData;
|
||||
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
||||
import nu.marginalia.index.results.model.ids.DocMetadataList;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import nu.marginalia.index.results.model.ids.TermMetadataList;
|
||||
import nu.marginalia.sequence.GammaCodedSequence;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
|
||||
public class TermMetadataForCombinedDocumentIds {
|
||||
private static final Logger logger = LoggerFactory.getLogger(TermMetadataForCombinedDocumentIds.class);
|
||||
private final Long2ObjectArrayMap<DocumentsWithMetadata> termdocToMeta;
|
||||
|
||||
public TermMetadataForCombinedDocumentIds(Long2ObjectArrayMap<DocumentsWithMetadata> termdocToMeta) {
|
||||
this.termdocToMeta = termdocToMeta;
|
||||
}
|
||||
|
||||
public long getTermMetadata(long termId, long combinedId) {
|
||||
public byte getTermMetadata(long termId, long combinedId) {
|
||||
var metaByCombinedId = termdocToMeta.get(termId);
|
||||
if (metaByCombinedId == null) {
|
||||
return 0;
|
||||
}
|
||||
return metaByCombinedId.get(combinedId);
|
||||
return metaByCombinedId.get(combinedId).flags();
|
||||
}
|
||||
|
||||
@Nullable
|
||||
public GammaCodedSequence getPositions(long termId, long combinedId) {
|
||||
var metaByCombinedId = termdocToMeta.get(termId);
|
||||
|
||||
if (metaByCombinedId == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return metaByCombinedId.get(combinedId).positions();
|
||||
}
|
||||
|
||||
public boolean hasTermMeta(long termId, long combinedId) {
|
||||
@ -30,16 +42,25 @@ public class TermMetadataForCombinedDocumentIds {
|
||||
return false;
|
||||
}
|
||||
|
||||
return metaByCombinedId.get(combinedId) != 0;
|
||||
return metaByCombinedId.data().containsKey(combinedId);
|
||||
}
|
||||
|
public record DocumentsWithMetadata(Long2LongOpenHashMap data) {
public DocumentsWithMetadata(CombinedDocIdList combinedDocIdsAll, DocMetadataList metadata) {
this(new Long2LongOpenHashMap(combinedDocIdsAll.array(), metadata.array()));
public record DocumentsWithMetadata(Long2ObjectOpenHashMap<TermData> data) {
public DocumentsWithMetadata(CombinedDocIdList combinedDocIdsAll, TermMetadataList metadata) {
this(new Long2ObjectOpenHashMap<>(combinedDocIdsAll.size()));

long[] ids = combinedDocIdsAll.array();
TermData[] data = metadata.array();

for (int i = 0; i < combinedDocIdsAll.size(); i++) {
if (data[i] != null) {
this.data.put(ids[i], data[i]);
}
}
}

public long get(long combinedId) {
return data.getOrDefault(combinedId, 0);
public TermData get(long combinedId) {
return data.get(combinedId);
}
}
}
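// Editor's note (illustrative): each DocumentsWithMetadata now maps combinedId -> TermData, so
// both the word flags and the position sequence for a (term, document) pair come out of a single
// hash lookup. A hedged usage sketch (names as in the surrounding class):
//   TermData td = termdocToMeta.get(termId).get(combinedId);
//   if (td != null) { long flags = td.flags(); GammaCodedSequence positions = td.positions(); }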
|
@ -15,6 +15,10 @@ import java.util.stream.LongStream;
|
||||
public final class CombinedDocIdList {
|
||||
private final long[] data;
|
||||
|
||||
public CombinedDocIdList(long... data) {
|
||||
this.data = Arrays.copyOf(data, data.length);
|
||||
}
|
||||
|
||||
public CombinedDocIdList(LongArrayList data) {
|
||||
this.data = data.toLongArray();
|
||||
}
|
||||
|
@ -1,45 +0,0 @@
|
||||
package nu.marginalia.index.results.model.ids;
|
||||
|
||||
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.Objects;
|
||||
import java.util.stream.LongStream;
|
||||
|
||||
public final class DocMetadataList {
|
||||
private final long[] array;
|
||||
|
||||
public DocMetadataList(long[] array) {
|
||||
this.array = array;
|
||||
}
|
||||
|
||||
public DocMetadataList(LongArrayList list) {
|
||||
this(list.toLongArray());
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return array.length;
|
||||
}
|
||||
|
||||
public LongStream stream() {
|
||||
return LongStream.of(array);
|
||||
}
|
||||
|
||||
public long[] array() {
|
||||
return array;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (obj == this) return true;
|
||||
if (obj == null || obj.getClass() != this.getClass()) return false;
|
||||
var that = (DocMetadataList) obj;
|
||||
return Arrays.equals(this.array, that.array);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Arrays.hashCode(array);
|
||||
}
|
||||
|
||||
}
|
@ -11,6 +11,7 @@ public final class TermIdList {
|
||||
|
||||
public TermIdList(long[] array) {
|
||||
this.array = array;
|
||||
Arrays.sort(this.array);
|
||||
}
|
||||
|
||||
public TermIdList(LongArrayList list) {
|
||||
@ -29,6 +30,15 @@ public final class TermIdList {
|
||||
return array;
|
||||
}
|
||||
|
||||
public long at(int i) {
return array[i];
}

public boolean contains(long id) {
// Implicitly sorted
return Arrays.binarySearch(array, id) >= 0;
}
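// Editor's note (illustrative): contains() above only works because the constructor now sorts
// the backing array; Arrays.binarySearch has undefined results on unsorted input. A small
// demonstration of the invariant, with made-up term ids:
//   long[] ids = { 42L, 7L, 99L };
//   Arrays.sort(ids);                              // {7, 42, 99}
//   assert Arrays.binarySearch(ids, 42L) >= 0;     // present
//   assert Arrays.binarySearch(ids, 13L) < 0;      // absent: negative insertion point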
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (obj == this) return true;
|
||||
|
@ -0,0 +1,55 @@
|
||||
package nu.marginalia.index.results.model.ids;
|
||||
|
||||
import nu.marginalia.index.positions.TermData;
|
||||
import nu.marginalia.sequence.GammaCodedSequence;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
import java.util.Arrays;
|
||||
|
||||
public final class TermMetadataList {
|
||||
private final TermData[] array;
|
||||
|
||||
public TermMetadataList(TermData[] array) {
|
||||
this.array = array;
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return array.length;
|
||||
}
|
||||
|
||||
public long flag(int i) {
|
||||
if (array[i] == null)
|
||||
return 0;
|
||||
|
||||
return array[i].flags();
|
||||
}
|
||||
|
||||
/** Returns the position data for the given document index,
|
||||
* may be null if the term is not in the document
|
||||
*/
|
||||
@Nullable
|
||||
public GammaCodedSequence position(int i) {
|
||||
if (array[i] == null)
|
||||
return null;
|
||||
|
||||
return array[i].positions();
|
||||
}
|
||||
|
||||
public TermData[] array() {
|
||||
return array;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (obj == this) return true;
|
||||
if (obj == null || obj.getClass() != this.getClass()) return false;
|
||||
var that = (TermMetadataList) obj;
|
||||
return Arrays.equals(this.array, that.array);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Arrays.hashCode(array);
|
||||
}
|
||||
|
||||
}
|
@ -1,5 +1,7 @@
|
||||
package nu.marginalia.ranking.results;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
@ -14,6 +16,7 @@ import nu.marginalia.ranking.results.factors.*;
|
||||
|
||||
import com.google.inject.Inject;
|
||||
import com.google.inject.Singleton;
|
||||
import nu.marginalia.sequence.GammaCodedSequence;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
@ -33,15 +36,15 @@ public class ResultValuator {
|
||||
this.termCoherenceFactor = termCoherenceFactor;
|
||||
}
|
||||
|
||||
public double calculateSearchResultValue(CompiledQueryLong wordMeta,
|
||||
long documentMetadata,
|
||||
public double calculateSearchResultValue(CompiledQueryLong wordFlagsQuery,
|
||||
CompiledQueryInt positionsCountQuery, CompiledQuery<GammaCodedSequence> positionsQuery, long documentMetadata,
|
||||
int features,
|
||||
int length,
|
||||
ResultRankingContext ctx,
|
||||
@Nullable Consumer<ResultRankingDetails> detailsConsumer
|
||||
)
|
||||
{
|
||||
if (wordMeta.isEmpty())
|
||||
if (wordFlagsQuery.isEmpty())
|
||||
return Double.MAX_VALUE;
|
||||
|
||||
if (length < 0) {
|
||||
@ -82,12 +85,11 @@ public class ResultValuator {
|
||||
+ temporalBias
|
||||
+ flagsPenalty;
|
||||
|
||||
double tcfOverlap = rankingParams.tcfOverlapWeight * termCoherenceFactor.calculateOverlap(wordMeta);
|
||||
double tcfJaccard = rankingParams.tcfJaccardWeight * termCoherenceFactor.calculateAvgMutualJaccard(wordMeta, ctx);
|
||||
// FIXME: need a weighting factor here
|
||||
double tcfAvgDist = 25. / termCoherenceFactor.calculateAvgMinDistance(positionsQuery, ctx);
|
||||
|
||||
double bM25F = rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forRegular(rankingParams.fullParams, wordMeta.data, length, ctx));
|
||||
double bM25N = rankingParams.bm25NgramWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forNgrams(rankingParams.fullParams, wordMeta.data, length, ctx));
|
||||
double bM25P = rankingParams.bm25PrioWeight * wordMeta.root.visit(new Bm25PrioGraphVisitor(rankingParams.prioParams, wordMeta.data, ctx));
|
||||
double bM25F = rankingParams.bm25FullWeight * wordFlagsQuery.root.visit(new Bm25FullGraphVisitor(rankingParams.fullParams, positionsCountQuery.data, length, ctx));
|
||||
double bM25P = rankingParams.bm25PrioWeight * wordFlagsQuery.root.visit(new Bm25PrioGraphVisitor(rankingParams.prioParams, wordFlagsQuery.data, ctx));
|
||||
|
||||
double overallPartPositive = Math.max(0, overallPart);
|
||||
double overallPartNegative = -Math.min(0, overallPart);
|
||||
@ -112,10 +114,10 @@ public class ResultValuator {
|
||||
temporalBias,
|
||||
flagsPenalty,
|
||||
overallPart,
|
||||
tcfOverlap,
|
||||
tcfJaccard,
|
||||
0,
|
||||
0,
|
||||
bM25F,
|
||||
bM25N,
|
||||
0, // FIXME: Remove from model
|
||||
bM25P)
|
||||
);
|
||||
|
||||
@ -125,8 +127,8 @@ public class ResultValuator {
|
||||
// Renormalize to 0...15, where 0 is the best possible score;
|
||||
// this is a historical artifact of the original ranking function
|
||||
double ret = normalize(
|
||||
tcfOverlap + tcfJaccard
|
||||
+ bM25F + bM25P + bM25N
|
||||
tcfAvgDist
|
||||
+ bM25F + bM25P
|
||||
+ overallPartPositive,
|
||||
overallPartNegative);
|
||||
|
||||
|
@ -13,7 +13,7 @@ import java.util.List;
|
||||
public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor {
|
||||
private static final long AVG_LENGTH = 5000;
|
||||
|
||||
private final CqDataLong wordMetaData;
|
||||
private final CqDataInt counts;
|
||||
private final CqDataInt frequencies;
|
||||
private final Bm25Parameters bm25Parameters;
|
||||
|
||||
@ -22,31 +22,16 @@ public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor {
|
||||
|
||||
private final BitSet mask;
|
||||
|
||||
private Bm25FullGraphVisitor(Bm25Parameters bm25Parameters,
|
||||
CqDataLong wordMetaData,
|
||||
public Bm25FullGraphVisitor(Bm25Parameters bm25Parameters,
|
||||
CqDataInt counts,
|
||||
int length,
|
||||
BitSet mask,
|
||||
ResultRankingContext ctx) {
|
||||
this.length = length;
|
||||
this.bm25Parameters = bm25Parameters;
|
||||
this.docCount = ctx.termFreqDocCount();
|
||||
this.wordMetaData = wordMetaData;
|
||||
this.counts = counts;
|
||||
this.frequencies = ctx.fullCounts;
|
||||
this.mask = mask;
|
||||
}
|
||||
|
||||
public static Bm25FullGraphVisitor forRegular(Bm25Parameters bm25Parameters,
|
||||
CqDataLong wordMetaData,
|
||||
int length,
|
||||
ResultRankingContext ctx) {
|
||||
return new Bm25FullGraphVisitor(bm25Parameters, wordMetaData, length, ctx.regularMask, ctx);
|
||||
}
|
||||
|
||||
public static Bm25FullGraphVisitor forNgrams(Bm25Parameters bm25Parameters,
|
||||
CqDataLong wordMetaData,
|
||||
int length,
|
||||
ResultRankingContext ctx) {
|
||||
return new Bm25FullGraphVisitor(bm25Parameters, wordMetaData, length, ctx.ngramsMask, ctx);
|
||||
this.mask = ctx.regularMask;
|
||||
}
|
||||
|
||||
@Override
|
||||
@ -73,7 +58,7 @@ public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor {
return 0;
}

double count = Long.bitCount(WordMetadata.decodePositions(wordMetaData.get(idx)));
double count = counts.get(idx);

int freq = frequencies.get(idx);
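// Editor's note (illustrative): with real position data in the index, the term frequency fed to
// BM25 is now the number of recorded positions (counts.get(idx)) rather than a popcount of the
// old position bitmask. For reference, the textbook BM25 contribution per term is roughly:
//   idf   = log(1 + (docCount - freq + 0.5) / (freq + 0.5))
//   tf'   = count * (k1 + 1) / (count + k1 * (1 - b + b * length / AVG_LENGTH))
//   score = idf * tf'
// where k1 and b come from Bm25Parameters; the exact variant used by this visitor may differ.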
@ -1,66 +1,44 @@
|
||||
package nu.marginalia.ranking.results.factors;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||
import nu.marginalia.model.idx.WordMetadata;
|
||||
import nu.marginalia.sequence.GammaCodedSequence;
|
||||
import nu.marginalia.sequence.SequenceOperations;
|
||||
|
||||
/** Rewards documents where terms appear frequently within the same sentences
|
||||
*/
|
||||
public class TermCoherenceFactor {
|
||||
|
||||
/** Calculate a factor that rewards the best total position overlap
|
||||
* between the terms in the query. This is high when all the terms
|
||||
* found in the same sentences.
|
||||
*/
|
||||
public double calculateOverlap(CompiledQueryLong wordMetadataQuery) {
|
||||
if (wordMetadataQuery.size() < 2)
|
||||
return 0;
|
||||
|
||||
long mask = CompiledQueryAggregates.longBitmaskAggregate(wordMetadataQuery,
|
||||
score -> score >>> WordMetadata.POSITIONS_SHIFT);
|
||||
|
||||
return bitsSetFactor(mask);
|
||||
}
|
||||
|
||||
/** Calculate a factor that rewards the best average mutual Jaccard index
|
||||
* between the terms in the query. This is high when the several terms are frequently
|
||||
* found in the same sentences.
|
||||
*/
|
||||
public double calculateAvgMutualJaccard(CompiledQueryLong wordMetadataQuery, ResultRankingContext ctx) {
|
||||
public double calculateAvgMinDistance(CompiledQuery<GammaCodedSequence> positions, ResultRankingContext ctx) {
|
||||
double sum = 0;
|
||||
int cnt = 0;
|
||||
|
||||
for (int i = 0; i < wordMetadataQuery.size(); i++) {
|
||||
for (int i = 0; i < positions.size(); i++) {
|
||||
|
||||
// Skip terms that are not in the regular mask
|
||||
if (!ctx.regularMask.get(i))
|
||||
continue;
|
||||
|
||||
long imask = WordMetadata.decodePositions(wordMetadataQuery.at(i));
|
||||
var posi = positions.at(i);
|
||||
|
||||
// Skip terms that are not in the document
|
||||
if (imask == 0L)
|
||||
if (posi == null)
|
||||
continue;
|
||||
|
||||
for (int j = i + 1; j < wordMetadataQuery.size(); j++) {
|
||||
for (int j = i + 1; j < positions.size(); j++) {
|
||||
|
||||
// Skip terms that are not in the regular mask
|
||||
if (!ctx.regularMask.get(j))
|
||||
continue;
|
||||
|
||||
long jmask = WordMetadata.decodePositions(wordMetadataQuery.at(j));
|
||||
var posj = positions.at(j);
|
||||
|
||||
// Skip terms that are not in the document
|
||||
if (jmask == 0L)
|
||||
if (posj == null)
|
||||
continue;
|
||||
|
||||
long quot = Long.bitCount(imask & jmask);
long rem = Long.bitCount(imask | jmask);

// rem is always > 0 because imask and jmask are not both 0

sum += quot/(double) rem;
int distance = SequenceOperations.minDistance(posi.iterator(), posj.iterator());
sum += distance;
cnt++;
}
}
@ -68,15 +46,8 @@
if (cnt > 0) {
return sum / cnt;
} else {
return 0;
return 1000.;
}
}

double bitsSetFactor(long mask) {
final int bitsSetInMask = Long.bitCount(mask);

return Math.pow(bitsSetInMask/(double) WordMetadata.POSITIONS_COUNT, 0.25);
}

}
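// Editor's note (illustrative, not part of the patch): the new coherence signal averages, over
// every pair of query terms present in the document, the smallest gap between any two of their
// positions; the valuator then folds it in as 25 / avgMinDistance, so far-apart terms contribute
// little. A minimal standalone rendition of the pairwise minimum distance for two sorted
// position arrays (the real code walks GammaCodedSequence iterators instead):
static int minDistance(int[] a, int[] b) {
    int i = 0, j = 0, best = Integer.MAX_VALUE;
    while (i < a.length && j < b.length) {
        best = Math.min(best, Math.abs(a[i] - b[j]));
        if (a[i] < b[j]) i++; else j++;   // advance whichever position lags behind
    }
    return best;
}
// minDistance(new int[]{3, 17}, new int[]{5, 40}) == 2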
382 code/index/test/nu/marginalia/index/CombinedIndexReaderTest.java Normal file
@ -0,0 +1,382 @@
|
||||
package nu.marginalia.index;
|
||||
|
||||
import com.google.inject.Guice;
|
||||
import com.google.inject.Inject;
|
||||
import it.unimi.dsi.fastutil.ints.IntList;
|
||||
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
||||
import it.unimi.dsi.fastutil.longs.LongList;
|
||||
import nu.marginalia.IndexLocations;
|
||||
import nu.marginalia.array.page.LongQueryBuffer;
|
||||
import nu.marginalia.hash.MurmurHash3_128;
|
||||
import nu.marginalia.index.construction.DocIdRewriter;
|
||||
import nu.marginalia.index.construction.ReverseIndexConstructor;
|
||||
import nu.marginalia.index.domainrankings.DomainRankings;
|
||||
import nu.marginalia.index.forward.ForwardIndexConverter;
|
||||
import nu.marginalia.index.forward.ForwardIndexFileNames;
|
||||
import nu.marginalia.index.index.CombinedIndexReader;
|
||||
import nu.marginalia.index.index.StatefulIndex;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryData;
|
||||
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
|
||||
import nu.marginalia.index.journal.reader.IndexJournalReader;
|
||||
import nu.marginalia.index.journal.writer.IndexJournalWriter;
|
||||
import nu.marginalia.index.positions.TermData;
|
||||
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
||||
import nu.marginalia.linkdb.docs.DocumentDbReader;
|
||||
import nu.marginalia.linkdb.docs.DocumentDbWriter;
|
||||
import nu.marginalia.linkdb.model.DocdbUrlDetail;
|
||||
import nu.marginalia.model.EdgeUrl;
|
||||
import nu.marginalia.model.id.UrlIdCodec;
|
||||
import nu.marginalia.model.idx.DocumentFlags;
|
||||
import nu.marginalia.model.idx.DocumentMetadata;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
import nu.marginalia.model.idx.WordMetadata;
|
||||
import nu.marginalia.process.control.FakeProcessHeartbeat;
|
||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
||||
import nu.marginalia.sequence.GammaCodedSequence;
|
||||
import nu.marginalia.service.control.ServiceHeartbeat;
|
||||
import nu.marginalia.service.server.Initialization;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import org.junit.jupiter.api.AfterEach;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.parallel.Execution;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.lang.foreign.Arena;
|
||||
import java.net.URISyntaxException;
|
||||
import java.nio.ByteBuffer;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
|
||||
import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD;
|
||||
|
||||
@Execution(SAME_THREAD)
|
||||
public class CombinedIndexReaderTest {
|
||||
|
||||
@Inject
|
||||
Initialization initialization;
|
||||
|
||||
IndexQueryServiceIntegrationTestModule testModule;
|
||||
|
||||
@Inject
|
||||
StatefulIndex statefulIndex;
|
||||
|
||||
@Inject
|
||||
IndexJournalWriter indexJournalWriter;
|
||||
|
||||
@Inject
|
||||
FileStorageService fileStorageService;
|
||||
|
||||
@Inject
|
||||
DomainRankings domainRankings;
|
||||
|
||||
@Inject
|
||||
ProcessHeartbeat processHeartbeat;
|
||||
@Inject
|
||||
DocumentDbReader documentDbReader;
|
||||
|
||||
@Inject
|
||||
IndexFactory indexFactory;
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws IOException {
|
||||
|
||||
testModule = new IndexQueryServiceIntegrationTestModule();
|
||||
Guice.createInjector(testModule).injectMembers(this);
|
||||
|
||||
initialization.setReady();
|
||||
}
|
||||
|
||||
@AfterEach
|
||||
public void tearDown() throws IOException {
|
||||
testModule.cleanUp();
|
||||
}
|
||||
|
||||
private final MockDocumentMeta anyMetadata = new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class)));
|
||||
|
||||
@Test
|
||||
public void testSimpleRetrieval() throws Exception {
|
||||
new MockData().add(
|
||||
d(1, 1),
|
||||
anyMetadata,
|
||||
w("hello", WordFlags.Title, 33, 55),
|
||||
w("world", WordFlags.Subjects, 34)
|
||||
).load();
|
||||
|
||||
var reader = indexFactory.getCombinedIndexReader();
|
||||
var query = reader.findFullWord(kw("hello")).build();
|
||||
|
||||
var buffer = new LongQueryBuffer(32);
|
||||
query.getMoreResults(buffer);
|
||||
|
||||
assertEquals(
|
||||
List.of(d(1, 1)),
|
||||
decode(buffer)
|
||||
);
|
||||
|
||||
var helloMeta = td(reader, kw("hello"), d(1, 1));
|
||||
assertEquals(helloMeta.flags(), WordFlags.Title.asBit());
|
||||
assertEquals(IntList.of(33, 55), helloMeta.positions().values());
|
||||
|
||||
var worldMeta = td(reader, kw("world"), d(1, 1));
|
||||
assertEquals(worldMeta.flags(), WordFlags.Subjects.asBit());
|
||||
assertEquals(IntList.of(34), worldMeta.positions().values());
|
||||
}
|
||||
|
||||
TermData td(CombinedIndexReader reader, long wordId, MockDataDocument docId) {
|
||||
return (reader.getTermMetadata(Arena.global(), wordId, new CombinedDocIdList(docId.docId())).array())[0];
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void testUnionRetrieval() throws Exception {
|
||||
new MockData()
|
||||
.add(
|
||||
d(1, 1),
|
||||
anyMetadata,
|
||||
w("hello", WordFlags.Title),
|
||||
w("world", WordFlags.Title)
|
||||
)
|
||||
.add(
|
||||
d(1, 2),
|
||||
anyMetadata,
|
||||
w("world", WordFlags.Title)
|
||||
)
|
||||
.add(
|
||||
d(1, 3),
|
||||
anyMetadata,
|
||||
w("world", WordFlags.Title)
|
||||
)
|
||||
.add(
|
||||
d(2, 4),
|
||||
anyMetadata,
|
||||
w("hello", WordFlags.Title),
|
||||
w("world", WordFlags.Title)
|
||||
)
|
||||
.load();
|
||||
|
||||
var reader = indexFactory.getCombinedIndexReader();
|
||||
var query = reader
|
||||
.findFullWord(kw("hello"))
|
||||
.also(kw("world"))
|
||||
.build();
|
||||
|
||||
var buffer = new LongQueryBuffer(32);
|
||||
query.getMoreResults(buffer);
|
||||
|
||||
assertEquals(
|
||||
List.of(d(1, 1), d(2, 4)),
|
||||
decode(buffer)
|
||||
);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNotFilterRetrieval() throws Exception {
|
||||
new MockData()
|
||||
.add(
|
||||
d(1, 1),
|
||||
anyMetadata,
|
||||
w("hello", WordFlags.Title),
|
||||
w("world", WordFlags.Title),
|
||||
w("goodbye", WordFlags.Title)
|
||||
)
|
||||
.add(
|
||||
d(1, 2),
|
||||
anyMetadata,
|
||||
w("world", WordFlags.Title)
|
||||
)
|
||||
.add(
|
||||
d(1, 3),
|
||||
anyMetadata,
|
||||
w("world", WordFlags.Title)
|
||||
)
|
||||
.add(
|
||||
d(2, 4),
|
||||
anyMetadata,
|
||||
w("hello", WordFlags.Title),
|
||||
w("world", WordFlags.Title)
|
||||
)
|
||||
.load();
|
||||
|
||||
var reader = indexFactory.getCombinedIndexReader();
|
||||
var query = reader.findFullWord(kw("hello"))
|
||||
.also(kw("world"))
|
||||
.not(kw("goodbye"))
|
||||
.build();
|
||||
|
||||
var buffer = new LongQueryBuffer(32);
|
||||
query.getMoreResults(buffer);
|
||||
|
||||
assertEquals(
|
||||
List.of(d(2, 4)),
|
||||
decode(buffer)
|
||||
);
|
||||
}
|
||||
|
||||
List<MockDataDocument> decode(LongQueryBuffer buffer) {
|
||||
List<MockDataDocument> result = new ArrayList<>();
|
||||
for (int i = 0; i < buffer.size(); i++) {
|
||||
result.add(new MockDataDocument(buffer.data.get(i)));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
private MockDataDocument d(int domainId, int ordinal) {
|
||||
return new MockDataDocument(domainId, ordinal);
|
||||
}
|
||||
|
||||
private void constructIndex() throws IOException {
|
||||
createForwardIndex();
|
||||
createFullReverseIndex();
|
||||
createPrioReverseIndex();
|
||||
}
|
||||
|
||||
private void createFullReverseIndex() throws IOException {
|
||||
|
||||
Path outputFileDocs = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.NEXT);
|
||||
Path outputFileWords = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.NEXT);
|
||||
Path outputFilePositions = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.NEXT);
|
||||
|
||||
Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
|
||||
Path tmpDir = workDir.resolve("tmp");
|
||||
|
||||
if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
|
||||
|
||||
var constructor =
|
||||
new ReverseIndexConstructor(
|
||||
outputFileDocs,
|
||||
outputFileWords,
|
||||
outputFilePositions,
|
||||
IndexJournalReader::singleFile,
|
||||
DocIdRewriter.identity(),
|
||||
tmpDir);
|
||||
constructor.createReverseIndex(new FakeProcessHeartbeat(), "name", workDir);
|
||||
}
|
||||
|
||||
private void createPrioReverseIndex() throws IOException {
|
||||
|
||||
Path outputFileDocs = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.NEXT);
|
||||
Path outputFileWords = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.NEXT);
|
||||
Path outputFilePositions = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.POSITIONS, ReverseIndexPrioFileNames.FileVersion.NEXT);
|
||||
Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
|
||||
Path tmpDir = workDir.resolve("tmp");
|
||||
|
||||
if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
|
||||
|
||||
var constructor = new ReverseIndexConstructor(
|
||||
outputFileDocs,
|
||||
outputFileWords,
|
||||
outputFilePositions,
|
||||
IndexJournalReader::singleFile,
|
||||
DocIdRewriter.identity(),
|
||||
tmpDir);
|
||||
|
||||
constructor.createReverseIndex(new FakeProcessHeartbeat(), "name", workDir);
|
||||
}
|
||||
|
||||
private void createForwardIndex() throws IOException {
|
||||
|
||||
Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
|
||||
Path outputFileDocsId = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT);
|
||||
Path outputFileDocsData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT);
|
||||
|
||||
ForwardIndexConverter converter = new ForwardIndexConverter(processHeartbeat,
|
||||
IndexJournalReader.paging(workDir),
|
||||
outputFileDocsId,
|
||||
outputFileDocsData,
|
||||
domainRankings
|
||||
);
|
||||
|
||||
converter.convert();
|
||||
}
|
||||
|
||||
MurmurHash3_128 hasher = new MurmurHash3_128();
|
||||
|
||||
long kw(String s) {
|
||||
return hasher.hashKeyword(s);
|
||||
}
|
||||
|
||||
class MockData {
|
||||
private final Map<Long, List<MockDataKeyword>> allData = new HashMap<>();
|
||||
private final Map<Long, MockDocumentMeta> metaByDoc = new HashMap<>();
|
||||
|
||||
public MockData add(MockDataDocument document,
|
||||
MockDocumentMeta meta,
|
||||
MockDataKeyword... words)
|
||||
{
|
||||
long id = UrlIdCodec.encodeId(document.domainId, document.ordinal);
|
||||
|
||||
allData.computeIfAbsent(id, l -> new ArrayList<>()).addAll(List.of(words));
|
||||
metaByDoc.put(id, meta);
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
void load() throws IOException, SQLException, URISyntaxException {
|
||||
allData.forEach((doc, words) -> {
|
||||
|
||||
var meta = metaByDoc.get(doc);
|
||||
|
||||
var header = new IndexJournalEntryHeader(
|
||||
doc,
|
||||
meta.features,
|
||||
100,
|
||||
meta.documentMetadata.encode()
|
||||
);
|
||||
|
||||
String[] keywords = words.stream().map(w -> w.keyword).toArray(String[]::new);
|
||||
long[] metadata = words.stream().map(w -> w.termMetadata).mapToLong(Long::longValue).toArray();
|
||||
var positions = words.stream().map(w -> w.positions).map(pos -> GammaCodedSequence.generate(ByteBuffer.allocate(1024), pos.toIntArray())).toArray(GammaCodedSequence[]::new);
|
||||
|
||||
indexJournalWriter.put(header,
|
||||
new IndexJournalEntryData(keywords, metadata, positions));
|
||||
});
|
||||
|
||||
var linkdbWriter = new DocumentDbWriter(
|
||||
IndexLocations.getLinkdbLivePath(fileStorageService).resolve(DOCDB_FILE_NAME)
|
||||
);
|
||||
for (Long key : allData.keySet()) {
|
||||
linkdbWriter.add(new DocdbUrlDetail(
|
||||
key,
|
||||
new EdgeUrl("https://www.example.com"),
|
||||
"test",
|
||||
"test",
|
||||
0.,
|
||||
"HTML5",
|
||||
0,
|
||||
null,
|
||||
0,
|
||||
5
|
||||
));
|
||||
}
|
||||
linkdbWriter.close();
|
||||
|
||||
indexJournalWriter.close();
|
||||
constructIndex();
|
||||
documentDbReader.reconnect();
|
||||
statefulIndex.switchIndex();
|
||||
}
|
||||
}
|
||||
|
||||
record MockDataDocument(int domainId, int ordinal) {
|
||||
public MockDataDocument(long encodedId) {
|
||||
this(UrlIdCodec.getDomainId(encodedId), UrlIdCodec.getDocumentOrdinal(encodedId));
|
||||
}
|
||||
|
||||
public long docId() {
|
||||
return UrlIdCodec.encodeId(domainId, ordinal);
|
||||
}
|
||||
|
||||
}
|
||||
record MockDocumentMeta(int features, DocumentMetadata documentMetadata) {}
|
||||
record MockDataKeyword(String keyword, long termMetadata, IntList positions) {}
|
||||
|
||||
MockDataKeyword w(String keyword, WordFlags flags, int... positions) {
|
||||
return new MockDataKeyword(keyword, new WordMetadata(0L, EnumSet.of(flags)).encode(), IntList.of(positions));
|
||||
|
||||
}
|
||||
}
|
@ -13,7 +13,6 @@ import nu.marginalia.process.control.FakeProcessHeartbeat;
|
||||
import nu.marginalia.process.control.ProcessHeartbeat;
|
||||
import nu.marginalia.sequence.GammaCodedSequence;
|
||||
import nu.marginalia.storage.FileStorageService;
|
||||
import nu.marginalia.hash.MurmurHash3_128;
|
||||
import nu.marginalia.index.construction.DocIdRewriter;
|
||||
import nu.marginalia.index.construction.ReverseIndexConstructor;
|
||||
import nu.marginalia.index.forward.ForwardIndexConverter;
|
||||
@ -142,6 +141,53 @@ public class IndexQueryServiceIntegrationSmokeTest {
|
||||
Assertions.assertArrayEquals(ids, actual);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSimple() throws Exception {
|
||||
var linkdbWriter = new DocumentDbWriter(
|
||||
IndexLocations.getLinkdbLivePath(fileStorageService)
|
||||
.resolve(DOCDB_FILE_NAME)
|
||||
);
|
||||
for (int i = 1; i < 512; i++) {
|
||||
loadData(linkdbWriter, i);
|
||||
}
|
||||
linkdbWriter.close();
|
||||
documentDbReader.reconnect();
|
||||
|
||||
indexJournalWriter.close();
|
||||
constructIndex();
|
||||
statefulIndex.switchIndex();
|
||||
|
||||
var rsp = queryService.justQuery(
|
||||
SearchSpecification.builder()
|
||||
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
|
||||
.queryStrategy(QueryStrategy.SENTENCE)
|
||||
.year(SpecificationLimit.none())
|
||||
.quality(SpecificationLimit.none())
|
||||
.size(SpecificationLimit.none())
|
||||
.rank(SpecificationLimit.none())
|
||||
.rankingParams(ResultRankingParameters.sensibleDefaults())
|
||||
.domains(new ArrayList<>())
|
||||
.searchSetIdentifier("NONE")
|
||||
.query(
|
||||
SearchQuery.builder("2")
|
||||
.include("2")
|
||||
.build()
|
||||
).build()
|
||||
);
|
||||
|
||||
int[] idxes = new int[] { 62, 222, 382, 60, 124, 220, 284, 380, 444, 122 };
|
||||
long[] ids = IntStream.of(idxes).mapToLong(Long::valueOf).toArray();
|
||||
long[] actual = rsp.results
|
||||
.stream()
|
||||
.mapToLong(i -> i.rawIndexResult.getDocumentId())
|
||||
.map(UrlIdCodec::getDocumentOrdinal)
|
||||
.toArray();
|
||||
|
||||
System.out.println(Arrays.toString(actual));
|
||||
System.out.println(Arrays.toString(ids));
|
||||
Assertions.assertArrayEquals(ids, actual);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testDomainQuery() throws Exception {
|
||||
|
||||
@ -297,7 +343,6 @@ public class IndexQueryServiceIntegrationSmokeTest {
|
||||
return UrlIdCodec.encodeId((32 - (id % 32)), id);
|
||||
}
|
||||
|
||||
MurmurHash3_128 hasher = new MurmurHash3_128();
|
||||
@SneakyThrows
|
||||
public void loadData(DocumentDbWriter ldbw, int id) {
|
||||
int[] factors = IntStream
|
||||
@ -305,22 +350,44 @@ public class IndexQueryServiceIntegrationSmokeTest {
|
||||
.filter(v -> (id % v) == 0)
|
||||
.toArray();
|
||||
|
||||
System.out.println("id:" + id + " factors: " + Arrays.toString(factors));
|
||||
|
||||
long fullId = fullId(id);
|
||||
|
||||
var header = new IndexJournalEntryHeader(factors.length, 0, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
|
||||
|
||||
long[] data = new long[factors.length * 2];
|
||||
for (int i = 0; i < factors.length; i++) {
|
||||
data[2 * i] = hasher.hashNearlyASCII(Integer.toString(factors[i]));
|
||||
data[2 * i + 1] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
|
||||
}
|
||||
var header = new IndexJournalEntryHeader(factors.length, 0, 100, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
|
||||
|
||||
ldbw.add(new DocdbUrlDetail(
|
||||
fullId, new EdgeUrl("https://www.example.com/"+id),
|
||||
"test", "test", 0., "HTML5", 0, null, 0, 10
|
||||
));
|
||||
|
||||
String[] keywords = IntStream.range(0, factors.length).mapToObj(Integer::toString).toArray(String[]::new);
|
||||
String[] keywords = IntStream.of(factors).mapToObj(Integer::toString).toArray(String[]::new);
|
||||
long[] metadata = new long[factors.length];
|
||||
for (int i = 0; i < factors.length; i++) {
|
||||
metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
|
||||
}
|
||||
GammaCodedSequence[] positions = new GammaCodedSequence[factors.length];
|
||||
ByteBuffer wa = ByteBuffer.allocate(32);
|
||||
for (int i = 0; i < factors.length; i++) {
|
||||
positions[i] = GammaCodedSequence.generate(wa, factors);
|
||||
}
|
||||
|
||||
indexJournalWriter.put(header, new IndexJournalEntryData(keywords, metadata, positions));
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void loadDataWithDomain(DocumentDbWriter ldbw, int domain, int id) {
|
||||
int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
|
||||
long fullId = UrlIdCodec.encodeId(domain, id);
|
||||
var header = new IndexJournalEntryHeader(factors.length, 0, 100, fullId, DocumentMetadata.defaultValue());
|
||||
|
||||
ldbw.add(new DocdbUrlDetail(
|
||||
fullId, new EdgeUrl("https://www.example.com/"+id),
|
||||
"test", "test", 0., "HTML5", 0, null, 0, 10
|
||||
));
|
||||
|
||||
|
||||
String[] keywords = IntStream.of(factors).mapToObj(Integer::toString).toArray(String[]::new);
|
||||
long[] metadata = new long[factors.length];
|
||||
for (int i = 0; i < factors.length; i++) {
|
||||
metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
|
||||
@ -334,30 +401,4 @@ public class IndexQueryServiceIntegrationSmokeTest {
|
||||
indexJournalWriter.put(header, new IndexJournalEntryData(keywords, metadata, positions));
|
||||
}
|
||||
|
||||
@SneakyThrows
|
||||
public void loadDataWithDomain(DocumentDbWriter ldbw, int domain, int id) {
|
||||
int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
|
||||
long fullId = UrlIdCodec.encodeId(domain, id);
|
||||
var header = new IndexJournalEntryHeader(factors.length, 0, fullId, DocumentMetadata.defaultValue());
|
||||
|
||||
ldbw.add(new DocdbUrlDetail(
|
||||
fullId, new EdgeUrl("https://www.example.com/"+id),
|
||||
"test", "test", 0., "HTML5", 0, null, 0, 10
|
||||
));
|
||||
|
||||
|
||||
String[] keywords = IntStream.range(0, factors.length).mapToObj(Integer::toString).toArray(String[]::new);
|
||||
long[] metadata = new long[factors.length];
|
||||
for (int i = 0; i < factors.length; i++) {
|
||||
metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
|
||||
}
|
||||
GammaCodedSequence[] positions = new GammaCodedSequence[factors.length];
|
||||
ByteBuffer wa = ByteBuffer.allocate(16);
|
||||
for (int i = 0; i < factors.length; i++) {
|
||||
positions[i] = GammaCodedSequence.generate(wa, i);
|
||||
}
|
||||
|
||||
indexJournalWriter.put(header, new IndexJournalEntryData(keywords, metadata, positions));
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -565,6 +565,7 @@ public class IndexQueryServiceIntegrationTest {
|
||||
var header = new IndexJournalEntryHeader(
|
||||
doc,
|
||||
meta.features,
|
||||
100,
|
||||
meta.documentMetadata.encode()
|
||||
);
|
||||
|
||||
|
@ -1,100 +0,0 @@
|
||||
package nu.marginalia.ranking.results;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
|
||||
import nu.marginalia.model.idx.DocumentFlags;
|
||||
import nu.marginalia.model.idx.WordFlags;
|
||||
import nu.marginalia.model.crawl.PubDate;
|
||||
import nu.marginalia.model.idx.DocumentMetadata;
|
||||
import nu.marginalia.model.idx.WordMetadata;
|
||||
import nu.marginalia.ranking.results.factors.*;
|
||||
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.mockito.Mockito;
|
||||
|
||||
import java.util.*;
|
||||
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
class ResultValuatorTest {
|
||||
|
||||
TermFrequencyDict dict;
|
||||
ResultValuator valuator;
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() {
|
||||
|
||||
dict = Mockito.mock(TermFrequencyDict.class);
|
||||
when(dict.docCount()).thenReturn(100_000);
|
||||
|
||||
valuator = new ResultValuator(
|
||||
new TermCoherenceFactor()
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
CqDataInt frequencyData = new CqDataInt(new int[] { 10 });
|
||||
|
||||
CompiledQueryLong titleOnlyLowCountSet = CompiledQuery.just(
|
||||
new SearchResultKeywordScore("bob", 1,
|
||||
wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title)))
|
||||
).mapToLong(SearchResultKeywordScore::encodedWordMetadata);
|
||||
|
||||
CompiledQueryLong highCountNoTitleSet = CompiledQuery.just(
|
||||
new SearchResultKeywordScore("bob", 1,
|
||||
wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh)))
|
||||
).mapToLong(SearchResultKeywordScore::encodedWordMetadata);;
|
||||
|
||||
CompiledQueryLong highCountSubjectSet = CompiledQuery.just(
|
||||
new SearchResultKeywordScore("bob", 1,
|
||||
wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects)))
|
||||
).mapToLong(SearchResultKeywordScore::encodedWordMetadata);;
|
||||
|
||||
|
||||
@Test
|
||||
void evaluateTerms() {
|
||||
|
||||
when(dict.getTermFreq("bob")).thenReturn(10);
|
||||
ResultRankingContext context = new ResultRankingContext(100000,
|
||||
ResultRankingParameters.sensibleDefaults(),
|
||||
new BitSet(),
|
||||
new BitSet(),
|
||||
frequencyData,
|
||||
frequencyData);
|
||||
|
||||
long docMeta = docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class));
|
||||
int features = 0;
|
||||
|
||||
double titleOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, docMeta, features, 10_000, context, null);
|
||||
double titleLongOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, docMeta, features, 10_000, context, null);
|
||||
double highCountNoTitle = valuator.calculateSearchResultValue(highCountNoTitleSet, docMeta, features, 10_000, context, null);
|
||||
double highCountSubject = valuator.calculateSearchResultValue(highCountSubjectSet, docMeta, features, 10_000, context, null);
|
||||
|
||||
System.out.println(titleOnlyLowCount);
|
||||
System.out.println(titleLongOnlyLowCount);
|
||||
System.out.println(highCountNoTitle);
|
||||
System.out.println(highCountSubject);
|
||||
}
|
||||
|
||||
private long docMetadata(int topology,
|
||||
int year,
|
||||
int quality,
|
||||
EnumSet<DocumentFlags> flags) {
|
||||
return new DocumentMetadata(topology, PubDate.toYearByte(year), quality, flags).encode();
|
||||
}
|
||||
|
||||
private long wordMetadata(Set<Integer> positions, Set<WordFlags> wordFlags) {
|
||||
long posBits = positions.stream()
|
||||
.mapToLong(i -> ((1L << i) & 0xFF_FFFF_FFFF_FFFFL))
|
||||
.reduce((a,b) -> a|b)
|
||||
.orElse(0L);
|
||||
|
||||
return new WordMetadata(posBits, wordFlags).encode();
|
||||
}
|
||||
|
||||
}
|
@ -1,107 +0,0 @@
|
||||
package nu.marginalia.ranking.results.factors;
|
||||
|
||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
|
||||
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
|
||||
import nu.marginalia.bbpc.BrailleBlockPunchCards;
|
||||
import nu.marginalia.model.idx.WordMetadata;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
class TermCoherenceFactorTest {
|
||||
|
||||
TermCoherenceFactor termCoherenceFactor = new TermCoherenceFactor();
|
||||
@Test
|
||||
public void testAllBitsSet() {
|
||||
var allPositionsSet = createSet(
|
||||
~0L,
|
||||
~0L
|
||||
);
|
||||
|
||||
long mask = CompiledQueryAggregates.longBitmaskAggregate(
|
||||
allPositionsSet,
|
||||
SearchResultKeywordScore::positions
|
||||
);
|
||||
|
||||
assertEquals(1.0, termCoherenceFactor.bitsSetFactor(mask), 0.01);
|
||||
|
||||
assertEquals(1.0,
|
||||
termCoherenceFactor.calculateOverlap(
|
||||
allPositionsSet.mapToLong(SearchResultKeywordScore::encodedWordMetadata)
|
||||
)
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNoBitsSet() {
|
||||
var allPositionsSet = createSet(
|
||||
0, 0
|
||||
);
|
||||
|
||||
long mask = CompiledQueryAggregates.longBitmaskAggregate(allPositionsSet, score -> score.positions() & WordMetadata.POSITIONS_MASK);
|
||||
|
||||
assertEquals(0, termCoherenceFactor.bitsSetFactor(mask), 0.01);
|
||||
|
||||
assertEquals(0, termCoherenceFactor.calculateOverlap(allPositionsSet.mapToLong(SearchResultKeywordScore::encodedWordMetadata)));
|
||||
}
|
||||
|
||||
@Test @SuppressWarnings("unchecked")
|
||||
public void testLowPosMatches() {
|
||||
var positions = createSet(
|
||||
List.of(0, 1, 2, 3), List.of(0, 1, 2, 3)
|
||||
);
|
||||
|
||||
long mask = CompiledQueryAggregates.longBitmaskAggregate(positions, score -> score.positions() & WordMetadata.POSITIONS_MASK);
|
||||
printMask(mask);
|
||||
|
||||
}
|
||||
|
||||
@Test @SuppressWarnings("unchecked")
|
||||
public void testHiPosMatches() {
|
||||
var positions = createSet(
|
||||
List.of(55, 54, 53, 52), List.of(55, 54, 53, 52)
|
||||
);
|
||||
|
||||
long mask = CompiledQueryAggregates.longBitmaskAggregate(positions, score -> score.positions() & WordMetadata.POSITIONS_MASK);
|
||||
printMask(mask);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testBitMatchScaling() {
|
||||
for (int i = 1; i < 48; i++) {
|
||||
System.out.println(i + ":" + termCoherenceFactor.bitsSetFactor((1L << i) - 1));
|
||||
}
|
||||
}
|
||||
|
||||
void printMask(long mask) {
|
||||
System.out.println(BrailleBlockPunchCards.printBits(mask, 48));
|
||||
}
|
||||
|
||||
CompiledQuery<SearchResultKeywordScore> createSet(List<Integer>... maskPositions) {
|
||||
long[] positions = new long[maskPositions.length];
|
||||
|
||||
for (int i = 0; i < maskPositions.length; i++) {
|
||||
for (long pos : maskPositions[i]) {
|
||||
positions[i] |= (1L<<pos);
|
||||
}
|
||||
}
|
||||
|
||||
return createSet(positions);
|
||||
}
|
||||
|
||||
CompiledQuery<SearchResultKeywordScore> createSet(long... positionMasks) {
|
||||
List<SearchResultKeywordScore> keywords = new ArrayList<>();
|
||||
|
||||
for (int i = 0; i < positionMasks.length; i++) {
|
||||
keywords.add(new SearchResultKeywordScore("", 0,
|
||||
new WordMetadata(positionMasks[i] & WordMetadata.POSITIONS_MASK, (byte) 0).encode()));
|
||||
}
|
||||
|
||||
return CompiledQuery.just(keywords.toArray(SearchResultKeywordScore[]::new));
|
||||
}
|
||||
}
|
@ -17,12 +17,13 @@ public class EliasGammaCodec implements IntIterator {
|
||||
|
||||
private final BitReader reader;
|
||||
int rem = 0;
|
||||
private int last = 0;
|
||||
private int last;
|
||||
private int next = 0;
|
||||
|
||||
private EliasGammaCodec(ByteBuffer buffer) {
|
||||
private EliasGammaCodec(ByteBuffer buffer, int zero) {
|
||||
reader = new BitReader(buffer);
|
||||
|
||||
last = zero;
|
||||
int bits = reader.takeWhileZero();
|
||||
|
||||
if (!reader.hasMore()) {
|
||||
@ -33,9 +34,24 @@ public class EliasGammaCodec implements IntIterator {
|
||||
}
|
||||
}
|
||||
|
||||
public static int readCount(ByteBuffer buffer) {
var reader = new BitReader(buffer);

if (reader.getCurrentValue() > 0) {
int bits = reader.takeWhileZero();
return reader.get(bits);
}
else {
return 0;
}
}

/** Decode a sequence of integers from a ByteBuffer using the Elias Gamma code */
public static IntIterator decode(ByteBuffer buffer) {
return new EliasGammaCodec(buffer);
return new EliasGammaCodec(buffer, 0);
}
public static IntIterator decodeWithOffset(ByteBuffer buffer, int offset) {
return new EliasGammaCodec(buffer, offset);
}
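// Editor's note (illustrative): in the classic Elias gamma code a positive integer n is written
// as floor(log2 n) zero bits followed by n in binary, so small values stay short:
//   n = 1 -> "1",   n = 5 -> "00" + "101",   n = 9 -> "000" + "1001"
// readCount() above appears to rely on the element count being the first gamma-coded value in
// the buffer: it counts the leading zeros (takeWhileZero) and then reads that many bits back.
// The exact bit layout belongs to BitReader; treat the worked values as a sketch only.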
|
||||
/** Encode a sequence of integers into a ByteBuffer using the Elias Gamma code.
|
||||
|
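
As background, the Elias gamma code writes a positive integer n as a run of zero bits followed by n's binary representation, which is broadly how readCount above recovers the element count: count the leading-zero run with takeWhileZero, then read the value bits. A toy illustration of the code itself, separate from the project's BitReader/BitWriter plumbing and not part of the commit:

// Sketch: gamma code of n = unary length prefix in zeros + binary digits of n.
static String toyGamma(int n) {
    String bin = Integer.toBinaryString(n);        // e.g. 5 -> "101"
    return "0".repeat(bin.length() - 1) + bin;     // 5 -> "00101"
}
// toyGamma(1) = "1", toyGamma(2) = "010", toyGamma(9) = "0001001"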
@ -16,6 +16,7 @@ import java.util.StringJoiner;
* */
public class GammaCodedSequence implements BinarySerializable, Iterable<Integer> {
private final ByteBuffer raw;

int startPos = 0;
int startLimit = 0;

@ -43,6 +44,12 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
startLimit = bytes.limit();
}

public GammaCodedSequence(ByteBuffer bytes, int startPos, int startLimit) {
this.raw = bytes;
this.startPos = startPos;
this.startLimit = startLimit;
}

public GammaCodedSequence(byte[] bytes) {
raw = ByteBuffer.allocate(bytes.length);
raw.put(bytes);
@ -72,6 +79,18 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
return EliasGammaCodec.decode(raw);
}

/** Return an iterator over the sequence with a constant offset applied to each value.
* This is useful for comparing sequences with different offsets, and adds zero
* extra cost to the decoding process which is already based on adding
* relative differences.
* */
public IntIterator offsetIterator(int offset) {
raw.position(startPos);
raw.limit(startLimit);

return EliasGammaCodec.decodeWithOffset(raw, offset);
}

public IntList values() {
var intItr = iterator();
IntArrayList ret = new IntArrayList(8);
@ -81,18 +100,6 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
return ret;
}

/** Decode the sequence into an IntList;
* this is a somewhat slow operation,
* iterating over the data directly more performant */
public IntList decode() {
IntArrayList ret = new IntArrayList(8);
var iter = iterator();
while (iter.hasNext()) {
ret.add(iter.nextInt());
}
return ret;
}

public int hashCode() {
return raw.hashCode();
}
@ -116,7 +123,11 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
return raw;
}

public int size() {
public int bufferSize() {
return raw.capacity();
}

public int valueCount() {
return EliasGammaCodec.readCount(buffer());
}
}
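
A minimal sketch of how the new accessors fit together, assuming GammaCodedSequence.generate behaves as in the tests further down (the buffer size and position values here are arbitrary):

// Sketch, not part of the commit.
ByteBuffer workArea = ByteBuffer.allocate(256);                       // assumed scratch buffer
GammaCodedSequence positions = GammaCodedSequence.generate(workArea, 4, 7, 11);

int n = positions.valueCount();                      // 3, read from the count prefix without decoding values
IntIterator shifted = positions.offsetIterator(-3);  // should yield 1, 4, 8 given the offset semantics above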
@ -0,0 +1,86 @@
package nu.marginalia.sequence;

import it.unimi.dsi.fastutil.ints.IntIterator;

public class SequenceOperations {

/** Return true if the sequences intersect, false otherwise.
* */
public static boolean intersectSequences(IntIterator... sequences) {

if (sequences.length <= 1)
return true;

// Initialize values and find the maximum value
int[] values = new int[sequences.length];

for (int i = 0; i < sequences.length; i++) {
if (sequences[i].hasNext())
values[i] = sequences[i].nextInt();
else
return false;
}

// Intersect the sequences by advancing all values smaller than the maximum seen so far
// until they are equal to the maximum value, or until the end of the sequence is reached
int max = Integer.MIN_VALUE;
int successes = 0;
for (int i = 0; successes < sequences.length; i = (i + 1) % sequences.length)
{
if (values[i] == max) {
successes++;
} else {
successes = 0;

// Discard values until we reach the maximum value seen so far,
// or until the end of the sequence is reached
while (values[i] < max) {
if (sequences[i].hasNext())
values[i] = sequences[i].nextInt();
else
return false;
}

// Update the maximum value, if necessary
max = Math.max(max, values[i]);
}
}

return true;
}

/** Return the minimum word distance between two sequences, or a negative value if either sequence is empty.
* */
public static int minDistance(IntIterator seqA, IntIterator seqB)
{
int minDistance = Integer.MAX_VALUE;

if (!seqA.hasNext() || !seqB.hasNext())
return -1;

int a = seqA.nextInt();
int b = seqB.nextInt();

while (true) {
int distance = Math.abs(a - b);
if (distance < minDistance)
minDistance = distance;

if (a <= b) {
if (seqA.hasNext()) {
a = seqA.nextInt();
} else {
break;
}
} else {
if (seqB.hasNext()) {
b = seqB.nextInt();
} else {
break;
}
}
}

return minDistance;
}
}
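
In other words, intersectSequences performs a k-way intersection over ascending iterators: it cycles through the cursors, advancing any cursor that sits below the largest value seen so far, and reports a match once every cursor agrees on the same value. A hedged usage sketch with plain arrays, assuming fastutil's IntIterators.wrap is available in the version used here; not part of the commit:

// Sketch: feeding ordinary int arrays through the new helpers.
import it.unimi.dsi.fastutil.ints.IntIterators;

class SequenceOperationsExample {
    public static void main(String[] args) {
        int[] a = { 1, 3, 4, 7, 8, 9, 11 };
        int[] b = { 2, 5, 8, 14 };

        // true: both sequences contain 8
        System.out.println(SequenceOperations.intersectSequences(
                IntIterators.wrap(a), IntIterators.wrap(b)));

        // 0: the shared value 8 makes the minimum distance zero
        System.out.println(SequenceOperations.minDistance(
                IntIterators.wrap(a), IntIterators.wrap(b)));
    }
}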
@ -20,6 +20,10 @@ public class BitReader {
this.currentValue = 0;
}

public long getCurrentValue() {
return currentValue;
}

/** Read the next bit from the buffer */
public boolean getBit() {
if (bitPosition <= 0) {
@ -0,0 +1,75 @@
package nu.marginalia.sequence;

import it.unimi.dsi.fastutil.ints.IntIterator;
import org.junit.jupiter.api.Test;

import java.nio.ByteBuffer;

import static org.junit.jupiter.api.Assertions.*;

class SequenceOperationsTest {

@Test
void intersectSequencesSingle() {
ByteBuffer wa = ByteBuffer.allocate(1024);
GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1);

assertTrue(SequenceOperations.intersectSequences(seq1.iterator()));
}

@Test
void intersectSequencesTrivialMatch() {
ByteBuffer wa = ByteBuffer.allocate(1024);
GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1);
GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 1);

assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator()));
}

@Test
void intersectSequencesTrivialMismatch() {
ByteBuffer wa = ByteBuffer.allocate(1024);
GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1);
GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2);

assertFalse(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator()));
}

@Test
void intersectSequencesOffsetMatch() {
ByteBuffer wa = ByteBuffer.allocate(1024);
GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1);
GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 3);

assertTrue(SequenceOperations.intersectSequences(seq1.offsetIterator(0), seq2.offsetIterator(-2)));
}

@Test
void intersectSequencesDeepMatch() {
ByteBuffer wa = ByteBuffer.allocate(1024);
GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1, 3, 4, 7, 8, 9, 11);
GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2, 5, 8, 14);

assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator()));
}

@Test
void intersectSequencesDeepMatch3() {
ByteBuffer wa = ByteBuffer.allocate(1024);
GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1, 3, 4, 7, 8, 9, 11);
GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2, 5, 8, 14);
GammaCodedSequence seq3 = GammaCodedSequence.generate(wa, 1, 5, 8, 9);

assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator(), seq3.iterator()));
}

@Test
void intersectSequencesDeepMismatch() {
ByteBuffer wa = ByteBuffer.allocate(1024);
GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1, 3, 4, 7, 8, 9, 11);
GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2, 5, 14);

assertFalse(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator()));
}

}
@ -26,6 +26,8 @@ public class DocumentRecordKeywordsProjection {
public int htmlFeatures;
public long documentMetadata;

public int length;

public List<String> words;
public TLongList metas;
public List<GammaCodedSequence> positions;
@ -39,13 +41,14 @@ public class DocumentRecordKeywordsProjection {
}

public static Collection<String> requiredColumns() {
return List.of("domain", "ordinal", "htmlFeatures", "word", "wordMeta", "documentMetadata");
return List.of("domain", "ordinal", "htmlFeatures", "word", "wordMeta", "documentMetadata", "length");
}

@SneakyThrows
public DocumentRecordKeywordsProjection add(String heading, Object value) {
switch (heading) {
case "domain" -> domain = (String) value;
case "length" -> length = (Integer) value;
case "ordinal" -> ordinal = (Integer) value;
case "htmlFeatures" -> htmlFeatures = (Integer) value;
case "documentMetadata" -> documentMetadata = (Long) value;
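
A hedged sketch of how the new column is consumed: the parquet reader asks for requiredColumns() and feeds each (column, value) pair back through add(), so the document length now travels with the keyword data. The chaining below assumes add() returns the projection itself, as its signature suggests, and the values are illustrative; not part of the commit:

// Sketch: 'projection' stands in for an instance produced by the parquet reader.
projection
    .add("domain", "example.com")
    .add("ordinal", 1)
    .add("htmlFeatures", 0)
    .add("documentMetadata", 0L)
    .add("length", 2048);   // new column, later handed to LoaderIndexJournalWriter.putWords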
@ -6,12 +6,10 @@ import lombok.SneakyThrows;
import nu.marginalia.IndexLocations;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.keyword.model.DocumentKeywords;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.index.journal.IndexJournalFileNames;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -41,18 +39,11 @@ public class LoaderIndexJournalWriter {
indexWriter = new IndexJournalWriterPagingImpl(indexArea);
}

public void putWords(long combinedId,
int features,
DocumentMetadata metadata,
DocumentKeywords wordSet) {

putWords(combinedId, features, metadata.encode(), wordSet);
}

@SneakyThrows
public void putWords(long combinedId,
int features,
long metadata,
int length,
DocumentKeywords wordSet) {

if (wordSet.isEmpty()) {
@ -65,7 +56,7 @@ public class LoaderIndexJournalWriter {
return;
}

var header = new IndexJournalEntryHeader(combinedId, features, metadata);
var header = new IndexJournalEntryHeader(combinedId, features, length, metadata);
var data = new IndexJournalEntryData(wordSet.keywords, wordSet.metadata, wordSet.positions);

indexWriter.put(header, data);
@ -75,6 +75,7 @@ public class KeywordLoaderService {
writer.putWords(combinedId,
projection.htmlFeatures,
projection.documentMetadata,
projection.length,
words);
}
}
@ -91,7 +91,7 @@ public class SearchServicePaperDoll extends AbstractModule {
long positions)
{
results.add(new DecoratedSearchResultItem(
new SearchResultItem(url.hashCode(), 2, 3, false),
new SearchResultItem(url.hashCode(), 2, 3),
new EdgeUrl(url),
title,
description,