(index) Integrate positions data with indexes WIP

This change integrates the new positions data with the forward and reverse indexes.

The ranking code is still only partially rewritten.
Viktor Lofgren 2024-06-10 15:09:06 +02:00
parent 9f982a0c3d
commit 36160988e2
58 changed files with 1417 additions and 650 deletions

View File

@ -5,8 +5,8 @@ import java.util.stream.IntStream;
/** A compiled index service query */
public class CompiledQueryInt {
private final CqExpression root;
private final CqDataInt data;
public final CqExpression root;
public final CqDataInt data;
public CompiledQueryInt(CqExpression root, CqDataInt data) {
this.root = root;
@ -26,7 +26,7 @@ public class CompiledQueryInt {
return IntStream.range(0, data.size());
}
public long at(int index) {
public int at(int index) {
return data.get(index);
}

View File

@ -61,7 +61,8 @@ public class CompiledQueryParser {
String[] cqData = new String[wordIds.size()];
wordIds.forEach((w, i) -> cqData[i] = w);
return new CompiledQuery<>(root, new CqData<>(cqData));
return root.newQuery(cqData);
}

View File

@ -8,6 +8,18 @@ import java.util.stream.Stream;
*
*/
public sealed interface CqExpression {
/** Create a new query for the provided data using this expression as the root */
default <T> CompiledQuery<T> newQuery(T[] data) {
return new CompiledQuery<>(this, data);
}
/** Create a new query for the provided data using this expression as the root */
default CompiledQueryInt newQuery(int[] data) {
return new CompiledQueryInt(this, new CqDataInt(data));
}
/** Create a new query for the provided data using this expression as the root */
default CompiledQueryLong newQuery(long[] data) {
return new CompiledQueryLong(this, new CqDataLong(data));
}
Stream<Word> stream();
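
The newQuery overloads above let a caller build each typed query variant directly from the same expression root, so the per-term data arrays stay aligned with the leaves of one tree. A minimal usage sketch; the root and the array contents are hypothetical:

// assuming `root` is the parsed CqExpression for a query with two leaves
CompiledQuery<String> terms   = root.newQuery(new String[] { "marginalia", "search" });
CompiledQueryLong     termIds = root.newQuery(new long[]   { 101L, 102L });
CompiledQueryInt      counts  = root.newQuery(new int[]    { 3, 1 });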

View File

@ -2,6 +2,7 @@ package nu.marginalia.api.searchquery.model.compiled.aggregate;
import it.unimi.dsi.fastutil.longs.LongSet;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import java.util.ArrayList;
@ -36,7 +37,10 @@ public class CompiledQueryAggregates {
public static <T> int intMaxMinAggregate(CompiledQuery<T> query, ToIntFunction<T> operator) {
return query.root.visit(new CqIntMaxMinOperator(query, operator));
}
/** Apply the operator to each leaf node, then return the highest minimum value found along any path */
public static <T> int intMaxMinAggregate(CompiledQueryInt query, IntUnaryOperator operator) {
return query.root.visit(new CqIntMaxMinOperator(query, operator));
}
/** Apply the operator to each leaf node, then return the highest minimum value found along any path */
public static int intMaxMinAggregate(CompiledQueryLong query, LongToIntFunction operator) {
return query.root.visit(new CqIntMaxMinOperator(query, operator));
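
A worked example of the max-min semantics described in the Javadoc, assuming (per the visitor's stated behaviour) that an AND node takes the minimum of its children and an OR node the maximum; the query shape and counts are made up:

// query (a AND b) OR c, with per-leaf position counts [3, 1, 2]
CompiledQueryInt countsQuery = root.newQuery(new int[] { 3, 1, 2 });
int best = CompiledQueryAggregates.intMaxMinAggregate(countsQuery, p -> p);
// AND branch: min(3, 1) = 1; OR node: max(1, 2) = 2  ->  best == 2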

View File

@ -1,6 +1,7 @@
package nu.marginalia.api.searchquery.model.compiled.aggregate;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
@ -21,7 +22,9 @@ public class CqIntMaxMinOperator implements CqExpression.IntVisitor {
public CqIntMaxMinOperator(CompiledQueryLong query, LongToIntFunction operator) {
this.operator = idx -> operator.applyAsInt(query.at(idx));
}
public CqIntMaxMinOperator(CompiledQueryInt query, IntUnaryOperator operator) {
this.operator = idx -> operator.applyAsInt(query.at(idx));
}
@Override
public int onAnd(List<? extends CqExpression> parts) {
int value = parts.getFirst().visit(this);

View File

@ -36,6 +36,10 @@ public class SearchQuery {
@Deprecated // why does this exist?
private double value = 0;
public static SearchQueryBuilder builder(String compiledQuery) {
return new SearchQueryBuilder(compiledQuery);
}
public SearchQuery() {
this.compiledQuery = "";
this.searchTermsInclude = new ArrayList<>();
@ -81,5 +85,45 @@ public class SearchQuery {
return sb.toString();
}
public static class SearchQueryBuilder {
private final String compiledQuery;
private List<String> searchTermsInclude = new ArrayList<>();
private List<String> searchTermsExclude = new ArrayList<>();
private List<String> searchTermsAdvice = new ArrayList<>();
private List<String> searchTermsPriority = new ArrayList<>();
private List<List<String>> searchTermCoherences = new ArrayList<>();
private SearchQueryBuilder(String compiledQuery) {
this.compiledQuery = compiledQuery;
}
public SearchQueryBuilder include(String... terms) {
searchTermsInclude.addAll(List.of(terms));
return this;
}
public SearchQueryBuilder exclude(String... terms) {
searchTermsExclude.addAll(List.of(terms));
return this;
}
public SearchQueryBuilder advice(String... terms) {
searchTermsAdvice.addAll(List.of(terms));
return this;
}
public SearchQueryBuilder priority(String... terms) {
searchTermsPriority.addAll(List.of(terms));
return this;
}
public SearchQueryBuilder coherences(String... coherences) {
searchTermCoherences.add(List.of(coherences));
return this;
}
public SearchQuery build() {
return new SearchQuery(compiledQuery, searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchTermCoherences);
}
}
}
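
A short usage sketch for the new builder; the query string and terms are hypothetical:

SearchQuery query = SearchQuery.builder("plato ( philosophy | dialogues )")
        .include("philosophy")
        .exclude("pdf")
        .priority("plato")
        .coherences("plato", "dialogues")  // a group of terms expected to occur together
        .build();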

View File

@ -32,13 +32,11 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
public SearchResultItem(long combinedId,
long encodedDocMetadata,
int htmlFeatures,
boolean hasPrioTerm) {
int htmlFeatures) {
this.combinedId = combinedId;
this.encodedDocMetadata = encodedDocMetadata;
this.keywordScores = new ArrayList<>();
this.htmlFeatures = htmlFeatures;
this.hasPrioTerm = hasPrioTerm;
}

View File

@ -83,8 +83,10 @@ public class ForwardIndexConverter {
int ranking = domainRankings.getRanking(domainId);
long meta = DocumentMetadata.encodeRank(pointer.documentMeta(), ranking);
long features = pointer.documentFeatures() | ((long) pointer.documentSize() << 32L);
docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta);
docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, pointer.documentFeatures());
docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, features);
}
progress.progress(TaskSteps.FORCE);

View File

@ -82,9 +82,19 @@ public class ForwardIndexReader {
long offset = idxForDoc(docId);
if (offset < 0) return 0;
return (int) data.get(ENTRY_SIZE * offset + FEATURES_OFFSET);
return (int) (data.get(ENTRY_SIZE * offset + FEATURES_OFFSET) & 0xFFFF_FFFFL);
}
public int getDocumentSize(long docId) {
assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
long offset = idxForDoc(docId);
if (offset < 0) return 0;
return (int) (data.get(ENTRY_SIZE * offset + FEATURES_OFFSET) >>> 32L);
}
private int idxForDoc(long docId) {
assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
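
The converter now packs two 32-bit fields into the single long that previously held only the features: the low half keeps the HTML features, the high half carries the document size, and the reader masks or shifts accordingly. A minimal round-trip sketch with made-up values:

int documentFeatures = 5;
int documentSize = 1_280;  // hypothetical token count

long packed = documentFeatures | ((long) documentSize << 32);

int featuresAgain = (int) (packed & 0xFFFF_FFFFL);  // what getDocumentFeatures() returns
int sizeAgain     = (int) (packed >>> 32);          // what getDocumentSize() returns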

View File

@ -79,6 +79,7 @@ class ForwardIndexConverterTest {
writer.put(
new IndexJournalEntryHeader(createId(id, id/20),
id%3,
15,
(id % 5)),
new IndexJournalEntryData(
new String[]{},

View File

@ -17,14 +17,17 @@ import nu.marginalia.model.idx.DocumentMetadata;
*/
public record IndexJournalEntryHeader(int entrySize,
int documentFeatures,
int documentSize,
long combinedId,
long documentMeta) {
public IndexJournalEntryHeader(long combinedId,
int documentFeatures,
int documentSize,
long documentMeta) {
this(-1,
documentFeatures,
documentSize,
combinedId,
documentMeta);
}

View File

@ -28,12 +28,17 @@ public class IndexJournalReadEntry implements Iterable<IndexJournalEntryTermData
public static IndexJournalReadEntry read(DataInputStream inputStream) throws IOException {
final long sizeBlock = inputStream.readLong();
final int entrySize = (int) (sizeBlock >>> 48L);
final int docSize = (int) ((sizeBlock >>> 32L) & 0xFFFFL);
final int docFeatures = (int) (sizeBlock & 0xFFFF_FFFFL);
final long docId = inputStream.readLong();
final long meta = inputStream.readLong();
var header = new IndexJournalEntryHeader(
(int) (sizeBlock >>> 32L),
(int) (sizeBlock & 0xFFFF_FFFFL),
entrySize,
docFeatures,
docSize,
docId,
meta);
@ -57,6 +62,10 @@ public class IndexJournalReadEntry implements Iterable<IndexJournalEntryTermData
return header.documentFeatures();
}
public int documentSize() {
return header.documentSize();
}
public int domainId() {
return UrlIdCodec.getDomainId(docId());
}
@ -88,7 +97,7 @@ class TermDataIterator implements Iterator<IndexJournalEntryTermData> {
public IndexJournalEntryTermData next() {
// read the metadata for the term
long termId = buffer.getLong();
long meta = buffer.getLong();
long meta = buffer.getShort();
// read the size of the sequence data
int size = buffer.get() & 0xFF;
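
The writer emits the document header as short recordSize, short documentSize, int documentFeatures followed by two longs; the reader above consumes the first eight bytes back as one big-endian long and unpacks them with shifts and masks. A worked example with made-up values:

short recordSize = 120;
short docSize = 850;
int docFeatures = 5;

long sizeBlock = ((long) recordSize << 48) | ((long) docSize << 32) | (docFeatures & 0xFFFF_FFFFL);

int entrySizeAgain = (int) (sizeBlock >>> 48);             // 120
int docSizeAgain   = (int) ((sizeBlock >>> 32) & 0xFFFFL); // 850
int featuresAgain  = (int) (sizeBlock & 0xFFFF_FFFFL);     // 5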

View File

@ -13,7 +13,7 @@ public interface IndexJournalReader {
int FILE_HEADER_SIZE_BYTES = 8 * FILE_HEADER_SIZE_LONGS;
int DOCUMENT_HEADER_SIZE_BYTES = 24;
int TERM_HEADER_SIZE_BYTES = 17;
int TERM_HEADER_SIZE_BYTES = 11;
/** Create a reader for a single file. */
static IndexJournalReader singleFile(Path fileName) throws IOException {
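
TERM_HEADER_SIZE_BYTES drops from 17 to 11 because per-term metadata is now stored as a short rather than a full long (compare the writer's putShort with the reader's getShort). The layout behind the constant, for reference:

// per-term record in the journal after this change (TERM_HEADER_SIZE_BYTES = 11):
//   long  termId         8 bytes
//   short termMetadata   2 bytes   (previously a full long, hence 17 before)
//   byte  positionsSize  1 byte
//   byte[] positions     positionsSize bytes of gamma-coded position data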

View File

@ -97,6 +97,9 @@ class SingleFileJournalPointer implements IndexJournalPointer {
@Override
public int documentFeatures() { return entry.documentFeatures(); }
@Override
public int documentSize() { return entry.documentSize(); }
/** Return an iterator over the terms in the current document.
* This iterator is not valid after calling nextDocument().
*/

View File

@ -42,6 +42,8 @@ public interface IndexJournalPointer extends Iterable<IndexJournalEntryTermData>
*/
int documentFeatures();
int documentSize();
/** Concatenate a number of journal pointers */
static IndexJournalPointer concatenate(IndexJournalPointer... pointers) {
if (pointers.length == 1)
@ -94,6 +96,11 @@ class JoiningJournalPointer implements IndexJournalPointer {
return pointers[pIndex].documentFeatures();
}
@Override
public int documentSize() {
return pointers[pIndex].documentSize();
}
@NotNull
@Override
public Iterator<IndexJournalEntryTermData> iterator() {
@ -146,6 +153,12 @@ class FilteringJournalPointer implements IndexJournalPointer {
return base.documentFeatures();
}
@Override
public int documentSize() {
return base.documentSize();
}
@NotNull
@Override
public Iterator<IndexJournalEntryTermData> iterator() {

View File

@ -2,7 +2,6 @@ package nu.marginalia.index.journal.writer;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.model.IndexJournalEntryTermData;
import java.io.IOException;

View File

@ -81,12 +81,6 @@ public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{
public int put(IndexJournalEntryHeader header,
IndexJournalEntryData data)
{
if (dataBuffer.capacity() - dataBuffer.position() < 3*8) {
dataBuffer.flip();
compressingStream.compress(dataBuffer);
dataBuffer.clear();
}
final long[] keywords = data.termIds();
final long[] metadata = data.metadata();
final var positions = data.positions();
@ -94,16 +88,30 @@ public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{
int recordSize = 0; // document header size is 3 longs
for (int i = 0; i < keywords.length; i++) {
// term header size is 2 longs
recordSize += IndexJournalReader.TERM_HEADER_SIZE_BYTES + positions[i].size();
recordSize += IndexJournalReader.TERM_HEADER_SIZE_BYTES + positions[i].bufferSize();
}
dataBuffer.putInt(recordSize);
if (recordSize > Short.MAX_VALUE) {
// This should never happen, but if it does, we should log it and deal with it in a way that doesn't corrupt the file
// (32 KB is *a lot* of data for a single document, larger than the uncompressed HTML of most documents)
logger.error("Omitting entry: Record size {} exceeds maximum representable size of {}", recordSize, Short.MAX_VALUE);
return 0;
}
if (dataBuffer.capacity() - dataBuffer.position() < 3*8) {
dataBuffer.flip();
compressingStream.compress(dataBuffer);
dataBuffer.clear();
}
dataBuffer.putShort((short) recordSize);
dataBuffer.putShort((short) Math.clamp(0, header.documentSize(), Short.MAX_VALUE));
dataBuffer.putInt(header.documentFeatures());
dataBuffer.putLong(header.combinedId());
dataBuffer.putLong(header.documentMeta());
for (int i = 0; i < keywords.length; i++) {
int requiredSize = IndexJournalReader.TERM_HEADER_SIZE_BYTES + positions[i].size();
int requiredSize = IndexJournalReader.TERM_HEADER_SIZE_BYTES + positions[i].bufferSize();
if (dataBuffer.capacity() - dataBuffer.position() < requiredSize) {
dataBuffer.flip();
@ -112,8 +120,8 @@ public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{
}
dataBuffer.putLong(keywords[i]);
dataBuffer.putLong(metadata[i]);
dataBuffer.put((byte) positions[i].size());
dataBuffer.putShort((short) metadata[i]);
dataBuffer.put((byte) positions[i].bufferSize());
dataBuffer.put(positions[i].buffer());
}

View File

@ -1,6 +1,8 @@
package nu.marginalia.index.journal;
import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.longs.LongList;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
@ -8,6 +10,11 @@ import nu.marginalia.index.journal.model.IndexJournalEntryTermData;
import nu.marginalia.index.journal.reader.IndexJournalReaderPagingImpl;
import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.sequence.GammaCodedSequence;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Assertions;
@ -18,8 +25,9 @@ import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.List;
import java.util.*;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import static org.junit.jupiter.api.Assertions.*;
@ -52,7 +60,7 @@ public class IndexJournalWriterTest {
public void testSingleFile() {
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
// Write two documents with two terms each
writer.put(new IndexJournalEntryHeader(11, 22, 33),
writer.put(new IndexJournalEntryHeader(11, 22, 10, 33),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{44, 55},
@ -61,7 +69,7 @@ public class IndexJournalWriterTest {
gcs(2, 4, 6),
})
);
writer.put(new IndexJournalEntryHeader(12, 23, 34),
writer.put(new IndexJournalEntryHeader(12, 23, 11, 34),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{45, 56},
@ -90,6 +98,7 @@ public class IndexJournalWriterTest {
assertEquals(11, ptr.documentId());
assertEquals(22, ptr.documentFeatures());
assertEquals(33, ptr.documentMeta());
assertEquals(10, ptr.documentSize());
iter = ptr.iterator();
@ -116,6 +125,7 @@ public class IndexJournalWriterTest {
assertEquals(12, ptr.documentId());
assertEquals(23, ptr.documentFeatures());
assertEquals(34, ptr.documentMeta());
assertEquals(11, ptr.documentSize());
iter = ptr.iterator();
// Term 1
@ -147,7 +157,7 @@ public class IndexJournalWriterTest {
@Test
public void testMultiFile() {
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
writer.put(new IndexJournalEntryHeader(11, 22, 33),
writer.put(new IndexJournalEntryHeader(11, 22, 10, 33),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{44, 55},
@ -162,7 +172,7 @@ public class IndexJournalWriterTest {
}
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile2)) {
writer.put(new IndexJournalEntryHeader(12, 23, 34),
writer.put(new IndexJournalEntryHeader(12, 23, 11, 34),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{45, 56},
@ -191,6 +201,7 @@ public class IndexJournalWriterTest {
assertEquals(11, ptr.documentId());
assertEquals(22, ptr.documentFeatures());
assertEquals(33, ptr.documentMeta());
assertEquals(10, ptr.documentSize());
iter = ptr.iterator();
@ -217,6 +228,7 @@ public class IndexJournalWriterTest {
assertEquals(12, ptr.documentId());
assertEquals(23, ptr.documentFeatures());
assertEquals(34, ptr.documentMeta());
assertEquals(11, ptr.documentSize());
iter = ptr.iterator();
// Term 1
@ -249,7 +261,7 @@ public class IndexJournalWriterTest {
public void testSingleFileIterTwice() {
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
// Write two documents with two terms each
writer.put(new IndexJournalEntryHeader(11, 22, 33),
writer.put(new IndexJournalEntryHeader(11, 22, 10, 33),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{44, 55},
@ -277,6 +289,7 @@ public class IndexJournalWriterTest {
assertTrue(ptr.nextDocument());
assertEquals(11, ptr.documentId());
assertEquals(22, ptr.documentFeatures());
assertEquals(10, ptr.documentSize());
assertEquals(33, ptr.documentMeta());
iter = ptr.iterator();
@ -307,7 +320,7 @@ public class IndexJournalWriterTest {
public void testFiltered() {
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
// Write two documents with two terms each
writer.put(new IndexJournalEntryHeader(11, 22, 33),
writer.put(new IndexJournalEntryHeader(11, 22, 10, 33),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{44, 55},
@ -316,7 +329,7 @@ public class IndexJournalWriterTest {
gcs(2, 4, 6),
})
);
writer.put(new IndexJournalEntryHeader(12, 23, 34),
writer.put(new IndexJournalEntryHeader(12, 23, 11, 34),
new IndexJournalEntryData(
new String[]{"word1", "word2"},
new long[]{45, 56},
@ -344,6 +357,7 @@ public class IndexJournalWriterTest {
assertEquals(12, ptr.documentId());
assertEquals(23, ptr.documentFeatures());
assertEquals(34, ptr.documentMeta());
assertEquals(11, ptr.documentSize());
iter = ptr.iterator();
// Term 1
@ -364,4 +378,72 @@ public class IndexJournalWriterTest {
}
}
@Test
public void testIntegrationScenario() throws IOException {
Map<Long, Integer> wordMap = new HashMap<>();
for (int i = 0; i < 512; i++) {
wordMap.put(hasher.hashKeyword(Integer.toString(i)), i);
}
try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
for (int idc = 1; idc < 512; idc++) {
int id = idc;
int[] factors = IntStream
.rangeClosed(1, id)
.filter(v -> (id % v) == 0)
.toArray();
System.out.println("id:" + id + " factors: " + Arrays.toString(factors));
long fullId = UrlIdCodec.encodeId((32 - (id % 32)), id);
var header = new IndexJournalEntryHeader(factors.length, 0, 100, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
String[] keywords = IntStream.of(factors).mapToObj(Integer::toString).toArray(String[]::new);
long[] metadata = new long[factors.length];
for (int i = 0; i < factors.length; i++) {
metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
}
GammaCodedSequence[] positions = new GammaCodedSequence[factors.length];
ByteBuffer wa = ByteBuffer.allocate(16);
for (int i = 0; i < factors.length; i++) {
positions[i] = GammaCodedSequence.generate(wa, i + 1);
}
writer.put(header, new IndexJournalEntryData(keywords, metadata, positions));
}
}
try (var ptr = new IndexJournalReaderSingleFile(tempFile).newPointer()) {
while (ptr.nextDocument()) {
int ordinal = UrlIdCodec.getDocumentOrdinal(ptr.documentId());
System.out.println(ordinal);
var expectedFactors =
new LongArrayList(IntStream
.rangeClosed(1, ordinal)
.filter(v -> (ordinal % v) == 0)
.mapToObj(Integer::toString)
.mapToLong(hasher::hashKeyword)
.toArray());
LongList foundIds = new LongArrayList();
var iter = ptr.iterator();
while (iter.hasNext()) {
var termData = iter.next();
foundIds.add(termData.termId());
}
if (!expectedFactors.equals(foundIds)) {
System.out.println("Found: ");
System.out.println(foundIds.stream().map(fac -> wordMap.getOrDefault(fac, -1)).map(Objects::toString).collect(Collectors.joining(",")));
System.out.println("Expected: ");
System.out.println(expectedFactors.stream().map(fac -> wordMap.getOrDefault(fac, -1)).map(Objects::toString).collect(Collectors.joining(",")));
fail();
}
assertEquals(expectedFactors, foundIds);
}
}
}
}

View File

@ -3,6 +3,8 @@ package nu.marginalia.index;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.btree.BTreeReader;
import nu.marginalia.index.positions.TermData;
import nu.marginalia.index.positions.PositionsFileReader;
import nu.marginalia.index.query.EmptyEntrySource;
import nu.marginalia.index.query.EntrySource;
import nu.marginalia.index.query.ReverseIndexRejectFilter;
@ -14,9 +16,9 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.concurrent.Executors;
public class ReverseIndexReader {
@ -27,9 +29,16 @@ public class ReverseIndexReader {
private final BTreeReader wordsBTreeReader;
private final String name;
public ReverseIndexReader(String name, Path words, Path documents) throws IOException {
private final PositionsFileReader positionsFileReader;
public ReverseIndexReader(String name,
Path words,
Path documents,
PositionsFileReader positionsFileReader) throws IOException {
this.name = name;
this.positionsFileReader = positionsFileReader;
if (!Files.exists(words) || !Files.exists(documents)) {
this.words = null;
this.documents = null;
@ -133,31 +142,29 @@ public class ReverseIndexReader {
offset);
}
public long[] getTermMeta(long termId, long[] docIds) {
public TermData[] getTermData(Arena arena,
long termId,
long[] docIds)
{
var ret = new TermData[docIds.length];
long offset = wordOffset(termId);
if (offset < 0) {
// This is likely a bug in the code, but we can't throw an exception here
logger.debug("Missing offset for word {}", termId);
return new long[docIds.length];
return ret;
}
assert isUniqueAndSorted(docIds) : "The input array docIds is assumed to be unique and sorted, was " + Arrays.toString(docIds);
var reader = createReaderNew(offset);
return reader.queryData(docIds, 1);
}
private boolean isUniqueAndSorted(long[] ids) {
if (ids.length == 0)
return true;
// Read the size and offset of the position data
var offsets = reader.queryData(docIds, 1);
for (int i = 1; i < ids.length; i++) {
if(ids[i] <= ids[i-1])
return false;
for (int i = 0; i < docIds.length; i++) {
ret[i] = positionsFileReader.getTermData(arena, offsets[i]);
}
return true;
return ret;
}
public void close() {
@ -166,5 +173,14 @@ public class ReverseIndexReader {
if (words != null)
words.close();
if (positionsFileReader != null) {
try {
positionsFileReader.close();
} catch (IOException e) {
logger.error("Failed to close positions file reader", e);
}
}
}
}

View File

@ -1,5 +1,6 @@
package nu.marginalia.index.construction;
import nu.marginalia.index.positions.PositionCodec;
import nu.marginalia.sequence.GammaCodedSequence;
import java.io.IOException;
@ -38,7 +39,7 @@ public class PositionsFileConstructor implements AutoCloseable {
/** Add a term to the positions file
* @param termMeta the term metadata
* @param positions the positions of the term
* @return the offset of the term in the file
* @return the offset of the term in the file, with the size of the data in the highest byte
*/
public long add(byte termMeta, GammaCodedSequence positions) throws IOException {
synchronized (file) {
@ -53,12 +54,20 @@ public class PositionsFileConstructor implements AutoCloseable {
workBuffer.put(termMeta);
workBuffer.put(positionBuffer);
long ret = PositionCodec.encode(size, offset);
offset += size;
return offset;
return ret;
}
}
public void close() throws IOException {
while (workBuffer.position() < workBuffer.limit()) {
workBuffer.flip();
channel.write(workBuffer);
}
channel.force(false);
channel.close();
}

View File

@ -7,7 +7,6 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.concurrent.atomic.AtomicInteger;

View File

@ -21,12 +21,14 @@ import java.util.concurrent.TimeUnit;
* the associated ReversePreindexWordSegments data
*/
public class ReversePreindexDocuments {
public final LongArray documents;
private static PositionsFileConstructor positionsFileConstructor;
final Path file;
public final LongArray documents;
private static final int RECORD_SIZE_LONGS = 2;
private static final Logger logger = LoggerFactory.getLogger(ReversePreindexDocuments.class);
public final Path file;
public ReversePreindexDocuments(LongArray documents, Path file) {
this.documents = documents;
this.file = file;
@ -70,22 +72,25 @@ public class ReversePreindexDocuments {
long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize();
try (RandomFileAssembler assembly = RandomFileAssembler.create(workDir, fileSizeLongs)) {
try (var assembly = RandomFileAssembler.create(workDir, fileSizeLongs);
var pointer = reader.newPointer())
{
var offsetMap = segments.asMap(RECORD_SIZE_LONGS);
offsetMap.defaultReturnValue(0);
var pointer = reader.newPointer();
while (pointer.nextDocument()) {
long rankEncodedId = docIdRewriter.rewriteDocId(pointer.documentId());
for (var termData : pointer) {
long termId = termData.termId();
long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS);
long posOffset = positionsFileConstructor.add((byte) termData.metadata(), termData.positions());
// write position data to the positions file and get the offset
long encodedPosOffset = positionsFileConstructor.add((byte) termData.metadata(), termData.positions());
assembly.put(offset + 0, rankEncodedId);
assembly.put(offset + 1, posOffset);
assembly.put(offset + 1, encodedPosOffset);
}
}

View File

@ -0,0 +1,25 @@
package nu.marginalia.index.positions;
/** A utility class for encoding and decoding position data offsets,
* the data is encoded by using the highest 16 bits to store the size of the data,
* and the remaining 48 bits to store the offset.
* <p></p>
* This lets us address 256 TB of data, with up to 64 KB of position data for each term,
* which is ample headroom for both the size of the data and the number of positions.
* */
public class PositionCodec {
public static long encode(int length, long offset) {
assert decodeSize(offset) == 0 : "Offset must be less than 2^48";
return (long) length << 48 | offset;
}
public static int decodeSize(long sizeEncodedOffset) {
return (int) ((sizeEncodedOffset & 0xFFFF_0000_0000_0000L) >>> 48);
}
public static long decodeOffset(long sizeEncodedOffset) {
return sizeEncodedOffset & 0x0000_FFFF_FFFF_FFFFL;
}
}
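
A quick round trip through the codec with made-up numbers, showing the size landing in the top 16 bits and the byte offset in the lower 48:

long encoded = PositionCodec.encode(14, 1_000_000L);  // a 14-byte blob at byte offset 1 000 000
// encoded == 0x000E_0000_000F_4240L

int size    = PositionCodec.decodeSize(encoded);    // 14
long offset = PositionCodec.decodeOffset(encoded);  // 1_000_000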

View File

@ -0,0 +1,39 @@
package nu.marginalia.index.positions;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.channels.FileChannel;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
public class PositionsFileReader implements AutoCloseable {
private final FileChannel positions;
public PositionsFileReader(Path positionsFile) throws IOException {
this.positions = FileChannel.open(positionsFile, StandardOpenOption.READ);
}
/** Get the positions for a term in the index, as pointed out by the encoded offset;
* intermediate buffers are allocated from the provided arena allocator. */
public TermData getTermData(Arena arena, long sizeEncodedOffset) {
int length = PositionCodec.decodeSize(sizeEncodedOffset);
long offset = PositionCodec.decodeOffset(sizeEncodedOffset);
var segment = arena.allocate(length);
var buffer = segment.asByteBuffer();
try {
positions.read(buffer, offset);
} catch (IOException e) {
throw new RuntimeException(e);
}
return new TermData(buffer);
}
@Override
public void close() throws IOException {
positions.close();
}
}

View File

@ -0,0 +1,21 @@
package nu.marginalia.index.positions;
import nu.marginalia.sequence.GammaCodedSequence;
import java.nio.ByteBuffer;
public class TermData {
private final ByteBuffer buffer;
public TermData(ByteBuffer buffer) {
this.buffer = buffer;
}
public byte flags() {
return buffer.get(0);
}
public GammaCodedSequence positions() {
return new GammaCodedSequence(buffer, 1, buffer.capacity());
}
}

View File

@ -0,0 +1,63 @@
package nu.marginalia.index;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.index.positions.TermData;
import nu.marginalia.index.positions.PositionsFileReader;
import nu.marginalia.sequence.GammaCodedSequence;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import static org.junit.jupiter.api.Assertions.*;
class PositionsFileReaderTest {
Path file;
@BeforeEach
void setUp() throws IOException {
file = Files.createTempFile("positions", "dat");
}
@AfterEach
void tearDown() throws IOException {
Files.delete(file);
}
@Test
void getTermData() throws IOException {
ByteBuffer workArea = ByteBuffer.allocate(8192);
long key1, key2, key3;
try (PositionsFileConstructor constructor = new PositionsFileConstructor(file)) {
key1 = constructor.add((byte) 43, GammaCodedSequence.generate(workArea, 1, 2, 3));
key2 = constructor.add((byte) 51, GammaCodedSequence.generate(workArea, 2, 3, 5, 1000, 5000, 20241));
key3 = constructor.add((byte) 61, GammaCodedSequence.generate(workArea, 3, 5, 7));
}
System.out.println("key1: " + Long.toHexString(key1));
System.out.println("key2: " + Long.toHexString(key2));
System.out.println("key3: " + Long.toHexString(key3));
try (Arena arena = Arena.ofConfined();
PositionsFileReader reader = new PositionsFileReader(file))
{
TermData data1 = reader.getTermData(arena, key1);
assertEquals(43, data1.flags());
assertEquals(IntList.of( 1, 2, 3), data1.positions().values());
TermData data2 = reader.getTermData(arena, key2);
assertEquals(51, data2.flags());
assertEquals(IntList.of(2, 3, 5, 1000, 5000, 20241), data2.positions().values());
TermData data3 = reader.getTermData(arena, key3);
assertEquals(61, data3.flags());
assertEquals(IntList.of(3, 5, 7), data3.positions().values());
}
}
}

View File

@ -1,17 +1,19 @@
package nu.marginalia.index;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.PositionsFileConstructor;
import nu.marginalia.index.construction.ReversePreindex;
import nu.marginalia.index.construction.TestJournalFactory;
import nu.marginalia.index.construction.TestJournalFactory.EntryDataWithWordMeta;
import nu.marginalia.index.positions.PositionsFileReader;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
@ -47,13 +49,18 @@ class ReverseIndexReaderTest {
public void testSimple() throws IOException {
var indexReader = createIndex(
new EntryDataWithWordMeta(100, 101, wm(50, 51))
new EntryDataWithWordMeta(100, 101, wm(50, 51, 1, 3, 5))
);
assertEquals(1, indexReader.numDocuments(50));
long[] meta = indexReader.getTermMeta(50, new long[] { 100 });
assertArrayEquals(new long[] { 51 }, meta);
var positions = indexReader.getTermData(Arena.global(), 50, new long[] { 100 });
assertEquals(1, positions.length);
assertNotNull(positions[0]);
assertEquals((byte) 51, positions[0].flags());
assertEquals(IntList.of(1, 3, 5), positions[0].positions().values());
assertArrayEquals(new long[] { 100 }, readEntries(indexReader, 50));
}
@ -69,13 +76,8 @@ class ReverseIndexReaderTest {
assertEquals(2, indexReader.numDocuments(51));
assertEquals(1, indexReader.numDocuments(52));
assertArrayEquals(new long[] { 51 }, indexReader.getTermMeta(50, new long[] { 100 }));
assertArrayEquals(new long[] { 100 }, readEntries(indexReader, 50));
assertArrayEquals(new long[] { 52, 53 }, indexReader.getTermMeta(51, new long[] { 100, 101 }));
assertArrayEquals(new long[] { 100, 101 }, readEntries(indexReader, 51));
assertArrayEquals(new long[] { 54 }, indexReader.getTermMeta(52, new long[] { 101 }));
assertArrayEquals(new long[] { 101 }, readEntries(indexReader, 52));
}
@ -91,18 +93,20 @@ class ReverseIndexReaderTest {
private ReverseIndexReader createIndex(EntryDataWithWordMeta... scenario) throws IOException {
var reader = journalFactory.createReader(scenario);
var preindex = ReversePreindex.constructPreindex(reader,
Mockito.mock(PositionsFileConstructor.class),
DocIdRewriter.identity(), tempDir);
Path posFile = tempDir.resolve("positions.dat");
Path docsFile = tempDir.resolve("docs.dat");
Path wordsFile = tempDir.resolve("words.dat");
preindex.finalizeIndex(docsFile, wordsFile);
preindex.delete();
try (var positionsFileConstructor = new PositionsFileConstructor(posFile)) {
var preindex = ReversePreindex.constructPreindex(reader,
positionsFileConstructor,
DocIdRewriter.identity(), tempDir);
preindex.finalizeIndex(docsFile, wordsFile);
preindex.delete();
}
return new ReverseIndexReader("test", wordsFile, docsFile);
return new ReverseIndexReader("test", wordsFile, docsFile, new PositionsFileReader(posFile));
}
}

View File

@ -155,15 +155,15 @@ class ReversePreindexDocsTest {
if (wordId != that.wordId) return false;
if (start != that.start) return false;
if (end != that.end) return false;
return Arrays.equals(data, that.data);
return data[0] == that.data[0]; //Arrays.equals(data, that.data);
}
@Override
public int hashCode() {
int result = (int) (wordId ^ (wordId >>> 32));
result = 31 * result + (int) (start ^ (start >>> 32));
result = 31 * result + (int) (end ^ (end >>> 32));
result = 31 * result + Arrays.hashCode(data);
int result = Long.hashCode(wordId);
result = 31 * result + Long.hashCode(start);
result = 31 * result + Long.hashCode(end);
result = 31 * result + Long.hashCode(data[0]);
return result;
}

View File

@ -79,9 +79,7 @@ class ReversePreindexFinalizeTest {
assertEquals(1, wordsHeader.numEntries());
assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0));
assertEquals(51, docsArray.get(docsHeader.dataOffsetLongs() + 1));
assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs()));
assertEquals(0, wordsArray.get(wordsHeader.dataOffsetLongs() + 1));
}
@ -122,9 +120,7 @@ class ReversePreindexFinalizeTest {
long offset2 = wordsArray.get(wordsHeader.dataOffsetLongs() + 3);
assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs()));
assertEquals(0, wordsArray.get(wordsHeader.dataOffsetLongs() + 1));
assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs()));
assertEquals(0, wordsArray.get(wordsHeader.dataOffsetLongs() + 1));
BTreeHeader docsHeader;
@ -133,13 +129,11 @@ class ReversePreindexFinalizeTest {
assertEquals(1, docsHeader.numEntries());
assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0));
assertEquals(51, docsArray.get(docsHeader.dataOffsetLongs() + 1));
docsHeader = new BTreeHeader(docsArray, offset2);
System.out.println(docsHeader);
assertEquals(1, docsHeader.numEntries());
assertEquals(101, docsArray.get(docsHeader.dataOffsetLongs() + 0));
assertEquals(52, docsArray.get(docsHeader.dataOffsetLongs() + 1));
}
}

View File

@ -8,11 +8,13 @@ import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
import nu.marginalia.sequence.GammaCodedSequence;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
public class TestJournalFactory {
Path tempDir = Files.createTempDirectory("journal");
@ -50,10 +52,10 @@ public class TestJournalFactory {
'}';
}
}
public record WordWithMeta(long wordId, long meta) {}
public record WordWithMeta(long wordId, long meta, GammaCodedSequence gcs) {}
public static WordWithMeta wm(long wordId, long meta) {
return new WordWithMeta(wordId, meta);
public static WordWithMeta wm(long wordId, long meta, int... positions) {
return new WordWithMeta(wordId, meta, GammaCodedSequence.generate(ByteBuffer.allocate(128), positions));
}
IndexJournalReader createReader(EntryData... entries) throws IOException {
@ -71,7 +73,7 @@ public class TestJournalFactory {
positions[i] = new GammaCodedSequence(new byte[1]);
}
writer.put(new IndexJournalEntryHeader(entries.length, 0, entry.docId, entry.docMeta),
writer.put(new IndexJournalEntryHeader(entries.length, 0, 15, entry.docId, entry.docMeta),
new IndexJournalEntryData(termIds, meta, positions));
}
writer.close();
@ -91,10 +93,10 @@ public class TestJournalFactory {
for (int i = 0; i < entry.wordIds.length; i++) {
termIds[i] = entry.wordIds[i].wordId;
meta[i] = entry.wordIds[i].meta;
positions[i] = new GammaCodedSequence(new byte[1]);
positions[i] = Objects.requireNonNullElseGet(entry.wordIds[i].gcs, () -> new GammaCodedSequence(new byte[1]));
}
writer.put(new IndexJournalEntryHeader(entries.length, 0, entry.docId, entry.docMeta),
writer.put(new IndexJournalEntryHeader(entries.length, 0, 15, entry.docId, entry.docMeta),
new IndexJournalEntryData(termIds, meta, positions));
}
writer.close();

View File

@ -4,11 +4,10 @@ import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.IndexLocations;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.positions.PositionsFileReader;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.forward.ForwardIndexReader;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.IOException;
import java.nio.file.Files;
@ -40,17 +39,18 @@ public class IndexFactory {
}
public ReverseIndexReader getReverseIndexReader() throws IOException {
return new ReverseIndexReader("full",
ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.CURRENT),
ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.CURRENT)
ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.CURRENT),
new PositionsFileReader(ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.CURRENT))
);
}
public ReverseIndexReader getReverseIndexPrioReader() throws IOException {
return new ReverseIndexReader("prio",
ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.CURRENT),
ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.CURRENT)
ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.CURRENT),
null
);
}

View File

@ -281,10 +281,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
awaitCompletion();
// Return the best results
return new SearchResultSet(
resultValuator.selectBestResults(parameters,
resultRankingContext,
resultHeap));
return new SearchResultSet(resultValuator.selectBestResults(parameters, resultHeap));
}
/** Wait for all tasks to complete */

View File

@ -14,12 +14,13 @@ import nu.marginalia.index.query.IndexQueryBuilder;
import nu.marginalia.index.query.filter.QueryFilterStepIf;
import nu.marginalia.index.query.limit.SpecificationLimitType;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.results.model.ids.DocMetadataList;
import nu.marginalia.index.results.model.ids.TermMetadataList;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.lang.foreign.Arena;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Collections;
@ -169,8 +170,11 @@ public class CombinedIndexReader {
}
/** Retrieves the term metadata for the specified word for the provided documents */
public DocMetadataList getMetadata(long wordId, CombinedDocIdList docIds) {
return new DocMetadataList(reverseIndexFullReader.getTermMeta(wordId, docIds.array()));
public TermMetadataList getTermMetadata(Arena arena,
long wordId,
CombinedDocIdList docIds)
{
return new TermMetadataList(reverseIndexFullReader.getTermData(arena, wordId, docIds.array()));
}
/** Retrieves the document metadata for the specified document */
@ -186,8 +190,12 @@ public class CombinedIndexReader {
/** Retrieves the HTML features for the specified document */
public int getHtmlFeatures(long docId) {
return forwardIndexReader.getHtmlFeatures(docId);
}
/** Retrieves the size of the specified document */
public int getDocumentSize(long docId) {
return forwardIndexReader.getDocumentSize(docId);
}
/** Close the indexes (this is not done immediately)
* */
public void close() throws InterruptedException {

View File

@ -10,12 +10,13 @@ import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchTermsUtil;
import nu.marginalia.index.results.model.QuerySearchTerms;
import nu.marginalia.index.results.model.TermCoherenceGroupList;
import nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.results.model.ids.TermMetadataList;
import nu.marginalia.index.results.model.ids.TermIdList;
import java.lang.foreign.Arena;
import static nu.marginalia.index.results.model.TermCoherenceGroupList.TermCoherenceGroup;
import static nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds.DocumentsWithMetadata;
public class IndexMetadataService {
private final StatefulIndex statefulIndex;
@ -25,22 +26,19 @@ public class IndexMetadataService {
this.statefulIndex = index;
}
public TermMetadataForCombinedDocumentIds getTermMetadataForDocuments(CombinedDocIdList combinedIdsAll,
TermIdList termIdsList)
public Long2ObjectArrayMap<TermMetadataList>
getTermMetadataForDocuments(Arena arena, CombinedDocIdList combinedIdsAll, TermIdList termIdsList)
{
var currentIndex = statefulIndex.get();
Long2ObjectArrayMap<DocumentsWithMetadata> termdocToMeta =
Long2ObjectArrayMap<TermMetadataList> termdocToMeta =
new Long2ObjectArrayMap<>(termIdsList.size());
for (long termId : termIdsList.array()) {
var metadata = currentIndex.getMetadata(termId, combinedIdsAll);
termdocToMeta.put(termId,
new DocumentsWithMetadata(combinedIdsAll, metadata));
termdocToMeta.put(termId, currentIndex.getTermMetadata(arena, termId, combinedIdsAll));
}
return new TermMetadataForCombinedDocumentIds(termdocToMeta);
return termdocToMeta;
}
public QuerySearchTerms getSearchTerms(CompiledQuery<String> compiledQuery, SearchQuery searchQuery) {

View File

@ -1,25 +1,22 @@
package nu.marginalia.index.results;
import nu.marginalia.api.searchquery.model.compiled.*;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.model.QueryParams;
import nu.marginalia.index.results.model.QuerySearchTerms;
import nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.ranking.results.ResultValuator;
import nu.marginalia.sequence.GammaCodedSequence;
import javax.annotation.Nullable;
import java.util.List;
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.*;
/** This class is responsible for calculating the score of a search result.
* It holds the data required to perform the scoring, as there is strong
@ -28,94 +25,74 @@ public class IndexResultValuationContext {
private final CombinedIndexReader index;
private final QueryParams queryParams;
private final TermMetadataForCombinedDocumentIds termMetadataForCombinedDocumentIds;
private final QuerySearchTerms searchTerms;
private final ResultRankingContext rankingContext;
private final ResultValuator searchResultValuator;
private final CompiledQuery<String> compiledQuery;
private final CompiledQueryLong compiledQueryIds;
public IndexResultValuationContext(IndexMetadataService metadataService,
ResultValuator searchResultValuator,
CombinedDocIdList ids,
public IndexResultValuationContext(ResultValuator searchResultValuator,
StatefulIndex statefulIndex,
ResultRankingContext rankingContext,
SearchParameters params
) {
SearchParameters params)
{
this.index = statefulIndex.get();
this.rankingContext = rankingContext;
this.searchResultValuator = searchResultValuator;
this.queryParams = params.queryParams;
this.compiledQuery = params.compiledQuery;
this.compiledQueryIds = params.compiledQueryIds;
this.searchTerms = metadataService.getSearchTerms(params.compiledQuery, params.query);
this.termMetadataForCombinedDocumentIds = metadataService.getTermMetadataForDocuments(ids,
searchTerms.termIdsAll);
}
private final long flagsFilterMask =
WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit();
private final long flagsFilterMask = WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit();
@Nullable
public SearchResultItem calculatePreliminaryScore(long combinedId) {
public SearchResultItem calculatePreliminaryScore(long combinedId,
QuerySearchTerms searchTerms,
long[] wordFlags,
GammaCodedSequence[] positions)
{
// FIXME: Reconsider coherence logic with the new position data
// if (!searchTerms.coherences.test(termMetadataForCombinedDocumentIds, combinedId))
// return null;
CompiledQuery<GammaCodedSequence> positionsQuery = compiledQuery.root.newQuery(positions);
CompiledQueryLong wordFlagsQuery = compiledQuery.root.newQuery(wordFlags);
int[] counts = new int[compiledQuery.size()];
for (int i = 0; i < counts.length; i++) {
if (positions[i] != null) {
counts[i] = positions[i].valueCount();
}
}
CompiledQueryInt positionsCountQuery = compiledQuery.root.newQuery(counts);
// If the document is not relevant to the query, abort early to reduce allocations and
// avoid unnecessary calculations
if (testRelevance(wordFlagsQuery, positionsCountQuery)) {
return null;
}
long docId = UrlIdCodec.removeRank(combinedId);
if (!searchTerms.coherences.test(termMetadataForCombinedDocumentIds, combinedId))
return null;
long docMetadata = index.getDocumentMetadata(docId);
int htmlFeatures = index.getHtmlFeatures(docId);
SearchResultItem searchResult = new SearchResultItem(docId,
docMetadata,
htmlFeatures,
hasPrioTerm(combinedId));
long[] wordMetas = new long[compiledQuery.size()];
SearchResultKeywordScore[] scores = new SearchResultKeywordScore[compiledQuery.size()];
for (int i = 0; i < wordMetas.length; i++) {
final long termId = compiledQueryIds.at(i);
final String term = compiledQuery.at(i);
wordMetas[i] = termMetadataForCombinedDocumentIds.getTermMetadata(termId, combinedId);
scores[i] = new SearchResultKeywordScore(term, termId, wordMetas[i]);
}
// DANGER: IndexResultValuatorService assumes that searchResult.keywordScores has this specific order, as it needs
// to be able to re-construct its own CompiledQuery<SearchResultKeywordScore> for re-ranking the results. This is
// a very flimsy assumption.
searchResult.keywordScores.addAll(List.of(scores));
CompiledQueryLong wordMetasQuery = new CompiledQueryLong(compiledQuery.root, new CqDataLong(wordMetas));
boolean allSynthetic = CompiledQueryAggregates.booleanAggregate(wordMetasQuery, WordFlags.Synthetic::isPresent);
int flagsCount = CompiledQueryAggregates.intMaxMinAggregate(wordMetasQuery, wordMeta -> Long.bitCount(wordMeta & flagsFilterMask));
int positionsCount = CompiledQueryAggregates.intMaxMinAggregate(wordMetasQuery, wordMeta -> Long.bitCount(WordMetadata.decodePositions(wordMeta)));
if (!meetsQueryStrategyRequirements(wordMetasQuery, queryParams.queryStrategy())) {
return null;
}
if (flagsCount == 0 && !allSynthetic && positionsCount == 0)
return null;
int docSize = index.getDocumentSize(docId);
double score = searchResultValuator.calculateSearchResultValue(
wordMetasQuery,
wordFlagsQuery,
positionsCountQuery,
positionsQuery,
docMetadata,
htmlFeatures,
5000, // use a dummy value here as it's not present in the index
docSize,
rankingContext,
null);
if (searchResult.hasPrioTerm) {
SearchResultItem searchResult = new SearchResultItem(docId,
docMetadata,
htmlFeatures);
if (hasPrioTerm(searchTerms, positions)) {
score = 0.75 * score;
}
@ -124,13 +101,32 @@ public class IndexResultValuationContext {
return searchResult;
}
private boolean hasPrioTerm(long combinedId) {
for (var term : searchTerms.termIdsPrio.array()) {
if (termMetadataForCombinedDocumentIds.hasTermMeta(term, combinedId)) {
private boolean testRelevance(CompiledQueryLong wordFlagsQuery, CompiledQueryInt countsQuery) {
boolean allSynthetic = booleanAggregate(wordFlagsQuery, WordFlags.Synthetic::isPresent);
int flagsCount = intMaxMinAggregate(wordFlagsQuery, flags -> Long.bitCount(flags & flagsFilterMask));
int positionsCount = intMaxMinAggregate(countsQuery, p -> p);
if (!meetsQueryStrategyRequirements(wordFlagsQuery, queryParams.queryStrategy())) {
return true;
}
if (flagsCount == 0 && !allSynthetic && positionsCount == 0) {
return true;
}
return false;
}
private boolean hasPrioTerm(QuerySearchTerms searchTerms, GammaCodedSequence[] positions) {
var allTerms = searchTerms.termIdsAll;
var prioTerms = searchTerms.termIdsPrio;
for (int i = 0; i < allTerms.size(); i++) {
if (positions[i] != null && prioTerms.contains(allTerms.at(i))) {
return true;
}
}
return false;
return false;
}
private boolean meetsQueryStrategyRequirements(CompiledQueryLong queryGraphScores,
@ -142,7 +138,7 @@ public class IndexResultValuationContext {
return true;
}
return CompiledQueryAggregates.booleanAggregate(queryGraphScores,
return booleanAggregate(queryGraphScores,
docs -> meetsQueryStrategyRequirements(docs, queryParams.queryStrategy()));
}

View File

@ -7,8 +7,6 @@ import gnu.trove.list.array.TLongArrayList;
import it.unimi.dsi.fastutil.longs.LongSet;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
@ -21,12 +19,13 @@ import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.linkdb.model.DocdbUrlDetail;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.ranking.results.ResultValuator;
import nu.marginalia.sequence.GammaCodedSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.lang.foreign.Arena;
import java.sql.SQLException;
import java.util.*;
import java.util.function.Consumer;
@Singleton
public class IndexResultValuatorService {
@ -53,35 +52,53 @@ public class IndexResultValuatorService {
ResultRankingContext rankingContext,
CombinedDocIdList resultIds)
{
final var evaluator = createValuationContext(params, rankingContext, resultIds);
IndexResultValuationContext evaluator =
new IndexResultValuationContext(resultValuator, statefulIndex, rankingContext, params);
List<SearchResultItem> results = new ArrayList<>(resultIds.size());
for (long id : resultIds.array()) {
var score = evaluator.calculatePreliminaryScore(id);
if (score != null) {
results.add(score);
try (var arena = Arena.ofConfined()) {
// Batch-fetch the word metadata for the documents
var searchTerms = metadataService.getSearchTerms(params.compiledQuery, params.query);
var termsForDocs = metadataService.getTermMetadataForDocuments(arena, resultIds, searchTerms.termIdsAll);
// Prepare data for the document. We do this outside of the calculation function to avoid
// hash lookups in the inner loop, as it's very hot code and we don't want thrashing in there;
// out here we can rely on implicit array ordering to match up the data.
var ra = resultIds.array();
long[] flags = new long[searchTerms.termIdsAll.size()];
GammaCodedSequence[] positions = new GammaCodedSequence[searchTerms.termIdsAll.size()];
for (int i = 0; i < ra.length; i++) {
long id = ra[i];
// Prepare term-level data for the document
for (int ti = 0; ti < flags.length; ti++) {
long tid = searchTerms.termIdsAll.at(ti);
var tfd = termsForDocs.get(tid);
assert tfd != null : "No term data for term " + ti;
flags[ti] = tfd.flag(i);
positions[ti] = tfd.position(i);
}
// Calculate the preliminary score
var score = evaluator.calculatePreliminaryScore(id, searchTerms, flags, positions);
if (score != null) {
results.add(score);
}
}
return results;
}
return results;
}
private IndexResultValuationContext createValuationContext(SearchParameters params,
ResultRankingContext rankingContext,
CombinedDocIdList resultIds)
{
return new IndexResultValuationContext(metadataService,
resultValuator,
resultIds,
statefulIndex,
rankingContext,
params);
}
public List<DecoratedSearchResultItem> selectBestResults(SearchParameters params,
ResultRankingContext rankingContext,
Collection<SearchResultItem> results) throws SQLException {
var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain);
@ -101,14 +118,13 @@ public class IndexResultValuatorService {
item.resultsFromDomain = domainCountFilter.getCount(item);
}
return decorateAndRerank(resultsList, params.compiledQuery, rankingContext);
return decorateResults(resultsList, params.compiledQuery);
}
/** Decorate the result items with additional information from the link database
* and calculate an updated ranking with the additional information */
public List<DecoratedSearchResultItem> decorateAndRerank(List<SearchResultItem> rawResults,
CompiledQuery<String> compiledQuery,
ResultRankingContext rankingContext)
public List<DecoratedSearchResultItem> decorateResults(List<SearchResultItem> rawResults,
CompiledQuery<String> compiledQuery)
throws SQLException
{
TLongList idsList = new TLongArrayList(rawResults.size());
@ -131,42 +147,18 @@ public class IndexResultValuatorService {
continue;
}
// Reconstruct the compiledquery for re-valuation
//
// CAVEAT: This hinges on a very fragile assumption that IndexResultValuationContext puts them in the same
// order as the data for the CompiledQuery<String>.
long[] wordMetas = new long[compiledQuery.size()];
for (int i = 0; i < compiledQuery.size(); i++) {
var score = result.keywordScores.get(i);
wordMetas[i] = score.encodedWordMetadata();
}
CompiledQueryLong metaQuery = new CompiledQueryLong(compiledQuery.root, new CqDataLong(wordMetas));
resultItems.add(createCombinedItem(
result,
docData,
metaQuery,
rankingContext));
docData));
}
return resultItems;
}
private DecoratedSearchResultItem createCombinedItem(SearchResultItem result,
DocdbUrlDetail docData,
CompiledQueryLong wordMetas,
ResultRankingContext rankingContext) {
DocdbUrlDetail docData) {
ResultRankingDetailsExtractor detailsExtractor = new ResultRankingDetailsExtractor();
Consumer<ResultRankingDetails> detailConsumer = rankingContext.params.exportDebugData ? detailsExtractor::set : null;
double score = resultValuator.calculateSearchResultValue(wordMetas,
result.encodedDocMetadata,
result.htmlFeatures,
docData.wordsTotal(),
rankingContext,
detailConsumer);
// Consumer<ResultRankingDetails> detailConsumer = rankingContext.params.exportDebugData ? detailsExtractor::set : null;
return new DecoratedSearchResultItem(
result,
@ -179,8 +171,8 @@ public class IndexResultValuatorService {
docData.pubYear(),
docData.dataHash(),
docData.wordsTotal(),
bestPositions(wordMetas),
score,
0L, //bestPositions(wordMetas),
result.getScore(),
detailsExtractor.get()
);
}

View File

@ -1,26 +1,38 @@
package nu.marginalia.index.results.model;
import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
import it.unimi.dsi.fastutil.longs.Long2ObjectArrayMap;
import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap;
import nu.marginalia.index.positions.TermData;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.index.results.model.ids.DocMetadataList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import nu.marginalia.index.results.model.ids.TermMetadataList;
import nu.marginalia.sequence.GammaCodedSequence;
import javax.annotation.Nullable;
public class TermMetadataForCombinedDocumentIds {
private static final Logger logger = LoggerFactory.getLogger(TermMetadataForCombinedDocumentIds.class);
private final Long2ObjectArrayMap<DocumentsWithMetadata> termdocToMeta;
public TermMetadataForCombinedDocumentIds(Long2ObjectArrayMap<DocumentsWithMetadata> termdocToMeta) {
this.termdocToMeta = termdocToMeta;
}
public long getTermMetadata(long termId, long combinedId) {
public byte getTermMetadata(long termId, long combinedId) {
var metaByCombinedId = termdocToMeta.get(termId);
if (metaByCombinedId == null) {
return 0;
}
return metaByCombinedId.get(combinedId);
return metaByCombinedId.get(combinedId).flags();
}
@Nullable
public GammaCodedSequence getPositions(long termId, long combinedId) {
var metaByCombinedId = termdocToMeta.get(termId);
if (metaByCombinedId == null) {
return null;
}
return metaByCombinedId.get(combinedId).positions();
}
public boolean hasTermMeta(long termId, long combinedId) {
@ -30,16 +42,25 @@ public class TermMetadataForCombinedDocumentIds {
return false;
}
return metaByCombinedId.get(combinedId) != 0;
return metaByCombinedId.data().containsKey(combinedId);
}
public record DocumentsWithMetadata(Long2LongOpenHashMap data) {
public DocumentsWithMetadata(CombinedDocIdList combinedDocIdsAll, DocMetadataList metadata) {
this(new Long2LongOpenHashMap(combinedDocIdsAll.array(), metadata.array()));
public record DocumentsWithMetadata(Long2ObjectOpenHashMap<TermData> data) {
public DocumentsWithMetadata(CombinedDocIdList combinedDocIdsAll, TermMetadataList metadata) {
this(new Long2ObjectOpenHashMap<>(combinedDocIdsAll.size()));
long[] ids = combinedDocIdsAll.array();
TermData[] data = metadata.array();
for (int i = 0; i < combinedDocIdsAll.size(); i++) {
if (data[i] != null) {
this.data.put(ids[i], data[i]);
}
}
}
public long get(long combinedId) {
return data.getOrDefault(combinedId, 0);
public TermData get(long combinedId) {
return data.get(combinedId);
}
}
}
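To illustrate how the reworked per-document term data is read, a minimal sketch; the helper method and variable names are illustrative, only the accessors added above are assumed:

import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds;
import nu.marginalia.sequence.GammaCodedSequence;

class TermLookupSketch {
    // Illustrative: 'termMeta' is built per query, 'termId' is a keyword hash and
    // 'combinedId' a ranking-encoded document id.
    static void inspect(TermMetadataForCombinedDocumentIds termMeta, long termId, long combinedId) {
        byte flags = termMeta.getTermMetadata(termId, combinedId);        // 0 when the term is absent
        GammaCodedSequence positions = termMeta.getPositions(termId, combinedId);
        if (positions != null) {
            IntList wordOffsets = positions.values();                     // decoded word positions
            System.out.println("flags=" + flags + " positions=" + wordOffsets);
        }
    }
}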

View File

@ -15,6 +15,10 @@ import java.util.stream.LongStream;
public final class CombinedDocIdList {
private final long[] data;
public CombinedDocIdList(long... data) {
this.data = Arrays.copyOf(data, data.length);
}
public CombinedDocIdList(LongArrayList data) {
this.data = data.toLongArray();
}

View File

@ -1,45 +0,0 @@
package nu.marginalia.index.results.model.ids;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import java.util.Arrays;
import java.util.Objects;
import java.util.stream.LongStream;
public final class DocMetadataList {
private final long[] array;
public DocMetadataList(long[] array) {
this.array = array;
}
public DocMetadataList(LongArrayList list) {
this(list.toLongArray());
}
public int size() {
return array.length;
}
public LongStream stream() {
return LongStream.of(array);
}
public long[] array() {
return array;
}
@Override
public boolean equals(Object obj) {
if (obj == this) return true;
if (obj == null || obj.getClass() != this.getClass()) return false;
var that = (DocMetadataList) obj;
return Arrays.equals(this.array, that.array);
}
@Override
public int hashCode() {
return Arrays.hashCode(array);
}
}

View File

@ -11,6 +11,7 @@ public final class TermIdList {
public TermIdList(long[] array) {
this.array = array;
Arrays.sort(this.array);
}
public TermIdList(LongArrayList list) {
@ -29,6 +30,15 @@ public final class TermIdList {
return array;
}
public long at(int i) {
return array[i];
}
public boolean contains(long id) {
// Implicitly sorted
return Arrays.binarySearch(array, id) >= 0;
}
@Override
public boolean equals(Object obj) {
if (obj == this) return true;
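Because the constructor now sorts the backing array, contains() can rely on binary search; a tiny standalone sketch of that invariant:

import java.util.Arrays;

class SortedIdLookupSketch {
    public static void main(String[] args) {
        long[] termIds = { 42L, 7L, 99L };
        Arrays.sort(termIds);            // done once, in the TermIdList constructor
        // membership checks then cost O(log n) instead of a linear scan
        System.out.println(Arrays.binarySearch(termIds, 42L) >= 0);  // true
        System.out.println(Arrays.binarySearch(termIds, 13L) >= 0);  // false
    }
}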

View File

@ -0,0 +1,55 @@
package nu.marginalia.index.results.model.ids;
import nu.marginalia.index.positions.TermData;
import nu.marginalia.sequence.GammaCodedSequence;
import javax.annotation.Nullable;
import java.util.Arrays;
public final class TermMetadataList {
private final TermData[] array;
public TermMetadataList(TermData[] array) {
this.array = array;
}
public int size() {
return array.length;
}
public long flag(int i) {
if (array[i] == null)
return 0;
return array[i].flags();
}
/** Returns the position data for the given document index;
* may be null if the term is not in the document
*/
@Nullable
public GammaCodedSequence position(int i) {
if (array[i] == null)
return null;
return array[i].positions();
}
public TermData[] array() {
return array;
}
@Override
public boolean equals(Object obj) {
if (obj == this) return true;
if (obj == null || obj.getClass() != this.getClass()) return false;
var that = (TermMetadataList) obj;
return Arrays.equals(this.array, that.array);
}
@Override
public int hashCode() {
return Arrays.hashCode(array);
}
}
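A small usage sketch for the new class, assuming only the accessors defined above; index i refers to the i-th document of the CombinedDocIdList the list was built against:

import nu.marginalia.index.results.model.ids.TermMetadataList;
import nu.marginalia.sequence.GammaCodedSequence;

class TermMetadataListUsageSketch {
    // Illustrative: walk one term's metadata across a batch of documents.
    static void dump(TermMetadataList list) {
        for (int i = 0; i < list.size(); i++) {
            long flags = list.flag(i);                       // 0 when the term is absent from document i
            GammaCodedSequence positions = list.position(i); // null when the term is absent
            if (positions != null) {
                System.out.println("doc #" + i + " flags=" + flags + " positions=" + positions.values());
            }
        }
    }
}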

View File

@ -1,5 +1,7 @@
package nu.marginalia.ranking.results;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
@ -14,6 +16,7 @@ import nu.marginalia.ranking.results.factors.*;
import com.google.inject.Inject;
import com.google.inject.Singleton;
import nu.marginalia.sequence.GammaCodedSequence;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -33,15 +36,15 @@ public class ResultValuator {
this.termCoherenceFactor = termCoherenceFactor;
}
public double calculateSearchResultValue(CompiledQueryLong wordMeta,
long documentMetadata,
public double calculateSearchResultValue(CompiledQueryLong wordFlagsQuery,
CompiledQueryInt positionsCountQuery, CompiledQuery<GammaCodedSequence> positionsQuery, long documentMetadata,
int features,
int length,
ResultRankingContext ctx,
@Nullable Consumer<ResultRankingDetails> detailsConsumer
)
{
if (wordMeta.isEmpty())
if (wordFlagsQuery.isEmpty())
return Double.MAX_VALUE;
if (length < 0) {
@ -82,12 +85,11 @@ public class ResultValuator {
+ temporalBias
+ flagsPenalty;
double tcfOverlap = rankingParams.tcfOverlapWeight * termCoherenceFactor.calculateOverlap(wordMeta);
double tcfJaccard = rankingParams.tcfJaccardWeight * termCoherenceFactor.calculateAvgMutualJaccard(wordMeta, ctx);
// FIXME: need a weighting factor here
double tcfAvgDist = 25. / termCoherenceFactor.calculateAvgMinDistance(positionsQuery, ctx);
double bM25F = rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forRegular(rankingParams.fullParams, wordMeta.data, length, ctx));
double bM25N = rankingParams.bm25NgramWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forNgrams(rankingParams.fullParams, wordMeta.data, length, ctx));
double bM25P = rankingParams.bm25PrioWeight * wordMeta.root.visit(new Bm25PrioGraphVisitor(rankingParams.prioParams, wordMeta.data, ctx));
double bM25F = rankingParams.bm25FullWeight * wordFlagsQuery.root.visit(new Bm25FullGraphVisitor(rankingParams.fullParams, positionsCountQuery.data, length, ctx));
double bM25P = rankingParams.bm25PrioWeight * wordFlagsQuery.root.visit(new Bm25PrioGraphVisitor(rankingParams.prioParams, wordFlagsQuery.data, ctx));
double overallPartPositive = Math.max(0, overallPart);
double overallPartNegative = -Math.min(0, overallPart);
@ -112,10 +114,10 @@ public class ResultValuator {
temporalBias,
flagsPenalty,
overallPart,
tcfOverlap,
tcfJaccard,
0,
0,
bM25F,
bM25N,
0, // FIXME: Remove from model
bM25P)
);
@ -125,8 +127,8 @@ public class ResultValuator {
// Renormalize to 0...15, where 0 is the best possible score;
// this is a historical artifact of the original ranking function
double ret = normalize(
tcfOverlap + tcfJaccard
+ bM25F + bM25P + bM25N
tcfAvgDist
+ bM25F + bM25P
+ overallPartPositive,
overallPartNegative);
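To make the provisional weighting concrete (the FIXME above flags the constant 25 as a placeholder): an average minimum term distance of 5 words contributes 25/5 = 5.0 to the normalized sum, a distance of 25 contributes 1.0, and when calculateAvgMinDistance falls back to its 1000-word default the contribution shrinks to 0.025.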

View File

@ -13,7 +13,7 @@ import java.util.List;
public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor {
private static final long AVG_LENGTH = 5000;
private final CqDataLong wordMetaData;
private final CqDataInt counts;
private final CqDataInt frequencies;
private final Bm25Parameters bm25Parameters;
@ -22,31 +22,16 @@ public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor {
private final BitSet mask;
private Bm25FullGraphVisitor(Bm25Parameters bm25Parameters,
CqDataLong wordMetaData,
public Bm25FullGraphVisitor(Bm25Parameters bm25Parameters,
CqDataInt counts,
int length,
BitSet mask,
ResultRankingContext ctx) {
this.length = length;
this.bm25Parameters = bm25Parameters;
this.docCount = ctx.termFreqDocCount();
this.wordMetaData = wordMetaData;
this.counts = counts;
this.frequencies = ctx.fullCounts;
this.mask = mask;
}
public static Bm25FullGraphVisitor forRegular(Bm25Parameters bm25Parameters,
CqDataLong wordMetaData,
int length,
ResultRankingContext ctx) {
return new Bm25FullGraphVisitor(bm25Parameters, wordMetaData, length, ctx.regularMask, ctx);
}
public static Bm25FullGraphVisitor forNgrams(Bm25Parameters bm25Parameters,
CqDataLong wordMetaData,
int length,
ResultRankingContext ctx) {
return new Bm25FullGraphVisitor(bm25Parameters, wordMetaData, length, ctx.ngramsMask, ctx);
this.mask = ctx.regularMask;
}
@Override
@ -73,7 +58,7 @@ public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor {
return 0;
}
double count = Long.bitCount(WordMetadata.decodePositions(wordMetaData.get(idx)));
double count = counts.get(idx);
int freq = frequencies.get(idx);
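With the positions data in place, the term count fed into BM25 now comes from counting occurrences rather than bit-counting the packed word metadata. For reference, a sketch of a textbook BM25 term weight over the quantities visible here; this is not necessarily the exact expression inside the visitor, and k1 and b stand in for the Bm25Parameters values:

class Bm25TermWeightSketch {
    // count:    occurrences of the term in the document (from the positions data)
    // freq:     number of documents in the corpus containing the term (ctx.fullCounts)
    // docCount: total documents in the corpus (ctx.termFreqDocCount())
    // length:   document length; 5000 mirrors the AVG_LENGTH constant above
    static double bm25Term(double count, int freq, int docCount, int length, double k1, double b) {
        double idf = Math.log(1.0 + (docCount - freq + 0.5) / (freq + 0.5));
        double tf  = count * (k1 + 1) / (count + k1 * (1 - b + b * length / 5000.0));
        return idf * tf;
    }
}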

View File

@ -1,66 +1,44 @@
package nu.marginalia.ranking.results.factors;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.SequenceOperations;
/** Rewards documents where terms appear frequently within the same sentences
*/
public class TermCoherenceFactor {
/** Calculate a factor that rewards the best total position overlap
* between the terms in the query. This is high when all the terms
* are found in the same sentences.
*/
public double calculateOverlap(CompiledQueryLong wordMetadataQuery) {
if (wordMetadataQuery.size() < 2)
return 0;
long mask = CompiledQueryAggregates.longBitmaskAggregate(wordMetadataQuery,
score -> score >>> WordMetadata.POSITIONS_SHIFT);
return bitsSetFactor(mask);
}
/** Calculate the average minimum distance between the positions of the terms
 * in the query. This is small when the terms are frequently found
 * close to each other in the document.
*/
public double calculateAvgMutualJaccard(CompiledQueryLong wordMetadataQuery, ResultRankingContext ctx) {
public double calculateAvgMinDistance(CompiledQuery<GammaCodedSequence> positions, ResultRankingContext ctx) {
double sum = 0;
int cnt = 0;
for (int i = 0; i < wordMetadataQuery.size(); i++) {
for (int i = 0; i < positions.size(); i++) {
// Skip terms that are not in the regular mask
if (!ctx.regularMask.get(i))
continue;
long imask = WordMetadata.decodePositions(wordMetadataQuery.at(i));
var posi = positions.at(i);
// Skip terms that are not in the document
if (imask == 0L)
if (posi == null)
continue;
for (int j = i + 1; j < wordMetadataQuery.size(); j++) {
for (int j = i + 1; j < positions.size(); j++) {
// Skip terms that are not in the regular mask
if (!ctx.regularMask.get(j))
continue;
long jmask = WordMetadata.decodePositions(wordMetadataQuery.at(j));
var posj = positions.at(j);
// Skip terms that are not in the document
if (jmask == 0L)
if (posj == null)
continue;
long quot = Long.bitCount(imask & jmask);
long rem = Long.bitCount(imask | jmask);
// rem is always > 0 because imask and jmask are not both 0
sum += quot/(double) rem;
int distance = SequenceOperations.minDistance(posi.iterator(), posj.iterator());
sum += distance;
cnt++;
}
}
@ -68,15 +46,8 @@ public class TermCoherenceFactor {
if (cnt > 0) {
return sum / cnt;
} else {
return 0;
return 1000.;
}
}
double bitsSetFactor(long mask) {
final int bitsSetInMask = Long.bitCount(mask);
return Math.pow(bitsSetInMask/(double) WordMetadata.POSITIONS_COUNT, 0.25);
}
}
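As a worked example of the new factor: with term position lists {5, 20} and {7, 40}, the closest pair of occurrences is 5 and 7, giving a pairwise minimum distance of 2; with more terms the result is the mean over all qualifying pairs, and when fewer than two terms contribute positions the method falls back to the 1000-word default above.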

View File

@ -0,0 +1,382 @@
package nu.marginalia.index;
import com.google.inject.Guice;
import com.google.inject.Inject;
import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.longs.LongList;
import nu.marginalia.IndexLocations;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.ReverseIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.index.positions.TermData;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.linkdb.docs.DocumentDbWriter;
import nu.marginalia.linkdb.model.DocdbUrlDetail;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.service.server.Initialization;
import nu.marginalia.storage.FileStorageService;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.parallel.Execution;
import java.io.IOException;
import java.lang.foreign.Arena;
import java.net.URISyntaxException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.*;
import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD;
@Execution(SAME_THREAD)
public class CombinedIndexReaderTest {
@Inject
Initialization initialization;
IndexQueryServiceIntegrationTestModule testModule;
@Inject
StatefulIndex statefulIndex;
@Inject
IndexJournalWriter indexJournalWriter;
@Inject
FileStorageService fileStorageService;
@Inject
DomainRankings domainRankings;
@Inject
ProcessHeartbeat processHeartbeat;
@Inject
DocumentDbReader documentDbReader;
@Inject
IndexFactory indexFactory;
@BeforeEach
public void setUp() throws IOException {
testModule = new IndexQueryServiceIntegrationTestModule();
Guice.createInjector(testModule).injectMembers(this);
initialization.setReady();
}
@AfterEach
public void tearDown() throws IOException {
testModule.cleanUp();
}
private final MockDocumentMeta anyMetadata = new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class)));
@Test
public void testSimpleRetrieval() throws Exception {
new MockData().add(
d(1, 1),
anyMetadata,
w("hello", WordFlags.Title, 33, 55),
w("world", WordFlags.Subjects, 34)
).load();
var reader = indexFactory.getCombinedIndexReader();
var query = reader.findFullWord(kw("hello")).build();
var buffer = new LongQueryBuffer(32);
query.getMoreResults(buffer);
assertEquals(
List.of(d(1, 1)),
decode(buffer)
);
var helloMeta = td(reader, kw("hello"), d(1, 1));
assertEquals(helloMeta.flags(), WordFlags.Title.asBit());
assertEquals(IntList.of(33, 55), helloMeta.positions().values());
var worldMeta = td(reader, kw("world"), d(1, 1));
assertEquals(worldMeta.flags(), WordFlags.Subjects.asBit());
assertEquals(IntList.of(34), worldMeta.positions().values());
}
TermData td(CombinedIndexReader reader, long wordId, MockDataDocument docId) {
return (reader.getTermMetadata(Arena.global(), wordId, new CombinedDocIdList(docId.docId())).array())[0];
}
@Test
public void testUnionRetrieval() throws Exception {
new MockData()
.add(
d(1, 1),
anyMetadata,
w("hello", WordFlags.Title),
w("world", WordFlags.Title)
)
.add(
d(1, 2),
anyMetadata,
w("world", WordFlags.Title)
)
.add(
d(1, 3),
anyMetadata,
w("world", WordFlags.Title)
)
.add(
d(2, 4),
anyMetadata,
w("hello", WordFlags.Title),
w("world", WordFlags.Title)
)
.load();
var reader = indexFactory.getCombinedIndexReader();
var query = reader
.findFullWord(kw("hello"))
.also(kw("world"))
.build();
var buffer = new LongQueryBuffer(32);
query.getMoreResults(buffer);
assertEquals(
List.of(d(1, 1), d(2, 4)),
decode(buffer)
);
}
@Test
public void testNotFilterRetrieval() throws Exception {
new MockData()
.add(
d(1, 1),
anyMetadata,
w("hello", WordFlags.Title),
w("world", WordFlags.Title),
w("goodbye", WordFlags.Title)
)
.add(
d(1, 2),
anyMetadata,
w("world", WordFlags.Title)
)
.add(
d(1, 3),
anyMetadata,
w("world", WordFlags.Title)
)
.add(
d(2, 4),
anyMetadata,
w("hello", WordFlags.Title),
w("world", WordFlags.Title)
)
.load();
var reader = indexFactory.getCombinedIndexReader();
var query = reader.findFullWord(kw("hello"))
.also(kw("world"))
.not(kw("goodbye"))
.build();
var buffer = new LongQueryBuffer(32);
query.getMoreResults(buffer);
assertEquals(
List.of(d(2, 4)),
decode(buffer)
);
}
List<MockDataDocument> decode(LongQueryBuffer buffer) {
List<MockDataDocument> result = new ArrayList<>();
for (int i = 0; i < buffer.size(); i++) {
result.add(new MockDataDocument(buffer.data.get(i)));
}
return result;
}
private MockDataDocument d(int domainId, int ordinal) {
return new MockDataDocument(domainId, ordinal);
}
private void constructIndex() throws IOException {
createForwardIndex();
createFullReverseIndex();
createPrioReverseIndex();
}
private void createFullReverseIndex() throws IOException {
Path outputFileDocs = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.NEXT);
Path outputFileWords = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.NEXT);
Path outputFilePositions = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.NEXT);
Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
Path tmpDir = workDir.resolve("tmp");
if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
var constructor =
new ReverseIndexConstructor(
outputFileDocs,
outputFileWords,
outputFilePositions,
IndexJournalReader::singleFile,
DocIdRewriter.identity(),
tmpDir);
constructor.createReverseIndex(new FakeProcessHeartbeat(), "name", workDir);
}
private void createPrioReverseIndex() throws IOException {
Path outputFileDocs = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.NEXT);
Path outputFileWords = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.NEXT);
Path outputFilePositions = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.POSITIONS, ReverseIndexPrioFileNames.FileVersion.NEXT);
Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
Path tmpDir = workDir.resolve("tmp");
if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);
var constructor = new ReverseIndexConstructor(
outputFileDocs,
outputFileWords,
outputFilePositions,
IndexJournalReader::singleFile,
DocIdRewriter.identity(),
tmpDir);
constructor.createReverseIndex(new FakeProcessHeartbeat(), "name", workDir);
}
private void createForwardIndex() throws IOException {
Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
Path outputFileDocsId = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT);
Path outputFileDocsData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT);
ForwardIndexConverter converter = new ForwardIndexConverter(processHeartbeat,
IndexJournalReader.paging(workDir),
outputFileDocsId,
outputFileDocsData,
domainRankings
);
converter.convert();
}
MurmurHash3_128 hasher = new MurmurHash3_128();
long kw(String s) {
return hasher.hashKeyword(s);
}
class MockData {
private final Map<Long, List<MockDataKeyword>> allData = new HashMap<>();
private final Map<Long, MockDocumentMeta> metaByDoc = new HashMap<>();
public MockData add(MockDataDocument document,
MockDocumentMeta meta,
MockDataKeyword... words)
{
long id = UrlIdCodec.encodeId(document.domainId, document.ordinal);
allData.computeIfAbsent(id, l -> new ArrayList<>()).addAll(List.of(words));
metaByDoc.put(id, meta);
return this;
}
void load() throws IOException, SQLException, URISyntaxException {
allData.forEach((doc, words) -> {
var meta = metaByDoc.get(doc);
var header = new IndexJournalEntryHeader(
doc,
meta.features,
100,
meta.documentMetadata.encode()
);
String[] keywords = words.stream().map(w -> w.keyword).toArray(String[]::new);
long[] metadata = words.stream().map(w -> w.termMetadata).mapToLong(Long::longValue).toArray();
var positions = words.stream().map(w -> w.positions).map(pos -> GammaCodedSequence.generate(ByteBuffer.allocate(1024), pos.toIntArray())).toArray(GammaCodedSequence[]::new);
indexJournalWriter.put(header,
new IndexJournalEntryData(keywords, metadata, positions));
});
var linkdbWriter = new DocumentDbWriter(
IndexLocations.getLinkdbLivePath(fileStorageService).resolve(DOCDB_FILE_NAME)
);
for (Long key : allData.keySet()) {
linkdbWriter.add(new DocdbUrlDetail(
key,
new EdgeUrl("https://www.example.com"),
"test",
"test",
0.,
"HTML5",
0,
null,
0,
5
));
}
linkdbWriter.close();
indexJournalWriter.close();
constructIndex();
documentDbReader.reconnect();
statefulIndex.switchIndex();
}
}
record MockDataDocument(int domainId, int ordinal) {
public MockDataDocument(long encodedId) {
this(UrlIdCodec.getDomainId(encodedId), UrlIdCodec.getDocumentOrdinal(encodedId));
}
public long docId() {
return UrlIdCodec.encodeId(domainId, ordinal);
}
}
record MockDocumentMeta(int features, DocumentMetadata documentMetadata) {}
record MockDataKeyword(String keyword, long termMetadata, IntList positions) {}
MockDataKeyword w(String keyword, WordFlags flags, int... positions) {
return new MockDataKeyword(keyword, new WordMetadata(0L, EnumSet.of(flags)).encode(), IntList.of(positions));
}
}

View File

@ -13,7 +13,6 @@ import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.ReverseIndexConstructor;
import nu.marginalia.index.forward.ForwardIndexConverter;
@ -142,6 +141,53 @@ public class IndexQueryServiceIntegrationSmokeTest {
Assertions.assertArrayEquals(ids, actual);
}
@Test
public void testSimple() throws Exception {
var linkdbWriter = new DocumentDbWriter(
IndexLocations.getLinkdbLivePath(fileStorageService)
.resolve(DOCDB_FILE_NAME)
);
for (int i = 1; i < 512; i++) {
loadData(linkdbWriter, i);
}
linkdbWriter.close();
documentDbReader.reconnect();
indexJournalWriter.close();
constructIndex();
statefulIndex.switchIndex();
var rsp = queryService.justQuery(
SearchSpecification.builder()
.queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
.queryStrategy(QueryStrategy.SENTENCE)
.year(SpecificationLimit.none())
.quality(SpecificationLimit.none())
.size(SpecificationLimit.none())
.rank(SpecificationLimit.none())
.rankingParams(ResultRankingParameters.sensibleDefaults())
.domains(new ArrayList<>())
.searchSetIdentifier("NONE")
.query(
SearchQuery.builder("2")
.include("2")
.build()
).build()
);
int[] idxes = new int[] { 62, 222, 382, 60, 124, 220, 284, 380, 444, 122 };
long[] ids = IntStream.of(idxes).mapToLong(Long::valueOf).toArray();
long[] actual = rsp.results
.stream()
.mapToLong(i -> i.rawIndexResult.getDocumentId())
.map(UrlIdCodec::getDocumentOrdinal)
.toArray();
System.out.println(Arrays.toString(actual));
System.out.println(Arrays.toString(ids));
Assertions.assertArrayEquals(ids, actual);
}
@Test
public void testDomainQuery() throws Exception {
@ -297,7 +343,6 @@ public class IndexQueryServiceIntegrationSmokeTest {
return UrlIdCodec.encodeId((32 - (id % 32)), id);
}
MurmurHash3_128 hasher = new MurmurHash3_128();
@SneakyThrows
public void loadData(DocumentDbWriter ldbw, int id) {
int[] factors = IntStream
@ -305,22 +350,44 @@ public class IndexQueryServiceIntegrationSmokeTest {
.filter(v -> (id % v) == 0)
.toArray();
System.out.println("id:" + id + " factors: " + Arrays.toString(factors));
long fullId = fullId(id);
var header = new IndexJournalEntryHeader(factors.length, 0, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
long[] data = new long[factors.length * 2];
for (int i = 0; i < factors.length; i++) {
data[2 * i] = hasher.hashNearlyASCII(Integer.toString(factors[i]));
data[2 * i + 1] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
}
var header = new IndexJournalEntryHeader(factors.length, 0, 100, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
ldbw.add(new DocdbUrlDetail(
fullId, new EdgeUrl("https://www.example.com/"+id),
"test", "test", 0., "HTML5", 0, null, 0, 10
));
String[] keywords = IntStream.range(0, factors.length).mapToObj(Integer::toString).toArray(String[]::new);
String[] keywords = IntStream.of(factors).mapToObj(Integer::toString).toArray(String[]::new);
long[] metadata = new long[factors.length];
for (int i = 0; i < factors.length; i++) {
metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
}
GammaCodedSequence[] positions = new GammaCodedSequence[factors.length];
ByteBuffer wa = ByteBuffer.allocate(32);
for (int i = 0; i < factors.length; i++) {
positions[i] = GammaCodedSequence.generate(wa, factors);
}
indexJournalWriter.put(header, new IndexJournalEntryData(keywords, metadata, positions));
}
@SneakyThrows
public void loadDataWithDomain(DocumentDbWriter ldbw, int domain, int id) {
int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
long fullId = UrlIdCodec.encodeId(domain, id);
var header = new IndexJournalEntryHeader(factors.length, 0, 100, fullId, DocumentMetadata.defaultValue());
ldbw.add(new DocdbUrlDetail(
fullId, new EdgeUrl("https://www.example.com/"+id),
"test", "test", 0., "HTML5", 0, null, 0, 10
));
String[] keywords = IntStream.of(factors).mapToObj(Integer::toString).toArray(String[]::new);
long[] metadata = new long[factors.length];
for (int i = 0; i < factors.length; i++) {
metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
@ -334,30 +401,4 @@ public class IndexQueryServiceIntegrationSmokeTest {
indexJournalWriter.put(header, new IndexJournalEntryData(keywords, metadata, positions));
}
@SneakyThrows
public void loadDataWithDomain(DocumentDbWriter ldbw, int domain, int id) {
int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
long fullId = UrlIdCodec.encodeId(domain, id);
var header = new IndexJournalEntryHeader(factors.length, 0, fullId, DocumentMetadata.defaultValue());
ldbw.add(new DocdbUrlDetail(
fullId, new EdgeUrl("https://www.example.com/"+id),
"test", "test", 0., "HTML5", 0, null, 0, 10
));
String[] keywords = IntStream.range(0, factors.length).mapToObj(Integer::toString).toArray(String[]::new);
long[] metadata = new long[factors.length];
for (int i = 0; i < factors.length; i++) {
metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
}
GammaCodedSequence[] positions = new GammaCodedSequence[factors.length];
ByteBuffer wa = ByteBuffer.allocate(16);
for (int i = 0; i < factors.length; i++) {
positions[i] = GammaCodedSequence.generate(wa, i);
}
indexJournalWriter.put(header, new IndexJournalEntryData(keywords, metadata, positions));
}
}

View File

@ -565,6 +565,7 @@ public class IndexQueryServiceIntegrationTest {
var header = new IndexJournalEntryHeader(
doc,
meta.features,
100,
meta.documentMetadata.encode()
);

View File

@ -1,100 +0,0 @@
package nu.marginalia.ranking.results;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.ranking.results.factors.*;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;
import java.util.*;
import static org.mockito.Mockito.when;
class ResultValuatorTest {
TermFrequencyDict dict;
ResultValuator valuator;
@BeforeEach
public void setUp() {
dict = Mockito.mock(TermFrequencyDict.class);
when(dict.docCount()).thenReturn(100_000);
valuator = new ResultValuator(
new TermCoherenceFactor()
);
}
CqDataInt frequencyData = new CqDataInt(new int[] { 10 });
CompiledQueryLong titleOnlyLowCountSet = CompiledQuery.just(
new SearchResultKeywordScore("bob", 1,
wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title)))
).mapToLong(SearchResultKeywordScore::encodedWordMetadata);
CompiledQueryLong highCountNoTitleSet = CompiledQuery.just(
new SearchResultKeywordScore("bob", 1,
wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh)))
).mapToLong(SearchResultKeywordScore::encodedWordMetadata);;
CompiledQueryLong highCountSubjectSet = CompiledQuery.just(
new SearchResultKeywordScore("bob", 1,
wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects)))
).mapToLong(SearchResultKeywordScore::encodedWordMetadata);;
@Test
void evaluateTerms() {
when(dict.getTermFreq("bob")).thenReturn(10);
ResultRankingContext context = new ResultRankingContext(100000,
ResultRankingParameters.sensibleDefaults(),
new BitSet(),
new BitSet(),
frequencyData,
frequencyData);
long docMeta = docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class));
int features = 0;
double titleOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, docMeta, features, 10_000, context, null);
double titleLongOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, docMeta, features, 10_000, context, null);
double highCountNoTitle = valuator.calculateSearchResultValue(highCountNoTitleSet, docMeta, features, 10_000, context, null);
double highCountSubject = valuator.calculateSearchResultValue(highCountSubjectSet, docMeta, features, 10_000, context, null);
System.out.println(titleOnlyLowCount);
System.out.println(titleLongOnlyLowCount);
System.out.println(highCountNoTitle);
System.out.println(highCountSubject);
}
private long docMetadata(int topology,
int year,
int quality,
EnumSet<DocumentFlags> flags) {
return new DocumentMetadata(topology, PubDate.toYearByte(year), quality, flags).encode();
}
private long wordMetadata(Set<Integer> positions, Set<WordFlags> wordFlags) {
long posBits = positions.stream()
.mapToLong(i -> ((1L << i) & 0xFF_FFFF_FFFF_FFFFL))
.reduce((a,b) -> a|b)
.orElse(0L);
return new WordMetadata(posBits, wordFlags).encode();
}
}

View File

@ -1,107 +0,0 @@
package nu.marginalia.ranking.results.factors;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
import nu.marginalia.bbpc.BrailleBlockPunchCards;
import nu.marginalia.model.idx.WordMetadata;
import org.junit.jupiter.api.Test;
import java.util.ArrayList;
import java.util.List;
import static org.junit.jupiter.api.Assertions.*;
class TermCoherenceFactorTest {
TermCoherenceFactor termCoherenceFactor = new TermCoherenceFactor();
@Test
public void testAllBitsSet() {
var allPositionsSet = createSet(
~0L,
~0L
);
long mask = CompiledQueryAggregates.longBitmaskAggregate(
allPositionsSet,
SearchResultKeywordScore::positions
);
assertEquals(1.0, termCoherenceFactor.bitsSetFactor(mask), 0.01);
assertEquals(1.0,
termCoherenceFactor.calculateOverlap(
allPositionsSet.mapToLong(SearchResultKeywordScore::encodedWordMetadata)
)
);
}
@Test
public void testNoBitsSet() {
var allPositionsSet = createSet(
0, 0
);
long mask = CompiledQueryAggregates.longBitmaskAggregate(allPositionsSet, score -> score.positions() & WordMetadata.POSITIONS_MASK);
assertEquals(0, termCoherenceFactor.bitsSetFactor(mask), 0.01);
assertEquals(0, termCoherenceFactor.calculateOverlap(allPositionsSet.mapToLong(SearchResultKeywordScore::encodedWordMetadata)));
}
@Test @SuppressWarnings("unchecked")
public void testLowPosMatches() {
var positions = createSet(
List.of(0, 1, 2, 3), List.of(0, 1, 2, 3)
);
long mask = CompiledQueryAggregates.longBitmaskAggregate(positions, score -> score.positions() & WordMetadata.POSITIONS_MASK);
printMask(mask);
}
@Test @SuppressWarnings("unchecked")
public void testHiPosMatches() {
var positions = createSet(
List.of(55, 54, 53, 52), List.of(55, 54, 53, 52)
);
long mask = CompiledQueryAggregates.longBitmaskAggregate(positions, score -> score.positions() & WordMetadata.POSITIONS_MASK);
printMask(mask);
}
@Test
public void testBitMatchScaling() {
for (int i = 1; i < 48; i++) {
System.out.println(i + ":" + termCoherenceFactor.bitsSetFactor((1L << i) - 1));
}
}
void printMask(long mask) {
System.out.println(BrailleBlockPunchCards.printBits(mask, 48));
}
CompiledQuery<SearchResultKeywordScore> createSet(List<Integer>... maskPositions) {
long[] positions = new long[maskPositions.length];
for (int i = 0; i < maskPositions.length; i++) {
for (long pos : maskPositions[i]) {
positions[i] |= (1L<<pos);
}
}
return createSet(positions);
}
CompiledQuery<SearchResultKeywordScore> createSet(long... positionMasks) {
List<SearchResultKeywordScore> keywords = new ArrayList<>();
for (int i = 0; i < positionMasks.length; i++) {
keywords.add(new SearchResultKeywordScore("", 0,
new WordMetadata(positionMasks[i] & WordMetadata.POSITIONS_MASK, (byte) 0).encode()));
}
return CompiledQuery.just(keywords.toArray(SearchResultKeywordScore[]::new));
}
}

View File

@ -17,12 +17,13 @@ public class EliasGammaCodec implements IntIterator {
private final BitReader reader;
int rem = 0;
private int last = 0;
private int last;
private int next = 0;
private EliasGammaCodec(ByteBuffer buffer) {
private EliasGammaCodec(ByteBuffer buffer, int zero) {
reader = new BitReader(buffer);
last = zero;
int bits = reader.takeWhileZero();
if (!reader.hasMore()) {
@ -33,9 +34,24 @@ public class EliasGammaCodec implements IntIterator {
}
}
public static int readCount(ByteBuffer buffer) {
var reader = new BitReader(buffer);
if (reader.getCurrentValue() > 0) {
int bits = reader.takeWhileZero();
return reader.get(bits);
}
else {
return 0;
}
}
/** Decode a sequence of integers from a ByteBuffer using the Elias Gamma code */
public static IntIterator decode(ByteBuffer buffer) {
return new EliasGammaCodec(buffer);
return new EliasGammaCodec(buffer, 0);
}
public static IntIterator decodeWithOffset(ByteBuffer buffer, int offset) {
return new EliasGammaCodec(buffer, offset);
}
/** Encode a sequence of integers into a ByteBuffer using the Elias Gamma code.

View File

@ -16,6 +16,7 @@ import java.util.StringJoiner;
* */
public class GammaCodedSequence implements BinarySerializable, Iterable<Integer> {
private final ByteBuffer raw;
int startPos = 0;
int startLimit = 0;
@ -43,6 +44,12 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
startLimit = bytes.limit();
}
public GammaCodedSequence(ByteBuffer bytes, int startPos, int startLimit) {
this.raw = bytes;
this.startPos = startPos;
this.startLimit = startLimit;
}
public GammaCodedSequence(byte[] bytes) {
raw = ByteBuffer.allocate(bytes.length);
raw.put(bytes);
@ -72,6 +79,18 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
return EliasGammaCodec.decode(raw);
}
/** Return an iterator over the sequence with a constant offset applied to each value.
* This is useful for comparing sequences with different offsets, and adds zero
* extra cost to the decoding process, which is already based on adding
* relative differences.
* */
public IntIterator offsetIterator(int offset) {
raw.position(startPos);
raw.limit(startLimit);
return EliasGammaCodec.decodeWithOffset(raw, offset);
}
public IntList values() {
var intItr = iterator();
IntArrayList ret = new IntArrayList(8);
@ -81,18 +100,6 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
return ret;
}
/** Decode the sequence into an IntList;
 * this is a somewhat slow operation,
 * iterating over the data directly is more performant */
public IntList decode() {
IntArrayList ret = new IntArrayList(8);
var iter = iterator();
while (iter.hasNext()) {
ret.add(iter.nextInt());
}
return ret;
}
public int hashCode() {
return raw.hashCode();
}
@ -116,7 +123,11 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
return raw;
}
public int size() {
public int bufferSize() {
return raw.capacity();
}
public int valueCount() {
return EliasGammaCodec.readCount(buffer());
}
}
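A small usage sketch of the new accessors, mirroring how the tests in this change call GammaCodedSequence.generate; the buffer size and the literal positions are arbitrary:

import it.unimi.dsi.fastutil.ints.IntIterator;
import nu.marginalia.sequence.GammaCodedSequence;

import java.nio.ByteBuffer;

class GammaCodedSequenceSketch {
    public static void main(String[] args) {
        ByteBuffer workArea = ByteBuffer.allocate(1024);
        GammaCodedSequence seq = GammaCodedSequence.generate(workArea, 3, 5, 8);

        System.out.println(seq.valueCount());   // expected 3, without decoding every value
        System.out.println(seq.values());       // [3, 5, 8]

        // offsetIterator shifts every decoded value by a constant, which helps when
        // aligning two sequences recorded against different base offsets
        IntIterator it = seq.offsetIterator(-2);
        while (it.hasNext()) {
            System.out.print(it.nextInt() + " "); // 1 3 6
        }
    }
}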

View File

@ -0,0 +1,86 @@
package nu.marginalia.sequence;
import it.unimi.dsi.fastutil.ints.IntIterator;
public class SequenceOperations {
/** Return true if the sequences intersect, false otherwise.
* */
public static boolean intersectSequences(IntIterator... sequences) {
if (sequences.length <= 1)
return true;
// Initialize values and find the maximum value
int[] values = new int[sequences.length];
for (int i = 0; i < sequences.length; i++) {
if (sequences[i].hasNext())
values[i] = sequences[i].nextInt();
else
return false;
}
// Intersect the sequences by advancing all values smaller than the maximum seen so far
// until they are equal to the maximum value, or until the end of the sequence is reached
int max = Integer.MIN_VALUE;
int successes = 0;
for (int i = 0; successes < sequences.length; i = (i + 1) % sequences.length)
{
if (values[i] == max) {
successes++;
} else {
successes = 0;
// Discard values until we reach the maximum value seen so far,
// or until the end of the sequence is reached
while (values[i] < max) {
if (sequences[i].hasNext())
values[i] = sequences[i].nextInt();
else
return false;
}
// Update the maximum value, if necessary
max = Math.max(max, values[i]);
}
}
return true;
}
/** Return the minimum word distance between two sequences, or a negative value if either sequence is empty.
* */
public static int minDistance(IntIterator seqA, IntIterator seqB)
{
int minDistance = Integer.MAX_VALUE;
if (!seqA.hasNext() || !seqB.hasNext())
return -1;
int a = seqA.nextInt();
int b = seqB.nextInt();
while (true) {
int distance = Math.abs(a - b);
if (distance < minDistance)
minDistance = distance;
if (a <= b) {
if (seqA.hasNext()) {
a = seqA.nextInt();
} else {
break;
}
} else {
if (seqB.hasNext()) {
b = seqB.nextInt();
} else {
break;
}
}
}
return minDistance;
}
}
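A minimal sketch of minDistance on concrete data; the method advances whichever iterator is behind, so each sequence is traversed once:

import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.SequenceOperations;

import java.nio.ByteBuffer;

class MinDistanceSketch {
    public static void main(String[] args) {
        ByteBuffer workArea = ByteBuffer.allocate(1024);

        GammaCodedSequence a = GammaCodedSequence.generate(workArea, 1, 3, 5);
        GammaCodedSequence b = GammaCodedSequence.generate(workArea, 4, 10);

        // closest pair of positions is 5 and 4, so the minimum distance is 1
        System.out.println(SequenceOperations.minDistance(a.iterator(), b.iterator()));
    }
}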

View File

@ -20,6 +20,10 @@ public class BitReader {
this.currentValue = 0;
}
public long getCurrentValue() {
return currentValue;
}
/** Read the next bit from the buffer */
public boolean getBit() {
if (bitPosition <= 0) {

View File

@ -0,0 +1,75 @@
package nu.marginalia.sequence;
import it.unimi.dsi.fastutil.ints.IntIterator;
import org.junit.jupiter.api.Test;
import java.nio.ByteBuffer;
import static org.junit.jupiter.api.Assertions.*;
class SequenceOperationsTest {
@Test
void intersectSequencesSingle() {
ByteBuffer wa = ByteBuffer.allocate(1024);
GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1);
assertTrue(SequenceOperations.intersectSequences(seq1.iterator()));
}
@Test
void intersectSequencesTrivialMatch() {
ByteBuffer wa = ByteBuffer.allocate(1024);
GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1);
GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 1);
assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator()));
}
@Test
void intersectSequencesTrivialMismatch() {
ByteBuffer wa = ByteBuffer.allocate(1024);
GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1);
GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2);
assertFalse(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator()));
}
@Test
void intersectSequencesOffsetMatch() {
ByteBuffer wa = ByteBuffer.allocate(1024);
GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1);
GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 3);
assertTrue(SequenceOperations.intersectSequences(seq1.offsetIterator(0), seq2.offsetIterator(-2)));
}
@Test
void intersectSequencesDeepMatch() {
ByteBuffer wa = ByteBuffer.allocate(1024);
GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1, 3, 4, 7, 8, 9, 11);
GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2, 5, 8, 14);
assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator()));
}
@Test
void intersectSequencesDeepMatch3() {
ByteBuffer wa = ByteBuffer.allocate(1024);
GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1, 3, 4, 7, 8, 9, 11);
GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2, 5, 8, 14);
GammaCodedSequence seq3 = GammaCodedSequence.generate(wa, 1, 5, 8, 9);
assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator(), seq3.iterator()));
}
@Test
void intersectSequencesDeepMismatch() {
ByteBuffer wa = ByteBuffer.allocate(1024);
GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1, 3, 4, 7, 8, 9, 11);
GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2, 5, 14);
assertFalse(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator()));
}
}

View File

@ -26,6 +26,8 @@ public class DocumentRecordKeywordsProjection {
public int htmlFeatures;
public long documentMetadata;
public int length;
public List<String> words;
public TLongList metas;
public List<GammaCodedSequence> positions;
@ -39,13 +41,14 @@ public class DocumentRecordKeywordsProjection {
}
public static Collection<String> requiredColumns() {
return List.of("domain", "ordinal", "htmlFeatures", "word", "wordMeta", "documentMetadata");
return List.of("domain", "ordinal", "htmlFeatures", "word", "wordMeta", "documentMetadata", "length");
}
@SneakyThrows
public DocumentRecordKeywordsProjection add(String heading, Object value) {
switch (heading) {
case "domain" -> domain = (String) value;
case "length" -> length = (Integer) value;
case "ordinal" -> ordinal = (Integer) value;
case "htmlFeatures" -> htmlFeatures = (Integer) value;
case "documentMetadata" -> documentMetadata = (Long) value;

View File

@ -6,12 +6,10 @@ import lombok.SneakyThrows;
import nu.marginalia.IndexLocations;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.storage.FileStorageService;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.keyword.model.DocumentKeywords;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.index.journal.IndexJournalFileNames;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -41,18 +39,11 @@ public class LoaderIndexJournalWriter {
indexWriter = new IndexJournalWriterPagingImpl(indexArea);
}
public void putWords(long combinedId,
int features,
DocumentMetadata metadata,
DocumentKeywords wordSet) {
putWords(combinedId, features, metadata.encode(), wordSet);
}
@SneakyThrows
public void putWords(long combinedId,
int features,
long metadata,
int length,
DocumentKeywords wordSet) {
if (wordSet.isEmpty()) {
@ -65,7 +56,7 @@ public class LoaderIndexJournalWriter {
return;
}
var header = new IndexJournalEntryHeader(combinedId, features, metadata);
var header = new IndexJournalEntryHeader(combinedId, features, length, metadata);
var data = new IndexJournalEntryData(wordSet.keywords, wordSet.metadata, wordSet.positions);
indexWriter.put(header, data);

View File

@ -75,6 +75,7 @@ public class KeywordLoaderService {
writer.putWords(combinedId,
projection.htmlFeatures,
projection.documentMetadata,
projection.length,
words);
}
}

View File

@ -91,7 +91,7 @@ public class SearchServicePaperDoll extends AbstractModule {
long positions)
{
results.add(new DecoratedSearchResultItem(
new SearchResultItem(url.hashCode(), 2, 3, false),
new SearchResultItem(url.hashCode(), 2, 3),
new EdgeUrl(url),
title,
description,