Mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git (synced 2025-02-24 05:18:58 +00:00)
(index) Integrate positions data with indexes WIP
This change integrates the new positions data with the forward and reverse indexes. The ranking code is still only partially re-written.
parent 9f982a0c3d
commit 36160988e2
@@ -5,8 +5,8 @@ import java.util.stream.IntStream;
 
 /** A compiled index service query */
 public class CompiledQueryInt {
-    private final CqExpression root;
-    private final CqDataInt data;
+    public final CqExpression root;
+    public final CqDataInt data;
 
     public CompiledQueryInt(CqExpression root, CqDataInt data) {
         this.root = root;
@@ -26,7 +26,7 @@ public class CompiledQueryInt {
         return IntStream.range(0, data.size());
     }
 
-    public long at(int index) {
+    public int at(int index) {
         return data.get(index);
     }
 
@@ -61,7 +61,8 @@ public class CompiledQueryParser {
 
         String[] cqData = new String[wordIds.size()];
         wordIds.forEach((w, i) -> cqData[i] = w);
-        return new CompiledQuery<>(root, new CqData<>(cqData));
+
+        return root.newQuery(cqData);
 
     }
 
@@ -8,6 +8,18 @@ import java.util.stream.Stream;
  *
  */
 public sealed interface CqExpression {
+    /** Create a new query for the provided data using this expression as the root */
+    default <T> CompiledQuery<T> newQuery(T[] data) {
+        return new CompiledQuery<>(this, data);
+    }
+    /** Create a new query for the provided data using this expression as the root */
+    default CompiledQueryInt newQuery(int[] data) {
+        return new CompiledQueryInt(this, new CqDataInt(data));
+    }
+    /** Create a new query for the provided data using this expression as the root */
+    default CompiledQueryLong newQuery(long[] data) {
+        return new CompiledQueryLong(this, new CqDataLong(data));
+    }
 
     Stream<Word> stream();
 
@@ -2,6 +2,7 @@ package nu.marginalia.api.searchquery.model.compiled.aggregate;
 
 import it.unimi.dsi.fastutil.longs.LongSet;
 import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
+import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt;
 import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
 
 import java.util.ArrayList;
@@ -36,7 +37,10 @@ public class CompiledQueryAggregates {
     public static <T> int intMaxMinAggregate(CompiledQuery<T> query, ToIntFunction<T> operator) {
         return query.root.visit(new CqIntMaxMinOperator(query, operator));
     }
+    /** Apply the operator to each leaf node, then return the highest minimum value found along any path */
+    public static <T> int intMaxMinAggregate(CompiledQueryInt query, IntUnaryOperator operator) {
+        return query.root.visit(new CqIntMaxMinOperator(query, operator));
+    }
     /** Apply the operator to each leaf node, then return the highest minimum value found along any path */
     public static int intMaxMinAggregate(CompiledQueryLong query, LongToIntFunction operator) {
         return query.root.visit(new CqIntMaxMinOperator(query, operator));
@@ -1,6 +1,7 @@
 package nu.marginalia.api.searchquery.model.compiled.aggregate;
 
 import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
+import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt;
 import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
 import nu.marginalia.api.searchquery.model.compiled.CqExpression;
 
@@ -21,7 +22,9 @@ public class CqIntMaxMinOperator implements CqExpression.IntVisitor {
     public CqIntMaxMinOperator(CompiledQueryLong query, LongToIntFunction operator) {
         this.operator = idx -> operator.applyAsInt(query.at(idx));
     }
+    public CqIntMaxMinOperator(CompiledQueryInt query, IntUnaryOperator operator) {
+        this.operator = idx -> operator.applyAsInt(query.at(idx));
+    }
     @Override
     public int onAnd(List<? extends CqExpression> parts) {
         int value = parts.getFirst().visit(this);
@@ -36,6 +36,10 @@ public class SearchQuery {
     @Deprecated // why does this exist?
     private double value = 0;
 
+    public static SearchQueryBuilder builder(String compiledQuery) {
+        return new SearchQueryBuilder(compiledQuery);
+    }
+
     public SearchQuery() {
         this.compiledQuery = "";
         this.searchTermsInclude = new ArrayList<>();
@@ -81,5 +85,45 @@ public class SearchQuery {
         return sb.toString();
     }
 
+    public static class SearchQueryBuilder {
+        private final String compiledQuery;
+        private List<String> searchTermsInclude = new ArrayList<>();
+        private List<String> searchTermsExclude = new ArrayList<>();
+        private List<String> searchTermsAdvice = new ArrayList<>();
+        private List<String> searchTermsPriority = new ArrayList<>();
+        private List<List<String>> searchTermCoherences = new ArrayList<>();
+
+        private SearchQueryBuilder(String compiledQuery) {
+            this.compiledQuery = compiledQuery;
+        }
+
+        public SearchQueryBuilder include(String... terms) {
+            searchTermsInclude.addAll(List.of(terms));
+            return this;
+        }
+
+        public SearchQueryBuilder exclude(String... terms) {
+            searchTermsExclude.addAll(List.of(terms));
+            return this;
+        }
+
+        public SearchQueryBuilder advice(String... terms) {
+            searchTermsAdvice.addAll(List.of(terms));
+            return this;
+        }
+
+        public SearchQueryBuilder priority(String... terms) {
+            searchTermsPriority.addAll(List.of(terms));
+            return this;
+        }
+
+        public SearchQueryBuilder coherences(String... coherences) {
+            searchTermCoherences.add(List.of(coherences));
+            return this;
+        }
+
+        public SearchQuery build() {
+            return new SearchQuery(compiledQuery, searchTermsInclude, searchTermsExclude, searchTermsAdvice, searchTermsPriority, searchTermCoherences);
+        }
+    }
 }
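Note: the builder added above is used fluently; a hypothetical usage sketch follows (the compiled query string and terms are made up for illustration, not part of the commit):

    SearchQuery query = SearchQuery.builder("example compiled query")
            .include("marginalia", "search")
            .exclude("spam")
            .priority("index")
            .build();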
@@ -32,13 +32,11 @@ public class SearchResultItem implements Comparable<SearchResultItem> {
 
     public SearchResultItem(long combinedId,
                             long encodedDocMetadata,
-                            int htmlFeatures,
-                            boolean hasPrioTerm) {
+                            int htmlFeatures) {
         this.combinedId = combinedId;
         this.encodedDocMetadata = encodedDocMetadata;
         this.keywordScores = new ArrayList<>();
         this.htmlFeatures = htmlFeatures;
-        this.hasPrioTerm = hasPrioTerm;
     }
 
 
@@ -83,8 +83,10 @@ public class ForwardIndexConverter {
                     int ranking = domainRankings.getRanking(domainId);
                     long meta = DocumentMetadata.encodeRank(pointer.documentMeta(), ranking);
 
+                    long features = pointer.documentFeatures() | ((long) pointer.documentSize() << 32L);
+
                     docFileData.set(entryOffset + ForwardIndexParameters.METADATA_OFFSET, meta);
-                    docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, pointer.documentFeatures());
+                    docFileData.set(entryOffset + ForwardIndexParameters.FEATURES_OFFSET, features);
                 }
 
                 progress.progress(TaskSteps.FORCE);
@@ -82,9 +82,19 @@ public class ForwardIndexReader {
         long offset = idxForDoc(docId);
         if (offset < 0) return 0;
 
-        return (int) data.get(ENTRY_SIZE * offset + FEATURES_OFFSET);
+        return (int) (data.get(ENTRY_SIZE * offset + FEATURES_OFFSET) & 0xFFFF_FFFFL);
     }
 
+    public int getDocumentSize(long docId) {
+        assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
+
+        long offset = idxForDoc(docId);
+        if (offset < 0) return 0;
+
+        return (int) (data.get(ENTRY_SIZE * offset + FEATURES_OFFSET) >>> 32L);
+    }
+
+
     private int idxForDoc(long docId) {
         assert UrlIdCodec.getRank(docId) == 0 : "Forward Index Reader fed dirty reverse index id";
 
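Note on the packed features/size word above: the converter stores the document features in the low 32 bits and the document size in the high 32 bits of the FEATURES_OFFSET slot, and the two reader accessors unpack that same long. A minimal sketch of the round trip (illustrative only, not part of the diff):

    long packed  = documentFeatures | ((long) documentSize << 32L);
    int features = (int) (packed & 0xFFFF_FFFFL); // what getDocumentFeatures() returns
    int size     = (int) (packed >>> 32L);        // what getDocumentSize() returns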
@@ -79,6 +79,7 @@ class ForwardIndexConverterTest {
             writer.put(
                     new IndexJournalEntryHeader(createId(id, id/20),
                             id%3,
+                            15,
                             (id % 5)),
                     new IndexJournalEntryData(
                             new String[]{},
@@ -17,14 +17,17 @@ import nu.marginalia.model.idx.DocumentMetadata;
  */
 public record IndexJournalEntryHeader(int entrySize,
                                       int documentFeatures,
+                                      int documentSize,
                                       long combinedId,
                                       long documentMeta) {
 
     public IndexJournalEntryHeader(long combinedId,
                                    int documentFeatures,
+                                   int documentSize,
                                    long documentMeta) {
         this(-1,
              documentFeatures,
+             documentSize,
              combinedId,
              documentMeta);
     }
@@ -28,12 +28,17 @@ public class IndexJournalReadEntry implements Iterable<IndexJournalEntryTermData
     public static IndexJournalReadEntry read(DataInputStream inputStream) throws IOException {
 
         final long sizeBlock = inputStream.readLong();
+        final int entrySize = (int) (sizeBlock >>> 48L);
+        final int docSize = (int) ((sizeBlock >>> 32L) & 0xFFFFL);
+        final int docFeatures = (int) (sizeBlock & 0xFFFF_FFFFL);
         final long docId = inputStream.readLong();
         final long meta = inputStream.readLong();
 
 
         var header = new IndexJournalEntryHeader(
-                (int) (sizeBlock >>> 32L),
-                (int) (sizeBlock & 0xFFFF_FFFFL),
+                entrySize,
+                docFeatures,
+                docSize,
                 docId,
                 meta);
 
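Note: per the reader above, the first 8-byte block of a journal document header now carries three fields: the entry size in bits 48-63, the document size in bits 32-47, and the document features in bits 0-31. A sketch of the packing the writer performs (illustrative only, not part of the diff):

    long sizeBlock = ((long) entrySize << 48)
                   | ((long) (docSize & 0xFFFF) << 32)
                   | (docFeatures & 0xFFFF_FFFFL);
    // entrySize   == (int) (sizeBlock >>> 48L)
    // docSize     == (int) ((sizeBlock >>> 32L) & 0xFFFFL)
    // docFeatures == (int) (sizeBlock & 0xFFFF_FFFFL)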
@@ -57,6 +62,10 @@ public class IndexJournalReadEntry implements Iterable<IndexJournalEntryTermData
         return header.documentFeatures();
     }
 
+    public int documentSize() {
+        return header.documentSize();
+    }
+
     public int domainId() {
         return UrlIdCodec.getDomainId(docId());
     }
@@ -88,7 +97,7 @@ class TermDataIterator implements Iterator<IndexJournalEntryTermData> {
     public IndexJournalEntryTermData next() {
         // read the metadata for the term
         long termId = buffer.getLong();
-        long meta = buffer.getLong();
+        long meta = buffer.getShort();
 
         // read the size of the sequence data
         int size = buffer.get() & 0xFF;
@@ -13,7 +13,7 @@ public interface IndexJournalReader {
     int FILE_HEADER_SIZE_BYTES = 8 * FILE_HEADER_SIZE_LONGS;
 
     int DOCUMENT_HEADER_SIZE_BYTES = 24;
-    int TERM_HEADER_SIZE_BYTES = 17;
+    int TERM_HEADER_SIZE_BYTES = 11;
 
     /** Create a reader for a single file. */
     static IndexJournalReader singleFile(Path fileName) throws IOException {
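Note: the new term header size follows from the writer changes later in this diff, where a term record is written as an 8-byte term id, a 2-byte metadata short and a 1-byte positions-size byte, i.e. 8 + 2 + 1 = 11 bytes, down from 8 + 8 + 1 = 17 bytes when the metadata was a full long.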
@@ -97,6 +97,9 @@ class SingleFileJournalPointer implements IndexJournalPointer {
     @Override
     public int documentFeatures() { return entry.documentFeatures(); }
 
+    @Override
+    public int documentSize() { return entry.documentSize(); }
+
     /** Return an iterator over the terms in the current document.
      * This iterator is not valid after calling nextDocument().
      */
@@ -42,6 +42,8 @@ public interface IndexJournalPointer extends Iterable<IndexJournalEntryTermData>
      */
     int documentFeatures();
 
+    int documentSize();
+
     /** Concatenate a number of journal pointers */
     static IndexJournalPointer concatenate(IndexJournalPointer... pointers) {
         if (pointers.length == 1)
@@ -94,6 +96,11 @@ class JoiningJournalPointer implements IndexJournalPointer {
         return pointers[pIndex].documentFeatures();
     }
 
+    @Override
+    public int documentSize() {
+        return pointers[pIndex].documentSize();
+    }
+
     @NotNull
     @Override
     public Iterator<IndexJournalEntryTermData> iterator() {
@@ -146,6 +153,12 @@ class FilteringJournalPointer implements IndexJournalPointer {
         return base.documentFeatures();
     }
 
+
+    @Override
+    public int documentSize() {
+        return base.documentSize();
+    }
+
     @NotNull
     @Override
     public Iterator<IndexJournalEntryTermData> iterator() {
@@ -2,7 +2,6 @@ package nu.marginalia.index.journal.writer;
 
 import nu.marginalia.index.journal.model.IndexJournalEntryData;
 import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
-import nu.marginalia.index.journal.model.IndexJournalEntryTermData;
 
 import java.io.IOException;
 
@@ -81,12 +81,6 @@ public class IndexJournalWriterSingleFileImpl implements IndexJournalWriter{
     public int put(IndexJournalEntryHeader header,
                    IndexJournalEntryData data)
     {
-        if (dataBuffer.capacity() - dataBuffer.position() < 3*8) {
-            dataBuffer.flip();
-            compressingStream.compress(dataBuffer);
-            dataBuffer.clear();
-        }
-
        final long[] keywords = data.termIds();
        final long[] metadata = data.metadata();
        final var positions = data.positions();
@@ -94,16 +88,30 @@
        int recordSize = 0; // document header size is 3 longs
        for (int i = 0; i < keywords.length; i++) {
            // term header size is 2 longs
-           recordSize += IndexJournalReader.TERM_HEADER_SIZE_BYTES + positions[i].size();
+           recordSize += IndexJournalReader.TERM_HEADER_SIZE_BYTES + positions[i].bufferSize();
        }
 
-       dataBuffer.putInt(recordSize);
+       if (recordSize > Short.MAX_VALUE) {
+           // This should never happen, but if it does, we should log it and deal with it in a way that doesn't corrupt the file
+           // (32 KB is *a lot* of data for a single document, larger than the uncompressed HTML of most documents)
+           logger.error("Omitting entry: Record size {} exceeds maximum representable size of {}", recordSize, Short.MAX_VALUE);
+           return 0;
+       }
+
+       if (dataBuffer.capacity() - dataBuffer.position() < 3*8) {
+           dataBuffer.flip();
+           compressingStream.compress(dataBuffer);
+           dataBuffer.clear();
+       }
+
+       dataBuffer.putShort((short) recordSize);
+       dataBuffer.putShort((short) Math.clamp(0, header.documentSize(), Short.MAX_VALUE));
        dataBuffer.putInt(header.documentFeatures());
        dataBuffer.putLong(header.combinedId());
        dataBuffer.putLong(header.documentMeta());
 
        for (int i = 0; i < keywords.length; i++) {
-           int requiredSize = IndexJournalReader.TERM_HEADER_SIZE_BYTES + positions[i].size();
+           int requiredSize = IndexJournalReader.TERM_HEADER_SIZE_BYTES + positions[i].bufferSize();
 
            if (dataBuffer.capacity() - dataBuffer.position() < requiredSize) {
                dataBuffer.flip();
@@ -112,8 +120,8 @@
            }
 
            dataBuffer.putLong(keywords[i]);
-           dataBuffer.putLong(metadata[i]);
-           dataBuffer.put((byte) positions[i].size());
+           dataBuffer.putShort((short) metadata[i]);
+           dataBuffer.put((byte) positions[i].bufferSize());
            dataBuffer.put(positions[i].buffer());
        }
 
@@ -1,6 +1,8 @@
 package nu.marginalia.index.journal;
 
 import it.unimi.dsi.fastutil.ints.IntList;
+import it.unimi.dsi.fastutil.longs.LongArrayList;
+import it.unimi.dsi.fastutil.longs.LongList;
 import nu.marginalia.hash.MurmurHash3_128;
 import nu.marginalia.index.journal.model.IndexJournalEntryData;
 import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
@@ -8,6 +10,11 @@ import nu.marginalia.index.journal.model.IndexJournalEntryTermData;
 import nu.marginalia.index.journal.reader.IndexJournalReaderPagingImpl;
 import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
 import nu.marginalia.index.journal.reader.IndexJournalReaderSingleFile;
+import nu.marginalia.model.EdgeUrl;
+import nu.marginalia.model.id.UrlIdCodec;
+import nu.marginalia.model.idx.DocumentMetadata;
+import nu.marginalia.model.idx.WordFlags;
+import nu.marginalia.model.idx.WordMetadata;
 import nu.marginalia.sequence.GammaCodedSequence;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.Assertions;
@@ -18,8 +25,9 @@ import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.Iterator;
-import java.util.List;
+import java.util.*;
+import java.util.stream.Collectors;
+import java.util.stream.IntStream;
 
 import static org.junit.jupiter.api.Assertions.*;
 
@@ -52,7 +60,7 @@ public class IndexJournalWriterTest {
     public void testSingleFile() {
         try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
             // Write two documents with two terms each
-            writer.put(new IndexJournalEntryHeader(11, 22, 33),
+            writer.put(new IndexJournalEntryHeader(11, 22, 10, 33),
                     new IndexJournalEntryData(
                             new String[]{"word1", "word2"},
                             new long[]{44, 55},
@@ -61,7 +69,7 @@
                             gcs(2, 4, 6),
                     })
             );
-            writer.put(new IndexJournalEntryHeader(12, 23, 34),
+            writer.put(new IndexJournalEntryHeader(12, 23, 11, 34),
                     new IndexJournalEntryData(
                             new String[]{"word1", "word2"},
                             new long[]{45, 56},
@@ -90,6 +98,7 @@
             assertEquals(11, ptr.documentId());
             assertEquals(22, ptr.documentFeatures());
             assertEquals(33, ptr.documentMeta());
+            assertEquals(10, ptr.documentSize());
 
             iter = ptr.iterator();
 
@@ -116,6 +125,7 @@
             assertEquals(12, ptr.documentId());
             assertEquals(23, ptr.documentFeatures());
             assertEquals(34, ptr.documentMeta());
+            assertEquals(11, ptr.documentSize());
 
             iter = ptr.iterator();
             // Term 1
@@ -147,7 +157,7 @@
     @Test
     public void testMultiFile() {
         try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
-            writer.put(new IndexJournalEntryHeader(11, 22, 33),
+            writer.put(new IndexJournalEntryHeader(11, 22, 10, 33),
                     new IndexJournalEntryData(
                             new String[]{"word1", "word2"},
                             new long[]{44, 55},
@@ -162,7 +172,7 @@
         }
 
         try (var writer = new IndexJournalWriterSingleFileImpl(tempFile2)) {
-            writer.put(new IndexJournalEntryHeader(12, 23, 34),
+            writer.put(new IndexJournalEntryHeader(12, 23, 11, 34),
                     new IndexJournalEntryData(
                             new String[]{"word1", "word2"},
                             new long[]{45, 56},
@@ -191,6 +201,7 @@
             assertEquals(11, ptr.documentId());
             assertEquals(22, ptr.documentFeatures());
             assertEquals(33, ptr.documentMeta());
+            assertEquals(10, ptr.documentSize());
 
             iter = ptr.iterator();
 
@@ -217,6 +228,7 @@
             assertEquals(12, ptr.documentId());
             assertEquals(23, ptr.documentFeatures());
             assertEquals(34, ptr.documentMeta());
+            assertEquals(11, ptr.documentSize());
 
             iter = ptr.iterator();
             // Term 1
@@ -249,7 +261,7 @@
     public void testSingleFileIterTwice() {
         try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
             // Write two documents with two terms each
-            writer.put(new IndexJournalEntryHeader(11, 22, 33),
+            writer.put(new IndexJournalEntryHeader(11, 22, 10, 33),
                     new IndexJournalEntryData(
                             new String[]{"word1", "word2"},
                             new long[]{44, 55},
@@ -277,6 +289,7 @@
             assertTrue(ptr.nextDocument());
             assertEquals(11, ptr.documentId());
             assertEquals(22, ptr.documentFeatures());
+            assertEquals(10, ptr.documentSize());
             assertEquals(33, ptr.documentMeta());
 
             iter = ptr.iterator();
@@ -307,7 +320,7 @@
     public void testFiltered() {
         try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
             // Write two documents with two terms each
-            writer.put(new IndexJournalEntryHeader(11, 22, 33),
+            writer.put(new IndexJournalEntryHeader(11, 22, 10, 33),
                     new IndexJournalEntryData(
                             new String[]{"word1", "word2"},
                             new long[]{44, 55},
@@ -316,7 +329,7 @@
                             gcs(2, 4, 6),
                     })
             );
-            writer.put(new IndexJournalEntryHeader(12, 23, 34),
+            writer.put(new IndexJournalEntryHeader(12, 23, 11, 34),
                     new IndexJournalEntryData(
                             new String[]{"word1", "word2"},
                             new long[]{45, 56},
@@ -344,6 +357,7 @@
             assertEquals(12, ptr.documentId());
             assertEquals(23, ptr.documentFeatures());
             assertEquals(34, ptr.documentMeta());
+            assertEquals(11, ptr.documentSize());
 
             iter = ptr.iterator();
             // Term 1
@@ -364,4 +378,72 @@
             }
         }
 
+    @Test
+    public void testIntegrationScenario() throws IOException {
+        Map<Long, Integer> wordMap = new HashMap<>();
+        for (int i = 0; i < 512; i++) {
+            wordMap.put(hasher.hashKeyword(Integer.toString(i)), i);
+        }
+        try (var writer = new IndexJournalWriterSingleFileImpl(tempFile)) {
+            for (int idc = 1; idc < 512; idc++) {
+                int id = idc;
+                int[] factors = IntStream
+                        .rangeClosed(1, id)
+                        .filter(v -> (id % v) == 0)
+                        .toArray();
+
+                System.out.println("id:" + id + " factors: " + Arrays.toString(factors));
+
+                long fullId = UrlIdCodec.encodeId((32 - (id % 32)), id);
+
+                var header = new IndexJournalEntryHeader(factors.length, 0, 100, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
+
+                String[] keywords = IntStream.of(factors).mapToObj(Integer::toString).toArray(String[]::new);
+                long[] metadata = new long[factors.length];
+                for (int i = 0; i < factors.length; i++) {
+                    metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
+                }
+                GammaCodedSequence[] positions = new GammaCodedSequence[factors.length];
+                ByteBuffer wa = ByteBuffer.allocate(16);
+                for (int i = 0; i < factors.length; i++) {
+                    positions[i] = GammaCodedSequence.generate(wa, i + 1);
+                }
+
+                writer.put(header, new IndexJournalEntryData(keywords, metadata, positions));
+            }
+        }
+
+        try (var ptr = new IndexJournalReaderSingleFile(tempFile).newPointer()) {
+            while (ptr.nextDocument()) {
+                int ordinal = UrlIdCodec.getDocumentOrdinal(ptr.documentId());
+                System.out.println(ordinal);
+
+                var expectedFactors =
+                        new LongArrayList(IntStream
+                                .rangeClosed(1, ordinal)
+                                .filter(v -> (ordinal % v) == 0)
+                                .mapToObj(Integer::toString)
+                                .mapToLong(hasher::hashKeyword)
+                                .toArray());
+
+                LongList foundIds = new LongArrayList();
+
+                var iter = ptr.iterator();
+                while (iter.hasNext()) {
+                    var termData = iter.next();
+                    foundIds.add(termData.termId());
+                }
+
+                if (!expectedFactors.equals(foundIds)) {
+                    System.out.println("Found: ");
+                    System.out.println(foundIds.stream().map(fac -> wordMap.getOrDefault(fac, -1)).map(Objects::toString).collect(Collectors.joining(",")));
+                    System.out.println("Expected: ");
+                    System.out.println(expectedFactors.stream().map(fac -> wordMap.getOrDefault(fac, -1)).map(Objects::toString).collect(Collectors.joining(",")));
+                    fail();
+                }
+                assertEquals(expectedFactors, foundIds);
+            }
+        }
+    }
+
 }
|
@ -3,6 +3,8 @@ package nu.marginalia.index;
|
|||||||
import nu.marginalia.array.LongArray;
|
import nu.marginalia.array.LongArray;
|
||||||
import nu.marginalia.array.LongArrayFactory;
|
import nu.marginalia.array.LongArrayFactory;
|
||||||
import nu.marginalia.btree.BTreeReader;
|
import nu.marginalia.btree.BTreeReader;
|
||||||
|
import nu.marginalia.index.positions.TermData;
|
||||||
|
import nu.marginalia.index.positions.PositionsFileReader;
|
||||||
import nu.marginalia.index.query.EmptyEntrySource;
|
import nu.marginalia.index.query.EmptyEntrySource;
|
||||||
import nu.marginalia.index.query.EntrySource;
|
import nu.marginalia.index.query.EntrySource;
|
||||||
import nu.marginalia.index.query.ReverseIndexRejectFilter;
|
import nu.marginalia.index.query.ReverseIndexRejectFilter;
|
||||||
@ -14,9 +16,9 @@ import org.slf4j.Logger;
|
|||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.lang.foreign.Arena;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.concurrent.Executors;
|
import java.util.concurrent.Executors;
|
||||||
|
|
||||||
public class ReverseIndexReader {
|
public class ReverseIndexReader {
|
||||||
@ -27,9 +29,16 @@ public class ReverseIndexReader {
|
|||||||
private final BTreeReader wordsBTreeReader;
|
private final BTreeReader wordsBTreeReader;
|
||||||
private final String name;
|
private final String name;
|
||||||
|
|
||||||
public ReverseIndexReader(String name, Path words, Path documents) throws IOException {
|
private final PositionsFileReader positionsFileReader;
|
||||||
|
|
||||||
|
public ReverseIndexReader(String name,
|
||||||
|
Path words,
|
||||||
|
Path documents,
|
||||||
|
PositionsFileReader positionsFileReader) throws IOException {
|
||||||
this.name = name;
|
this.name = name;
|
||||||
|
|
||||||
|
this.positionsFileReader = positionsFileReader;
|
||||||
|
|
||||||
if (!Files.exists(words) || !Files.exists(documents)) {
|
if (!Files.exists(words) || !Files.exists(documents)) {
|
||||||
this.words = null;
|
this.words = null;
|
||||||
this.documents = null;
|
this.documents = null;
|
||||||
@ -133,31 +142,29 @@ public class ReverseIndexReader {
|
|||||||
offset);
|
offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
public long[] getTermMeta(long termId, long[] docIds) {
|
public TermData[] getTermData(Arena arena,
|
||||||
|
long termId,
|
||||||
|
long[] docIds)
|
||||||
|
{
|
||||||
|
var ret = new TermData[docIds.length];
|
||||||
|
|
||||||
long offset = wordOffset(termId);
|
long offset = wordOffset(termId);
|
||||||
|
|
||||||
if (offset < 0) {
|
if (offset < 0) {
|
||||||
// This is likely a bug in the code, but we can't throw an exception here
|
// This is likely a bug in the code, but we can't throw an exception here
|
||||||
logger.debug("Missing offset for word {}", termId);
|
logger.debug("Missing offset for word {}", termId);
|
||||||
return new long[docIds.length];
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
assert isUniqueAndSorted(docIds) : "The input array docIds is assumed to be unique and sorted, was " + Arrays.toString(docIds);
|
|
||||||
|
|
||||||
var reader = createReaderNew(offset);
|
var reader = createReaderNew(offset);
|
||||||
return reader.queryData(docIds, 1);
|
|
||||||
|
// Read the size and offset of the position data
|
||||||
|
var offsets = reader.queryData(docIds, 1);
|
||||||
|
|
||||||
|
for (int i = 0; i < docIds.length; i++) {
|
||||||
|
ret[i] = positionsFileReader.getTermData(arena, offsets[i]);
|
||||||
}
|
}
|
||||||
|
return ret;
|
||||||
private boolean isUniqueAndSorted(long[] ids) {
|
|
||||||
if (ids.length == 0)
|
|
||||||
return true;
|
|
||||||
|
|
||||||
for (int i = 1; i < ids.length; i++) {
|
|
||||||
if(ids[i] <= ids[i-1])
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public void close() {
|
public void close() {
|
||||||
@ -166,5 +173,14 @@ public class ReverseIndexReader {
|
|||||||
|
|
||||||
if (words != null)
|
if (words != null)
|
||||||
words.close();
|
words.close();
|
||||||
|
|
||||||
|
if (positionsFileReader != null) {
|
||||||
|
try {
|
||||||
|
positionsFileReader.close();
|
||||||
|
} catch (IOException e) {
|
||||||
|
logger.error("Failed to close positions file reader", e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
@ -1,5 +1,6 @@
|
|||||||
package nu.marginalia.index.construction;
|
package nu.marginalia.index.construction;
|
||||||
|
|
||||||
|
import nu.marginalia.index.positions.PositionCodec;
|
||||||
import nu.marginalia.sequence.GammaCodedSequence;
|
import nu.marginalia.sequence.GammaCodedSequence;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
@ -38,7 +39,7 @@ public class PositionsFileConstructor implements AutoCloseable {
|
|||||||
/** Add a term to the positions file
|
/** Add a term to the positions file
|
||||||
* @param termMeta the term metadata
|
* @param termMeta the term metadata
|
||||||
* @param positions the positions of the term
|
* @param positions the positions of the term
|
||||||
* @return the offset of the term in the file
|
* @return the offset of the term in the file, with the size of the data in the highest byte
|
||||||
*/
|
*/
|
||||||
public long add(byte termMeta, GammaCodedSequence positions) throws IOException {
|
public long add(byte termMeta, GammaCodedSequence positions) throws IOException {
|
||||||
synchronized (file) {
|
synchronized (file) {
|
||||||
@ -53,12 +54,20 @@ public class PositionsFileConstructor implements AutoCloseable {
|
|||||||
workBuffer.put(termMeta);
|
workBuffer.put(termMeta);
|
||||||
workBuffer.put(positionBuffer);
|
workBuffer.put(positionBuffer);
|
||||||
|
|
||||||
|
long ret = PositionCodec.encode(size, offset);
|
||||||
|
|
||||||
offset += size;
|
offset += size;
|
||||||
return offset;
|
|
||||||
|
return ret;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void close() throws IOException {
|
public void close() throws IOException {
|
||||||
|
while (workBuffer.position() < workBuffer.limit()) {
|
||||||
|
workBuffer.flip();
|
||||||
|
channel.write(workBuffer);
|
||||||
|
}
|
||||||
|
|
||||||
channel.force(false);
|
channel.force(false);
|
||||||
channel.close();
|
channel.close();
|
||||||
}
|
}
|
||||||
|
@ -7,7 +7,6 @@ import org.slf4j.Logger;
|
|||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
|
|
||||||
|
@ -21,12 +21,14 @@ import java.util.concurrent.TimeUnit;
|
|||||||
* the associated ReversePreindexWordSegments data
|
* the associated ReversePreindexWordSegments data
|
||||||
*/
|
*/
|
||||||
public class ReversePreindexDocuments {
|
public class ReversePreindexDocuments {
|
||||||
private static PositionsFileConstructor positionsFileConstructor;
|
|
||||||
final Path file;
|
|
||||||
public final LongArray documents;
|
public final LongArray documents;
|
||||||
|
|
||||||
|
private static PositionsFileConstructor positionsFileConstructor;
|
||||||
private static final int RECORD_SIZE_LONGS = 2;
|
private static final int RECORD_SIZE_LONGS = 2;
|
||||||
private static final Logger logger = LoggerFactory.getLogger(ReversePreindexDocuments.class);
|
private static final Logger logger = LoggerFactory.getLogger(ReversePreindexDocuments.class);
|
||||||
|
|
||||||
|
public final Path file;
|
||||||
|
|
||||||
public ReversePreindexDocuments(LongArray documents, Path file) {
|
public ReversePreindexDocuments(LongArray documents, Path file) {
|
||||||
this.documents = documents;
|
this.documents = documents;
|
||||||
this.file = file;
|
this.file = file;
|
||||||
@ -70,22 +72,25 @@ public class ReversePreindexDocuments {
|
|||||||
|
|
||||||
long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize();
|
long fileSizeLongs = RECORD_SIZE_LONGS * segments.totalSize();
|
||||||
|
|
||||||
try (RandomFileAssembler assembly = RandomFileAssembler.create(workDir, fileSizeLongs)) {
|
try (var assembly = RandomFileAssembler.create(workDir, fileSizeLongs);
|
||||||
|
var pointer = reader.newPointer())
|
||||||
|
{
|
||||||
|
|
||||||
var offsetMap = segments.asMap(RECORD_SIZE_LONGS);
|
var offsetMap = segments.asMap(RECORD_SIZE_LONGS);
|
||||||
offsetMap.defaultReturnValue(0);
|
offsetMap.defaultReturnValue(0);
|
||||||
|
|
||||||
var pointer = reader.newPointer();
|
|
||||||
while (pointer.nextDocument()) {
|
while (pointer.nextDocument()) {
|
||||||
long rankEncodedId = docIdRewriter.rewriteDocId(pointer.documentId());
|
long rankEncodedId = docIdRewriter.rewriteDocId(pointer.documentId());
|
||||||
for (var termData : pointer) {
|
for (var termData : pointer) {
|
||||||
long termId = termData.termId();
|
long termId = termData.termId();
|
||||||
|
|
||||||
long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS);
|
long offset = offsetMap.addTo(termId, RECORD_SIZE_LONGS);
|
||||||
long posOffset = positionsFileConstructor.add((byte) termData.metadata(), termData.positions());
|
|
||||||
|
// write position data to the positions file and get the offset
|
||||||
|
long encodedPosOffset = positionsFileConstructor.add((byte) termData.metadata(), termData.positions());
|
||||||
|
|
||||||
assembly.put(offset + 0, rankEncodedId);
|
assembly.put(offset + 0, rankEncodedId);
|
||||||
assembly.put(offset + 1, posOffset);
|
assembly.put(offset + 1, encodedPosOffset);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -0,0 +1,25 @@
|
|||||||
|
package nu.marginalia.index.positions;
|
||||||
|
|
||||||
|
/** A utility class for encoding and decoding position data offsets,
|
||||||
|
* the data is encoded by using the highest 16 bits to store the offset,
|
||||||
|
* and the remaining 48 bits to store the size of the data.
|
||||||
|
* <p></p>
|
||||||
|
* This lets us address 256 TB of data, with up to 64 KB of position data for each term,
|
||||||
|
* which is ample headroom for both the size of the data and the number of positions.
|
||||||
|
* */
|
||||||
|
public class PositionCodec {
|
||||||
|
|
||||||
|
public static long encode(int length, long offset) {
|
||||||
|
assert decodeSize(offset) == 0 : "Offset must be less than 2^48";
|
||||||
|
|
||||||
|
return (long) length << 48 | offset;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static int decodeSize(long sizeEncodedOffset) {
|
||||||
|
return (int) ((sizeEncodedOffset & 0xFFFF_0000_0000_0000L) >>> 48);
|
||||||
|
}
|
||||||
|
public static long decodeOffset(long sizeEncodedOffset) {
|
||||||
|
return sizeEncodedOffset & 0x0000_FFFF_FFFF_FFFFL;
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
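Note: a worked example of the codec above, with illustrative values. The code stores the size in the highest 16 bits and the offset in the low 48 bits, while the class comment states it the other way around:

    long key    = PositionCodec.encode(3, 1024L);  // 3 bytes of position data at offset 1024
    // key == (3L << 48) | 1024L
    int  size   = PositionCodec.decodeSize(key);   // 3
    long offset = PositionCodec.decodeOffset(key); // 1024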
@@ -0,0 +1,39 @@
+package nu.marginalia.index.positions;
+
+import java.io.IOException;
+import java.lang.foreign.Arena;
+import java.nio.channels.FileChannel;
+import java.nio.file.Path;
+import java.nio.file.StandardOpenOption;
+
+public class PositionsFileReader implements AutoCloseable {
+    private final FileChannel positions;
+
+    public PositionsFileReader(Path positionsFile) throws IOException {
+        this.positions = FileChannel.open(positionsFile, StandardOpenOption.READ);
+    }
+
+    /** Get the positions for a term in the index, as pointed out by the encoded offset;
+     * intermediate buffers are allocated from the provided arena allocator. */
+    public TermData getTermData(Arena arena, long sizeEncodedOffset) {
+        int length = PositionCodec.decodeSize(sizeEncodedOffset);
+        long offset = PositionCodec.decodeOffset(sizeEncodedOffset);
+
+        var segment = arena.allocate(length);
+        var buffer = segment.asByteBuffer();
+
+        try {
+            positions.read(buffer, offset);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+
+        return new TermData(buffer);
+    }
+
+    @Override
+    public void close() throws IOException {
+        positions.close();
+    }
+
+}
@@ -0,0 +1,21 @@
+package nu.marginalia.index.positions;
+
+import nu.marginalia.sequence.GammaCodedSequence;
+
+import java.nio.ByteBuffer;
+
+public class TermData {
+    private final ByteBuffer buffer;
+
+    public TermData(ByteBuffer buffer) {
+        this.buffer = buffer;
+    }
+
+    public byte flags() {
+        return buffer.get(0);
+    }
+
+    public GammaCodedSequence positions() {
+        return new GammaCodedSequence(buffer, 1, buffer.capacity());
+    }
+}
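Note: the buffer wrapped by TermData is the record written by PositionsFileConstructor, so the assumed layout is:

    // [ 1 byte term flags ][ n bytes gamma-coded positions ]

which is why flags() reads byte 0 and positions() decodes the buffer starting at offset 1.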
@@ -0,0 +1,63 @@
+package nu.marginalia.index;
+
+import it.unimi.dsi.fastutil.ints.IntList;
+import nu.marginalia.index.construction.PositionsFileConstructor;
+import nu.marginalia.index.positions.TermData;
+import nu.marginalia.index.positions.PositionsFileReader;
+import nu.marginalia.sequence.GammaCodedSequence;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.lang.foreign.Arena;
+import java.nio.ByteBuffer;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class PositionsFileReaderTest {
+
+    Path file;
+
+    @BeforeEach
+    void setUp() throws IOException {
+        file = Files.createTempFile("positions", "dat");
+    }
+    @AfterEach
+    void tearDown() throws IOException {
+        Files.delete(file);
+    }
+
+    @Test
+    void getTermData() throws IOException {
+        ByteBuffer workArea = ByteBuffer.allocate(8192);
+        long key1, key2, key3;
+        try (PositionsFileConstructor constructor = new PositionsFileConstructor(file)) {
+            key1 = constructor.add((byte) 43, GammaCodedSequence.generate(workArea, 1, 2, 3));
+            key2 = constructor.add((byte) 51, GammaCodedSequence.generate(workArea, 2, 3, 5, 1000, 5000, 20241));
+            key3 = constructor.add((byte) 61, GammaCodedSequence.generate(workArea, 3, 5, 7));
+        }
+
+        System.out.println("key1: " + Long.toHexString(key1));
+        System.out.println("key2: " + Long.toHexString(key2));
+        System.out.println("key3: " + Long.toHexString(key3));
+
+        try (Arena arena = Arena.ofConfined();
+             PositionsFileReader reader = new PositionsFileReader(file))
+        {
+            TermData data1 = reader.getTermData(arena, key1);
+            assertEquals(43, data1.flags());
+            assertEquals(IntList.of( 1, 2, 3), data1.positions().values());
+
+            TermData data2 = reader.getTermData(arena, key2);
+            assertEquals(51, data2.flags());
+            assertEquals(IntList.of(2, 3, 5, 1000, 5000, 20241), data2.positions().values());
+
+            TermData data3 = reader.getTermData(arena, key3);
+            assertEquals(61, data3.flags());
+            assertEquals(IntList.of(3, 5, 7), data3.positions().values());
+        }
+    }
+}
@@ -1,17 +1,19 @@
 package nu.marginalia.index;
 
+import it.unimi.dsi.fastutil.ints.IntList;
 import nu.marginalia.array.page.LongQueryBuffer;
 import nu.marginalia.index.construction.DocIdRewriter;
 import nu.marginalia.index.construction.PositionsFileConstructor;
 import nu.marginalia.index.construction.ReversePreindex;
 import nu.marginalia.index.construction.TestJournalFactory;
 import nu.marginalia.index.construction.TestJournalFactory.EntryDataWithWordMeta;
+import nu.marginalia.index.positions.PositionsFileReader;
 import org.junit.jupiter.api.AfterEach;
 import org.junit.jupiter.api.BeforeEach;
 import org.junit.jupiter.api.Test;
-import org.mockito.Mockito;
 
 import java.io.IOException;
+import java.lang.foreign.Arena;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
@@ -47,13 +49,18 @@ class ReverseIndexReaderTest {
     public void testSimple() throws IOException {
 
         var indexReader = createIndex(
-                new EntryDataWithWordMeta(100, 101, wm(50, 51))
+                new EntryDataWithWordMeta(100, 101, wm(50, 51, 1, 3, 5))
         );
 
         assertEquals(1, indexReader.numDocuments(50));
 
-        long[] meta = indexReader.getTermMeta(50, new long[] { 100 });
-        assertArrayEquals(new long[] { 51 }, meta);
+        var positions = indexReader.getTermData(Arena.global(), 50, new long[] { 100 });
+
+        assertEquals(1, positions.length);
+        assertNotNull(positions[0]);
+        assertEquals((byte) 51, positions[0].flags());
+        assertEquals(IntList.of(1, 3, 5), positions[0].positions().values());
+
         assertArrayEquals(new long[] { 100 }, readEntries(indexReader, 50));
     }
 
@@ -69,13 +76,8 @@ class ReverseIndexReaderTest {
         assertEquals(2, indexReader.numDocuments(51));
         assertEquals(1, indexReader.numDocuments(52));
 
-        assertArrayEquals(new long[] { 51 }, indexReader.getTermMeta(50, new long[] { 100 }));
         assertArrayEquals(new long[] { 100 }, readEntries(indexReader, 50));
-
-        assertArrayEquals(new long[] { 52, 53 }, indexReader.getTermMeta(51, new long[] { 100, 101 }));
         assertArrayEquals(new long[] { 100, 101 }, readEntries(indexReader, 51));
-
-        assertArrayEquals(new long[] { 54 }, indexReader.getTermMeta(52, new long[] { 101 }));
         assertArrayEquals(new long[] { 101 }, readEntries(indexReader, 52));
 
     }
@@ -91,18 +93,20 @@ class ReverseIndexReaderTest {
 
     private ReverseIndexReader createIndex(EntryDataWithWordMeta... scenario) throws IOException {
         var reader = journalFactory.createReader(scenario);
-        var preindex = ReversePreindex.constructPreindex(reader,
-                Mockito.mock(PositionsFileConstructor.class),
-                DocIdRewriter.identity(), tempDir);
 
-
+        Path posFile = tempDir.resolve("positions.dat");
         Path docsFile = tempDir.resolve("docs.dat");
         Path wordsFile = tempDir.resolve("words.dat");
 
+        try (var positionsFileConstructor = new PositionsFileConstructor(posFile)) {
+            var preindex = ReversePreindex.constructPreindex(reader,
+                    positionsFileConstructor,
+                    DocIdRewriter.identity(), tempDir);
             preindex.finalizeIndex(docsFile, wordsFile);
             preindex.delete();
+        }
 
-        return new ReverseIndexReader("test", wordsFile, docsFile);
+        return new ReverseIndexReader("test", wordsFile, docsFile, new PositionsFileReader(posFile));
 
     }
 }
@ -155,15 +155,15 @@ class ReversePreindexDocsTest {
|
|||||||
if (wordId != that.wordId) return false;
|
if (wordId != that.wordId) return false;
|
||||||
if (start != that.start) return false;
|
if (start != that.start) return false;
|
||||||
if (end != that.end) return false;
|
if (end != that.end) return false;
|
||||||
return Arrays.equals(data, that.data);
|
return data[0] == that.data[0]; //Arrays.equals(data, that.data);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public int hashCode() {
|
public int hashCode() {
|
||||||
int result = (int) (wordId ^ (wordId >>> 32));
|
int result = Long.hashCode(wordId);
|
||||||
result = 31 * result + (int) (start ^ (start >>> 32));
|
result = 31 * result + Long.hashCode(start);
|
||||||
result = 31 * result + (int) (end ^ (end >>> 32));
|
result = 31 * result + Long.hashCode(end);
|
||||||
result = 31 * result + Arrays.hashCode(data);
|
result = 31 * result + Long.hashCode(data[0]);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -79,9 +79,7 @@ class ReversePreindexFinalizeTest {
|
|||||||
assertEquals(1, wordsHeader.numEntries());
|
assertEquals(1, wordsHeader.numEntries());
|
||||||
|
|
||||||
assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0));
|
assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0));
|
||||||
assertEquals(51, docsArray.get(docsHeader.dataOffsetLongs() + 1));
|
|
||||||
assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs()));
|
assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs()));
|
||||||
assertEquals(0, wordsArray.get(wordsHeader.dataOffsetLongs() + 1));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -122,9 +120,7 @@ class ReversePreindexFinalizeTest {
|
|||||||
long offset2 = wordsArray.get(wordsHeader.dataOffsetLongs() + 3);
|
long offset2 = wordsArray.get(wordsHeader.dataOffsetLongs() + 3);
|
||||||
|
|
||||||
assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs()));
|
assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs()));
|
||||||
assertEquals(0, wordsArray.get(wordsHeader.dataOffsetLongs() + 1));
|
|
||||||
assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs()));
|
assertEquals(50, wordsArray.get(wordsHeader.dataOffsetLongs()));
|
||||||
assertEquals(0, wordsArray.get(wordsHeader.dataOffsetLongs() + 1));
|
|
||||||
|
|
||||||
BTreeHeader docsHeader;
|
BTreeHeader docsHeader;
|
||||||
|
|
||||||
@ -133,13 +129,11 @@ class ReversePreindexFinalizeTest {
|
|||||||
assertEquals(1, docsHeader.numEntries());
|
assertEquals(1, docsHeader.numEntries());
|
||||||
|
|
||||||
assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0));
|
assertEquals(100, docsArray.get(docsHeader.dataOffsetLongs() + 0));
|
||||||
assertEquals(51, docsArray.get(docsHeader.dataOffsetLongs() + 1));
|
|
||||||
|
|
||||||
docsHeader = new BTreeHeader(docsArray, offset2);
|
docsHeader = new BTreeHeader(docsArray, offset2);
|
||||||
System.out.println(docsHeader);
|
System.out.println(docsHeader);
|
||||||
assertEquals(1, docsHeader.numEntries());
|
assertEquals(1, docsHeader.numEntries());
|
||||||
|
|
||||||
assertEquals(101, docsArray.get(docsHeader.dataOffsetLongs() + 0));
|
assertEquals(101, docsArray.get(docsHeader.dataOffsetLongs() + 0));
|
||||||
assertEquals(52, docsArray.get(docsHeader.dataOffsetLongs() + 1));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
@ -8,11 +8,13 @@ import nu.marginalia.index.journal.writer.IndexJournalWriterSingleFileImpl;
|
|||||||
import nu.marginalia.sequence.GammaCodedSequence;

import java.io.IOException;
+import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
+import java.util.Objects;

public class TestJournalFactory {
    Path tempDir = Files.createTempDirectory("journal");
@ -50,10 +52,10 @@ public class TestJournalFactory {
                    '}';
        }
    }
-   public record WordWithMeta(long wordId, long meta) {}
+   public record WordWithMeta(long wordId, long meta, GammaCodedSequence gcs) {}

-   public static WordWithMeta wm(long wordId, long meta) {
-       return new WordWithMeta(wordId, meta);
+   public static WordWithMeta wm(long wordId, long meta, int... positions) {
+       return new WordWithMeta(wordId, meta, GammaCodedSequence.generate(ByteBuffer.allocate(128), positions));
    }

    IndexJournalReader createReader(EntryData... entries) throws IOException {
@ -71,7 +73,7 @@ public class TestJournalFactory {
|
|||||||
                positions[i] = new GammaCodedSequence(new byte[1]);
            }

-           writer.put(new IndexJournalEntryHeader(entries.length, 0, entry.docId, entry.docMeta),
+           writer.put(new IndexJournalEntryHeader(entries.length, 0, 15, entry.docId, entry.docMeta),
                    new IndexJournalEntryData(termIds, meta, positions));
        }
        writer.close();
@ -91,10 +93,10 @@ public class TestJournalFactory {
            for (int i = 0; i < entry.wordIds.length; i++) {
                termIds[i] = entry.wordIds[i].wordId;
                meta[i] = entry.wordIds[i].meta;
-               positions[i] = new GammaCodedSequence(new byte[1]);
+               positions[i] = Objects.requireNonNullElseGet(entry.wordIds[i].gcs, () -> new GammaCodedSequence(new byte[1]));
            }

-           writer.put(new IndexJournalEntryHeader(entries.length, 0, entry.docId, entry.docMeta),
+           writer.put(new IndexJournalEntryHeader(entries.length, 0, 15, entry.docId, entry.docMeta),
                    new IndexJournalEntryData(termIds, meta, positions));
        }
        writer.close();
|
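As an aside, a sketch of how a test might exercise the new positions parameter of wm(); this is illustrative only and assumes GammaCodedSequence.generate() and valueCount() behave as they are used elsewhere in this change:

    // Illustrative only: term 50 with metadata 51 and word positions 3, 7 and 12.
    WordWithMeta w = wm(50, 51, 3, 7, 12);
    GammaCodedSequence positions = w.gcs();
    int n = positions.valueCount();   // expected to be 3 if generate() encodes every value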
@ -4,11 +4,10 @@ import com.google.inject.Inject;
|
|||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
import nu.marginalia.IndexLocations;
|
import nu.marginalia.IndexLocations;
|
||||||
import nu.marginalia.index.index.CombinedIndexReader;
|
import nu.marginalia.index.index.CombinedIndexReader;
|
||||||
|
import nu.marginalia.index.positions.PositionsFileReader;
|
||||||
import nu.marginalia.storage.FileStorageService;
|
import nu.marginalia.storage.FileStorageService;
|
||||||
import nu.marginalia.index.forward.ForwardIndexFileNames;
|
import nu.marginalia.index.forward.ForwardIndexFileNames;
|
||||||
import nu.marginalia.index.forward.ForwardIndexReader;
|
import nu.marginalia.index.forward.ForwardIndexReader;
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
@ -40,17 +39,18 @@ public class IndexFactory {
|
|||||||
    }

    public ReverseIndexReader getReverseIndexReader() throws IOException {

        return new ReverseIndexReader("full",
                ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.CURRENT),
-               ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.CURRENT)
+               ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.CURRENT),
+               new PositionsFileReader(ReverseIndexFullFileNames.resolve(liveStorage, ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.CURRENT))
        );
    }

    public ReverseIndexReader getReverseIndexPrioReader() throws IOException {
        return new ReverseIndexReader("prio",
                ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.CURRENT),
-               ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.CURRENT)
+               ReverseIndexPrioFileNames.resolve(liveStorage, ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.CURRENT),
+               null
        );
    }

|
@ -281,10 +281,7 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
|
|||||||
            awaitCompletion();

            // Return the best results
-           return new SearchResultSet(
-                   resultValuator.selectBestResults(parameters,
-                           resultRankingContext,
-                           resultHeap));
+           return new SearchResultSet(resultValuator.selectBestResults(parameters, resultHeap));
        }

        /** Wait for all tasks to complete */
|
@ -14,12 +14,13 @@ import nu.marginalia.index.query.IndexQueryBuilder;
|
|||||||
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
import nu.marginalia.index.query.filter.QueryFilterStepIf;
|
||||||
import nu.marginalia.index.query.limit.SpecificationLimitType;
|
import nu.marginalia.index.query.limit.SpecificationLimitType;
|
||||||
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
||||||
import nu.marginalia.index.results.model.ids.DocMetadataList;
|
import nu.marginalia.index.results.model.ids.TermMetadataList;
|
||||||
import nu.marginalia.model.id.UrlIdCodec;
|
import nu.marginalia.model.id.UrlIdCodec;
|
||||||
import nu.marginalia.model.idx.DocumentMetadata;
|
import nu.marginalia.model.idx.DocumentMetadata;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.lang.foreign.Arena;
|
||||||
import java.time.Duration;
|
import java.time.Duration;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
@ -169,8 +170,11 @@ public class CombinedIndexReader {
|
|||||||
    }

    /** Retrieves the term metadata for the specified word for the provided documents */
-   public DocMetadataList getMetadata(long wordId, CombinedDocIdList docIds) {
-       return new DocMetadataList(reverseIndexFullReader.getTermMeta(wordId, docIds.array()));
+   public TermMetadataList getTermMetadata(Arena arena,
+                                           long wordId,
+                                           CombinedDocIdList docIds)
+   {
+       return new TermMetadataList(reverseIndexFullReader.getTermData(arena, wordId, docIds.array()));
    }

    /** Retrieves the document metadata for the specified document */
@ -186,8 +190,12 @@ public class CombinedIndexReader {
|
|||||||
    /** Retrieves the HTML features for the specified document */
    public int getHtmlFeatures(long docId) {
        return forwardIndexReader.getHtmlFeatures(docId);
+   } /** Retrieves the HTML features for the specified document */
+   public int getDocumentSize(long docId) {
+       return forwardIndexReader.getDocumentSize(docId);
    }

    /** Close the indexes (this is not done immediately)
     * */
    public void close() throws InterruptedException {
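For orientation, a sketch of the calling pattern the new getTermMetadata signature implies, mirroring how it is used further down in this change; the names index, wordId and docIds are placeholders, and the arena-scoped lifetime of the returned data is an assumption based on the Arena parameter:

    // Illustrative only: fetch per-document term data inside a confined arena and read
    // it out before the arena is closed.
    try (var arena = Arena.ofConfined()) {
        TermMetadataList list = index.getTermMetadata(arena, wordId, docIds);
        for (int i = 0; i < docIds.size(); i++) {
            long flags = list.flag(i);                        // 0 when the term is absent
            GammaCodedSequence positions = list.position(i);  // null when the term is absent
        }
    }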
@ -10,12 +10,13 @@ import nu.marginalia.index.index.StatefulIndex;
|
|||||||
import nu.marginalia.index.model.SearchTermsUtil;
|
import nu.marginalia.index.model.SearchTermsUtil;
|
||||||
import nu.marginalia.index.results.model.QuerySearchTerms;
|
import nu.marginalia.index.results.model.QuerySearchTerms;
|
||||||
import nu.marginalia.index.results.model.TermCoherenceGroupList;
|
import nu.marginalia.index.results.model.TermCoherenceGroupList;
|
||||||
import nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds;
|
|
||||||
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
||||||
|
import nu.marginalia.index.results.model.ids.TermMetadataList;
|
||||||
import nu.marginalia.index.results.model.ids.TermIdList;
|
import nu.marginalia.index.results.model.ids.TermIdList;
|
||||||
|
|
||||||
|
import java.lang.foreign.Arena;
|
||||||
|
|
||||||
import static nu.marginalia.index.results.model.TermCoherenceGroupList.TermCoherenceGroup;
|
import static nu.marginalia.index.results.model.TermCoherenceGroupList.TermCoherenceGroup;
|
||||||
import static nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds.DocumentsWithMetadata;
|
|
||||||
|
|
||||||
public class IndexMetadataService {
|
public class IndexMetadataService {
|
||||||
private final StatefulIndex statefulIndex;
|
private final StatefulIndex statefulIndex;
|
||||||
@ -25,22 +26,19 @@ public class IndexMetadataService {
|
|||||||
        this.statefulIndex = index;
    }

-   public TermMetadataForCombinedDocumentIds getTermMetadataForDocuments(CombinedDocIdList combinedIdsAll,
-                                                                         TermIdList termIdsList)
+   public Long2ObjectArrayMap<TermMetadataList>
+   getTermMetadataForDocuments(Arena arena, CombinedDocIdList combinedIdsAll, TermIdList termIdsList)
    {
        var currentIndex = statefulIndex.get();

-       Long2ObjectArrayMap<DocumentsWithMetadata> termdocToMeta =
+       Long2ObjectArrayMap<TermMetadataList> termdocToMeta =
                new Long2ObjectArrayMap<>(termIdsList.size());

        for (long termId : termIdsList.array()) {
-           var metadata = currentIndex.getMetadata(termId, combinedIdsAll);
-
-           termdocToMeta.put(termId,
-                   new DocumentsWithMetadata(combinedIdsAll, metadata));
+           termdocToMeta.put(termId, currentIndex.getTermMetadata(arena, termId, combinedIdsAll));
        }

-       return new TermMetadataForCombinedDocumentIds(termdocToMeta);
+       return termdocToMeta;
    }

    public QuerySearchTerms getSearchTerms(CompiledQuery<String> compiledQuery, SearchQuery searchQuery) {
@ -1,25 +1,22 @@
|
|||||||
package nu.marginalia.index.results;
|
package nu.marginalia.index.results;
|
||||||
|
|
||||||
import nu.marginalia.api.searchquery.model.compiled.*;
|
import nu.marginalia.api.searchquery.model.compiled.*;
|
||||||
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
|
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||||
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
|
||||||
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
|
|
||||||
import nu.marginalia.index.index.CombinedIndexReader;
|
import nu.marginalia.index.index.CombinedIndexReader;
|
||||||
import nu.marginalia.index.index.StatefulIndex;
|
import nu.marginalia.index.index.StatefulIndex;
|
||||||
import nu.marginalia.index.model.SearchParameters;
|
import nu.marginalia.index.model.SearchParameters;
|
||||||
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
|
||||||
import nu.marginalia.index.model.QueryParams;
|
import nu.marginalia.index.model.QueryParams;
|
||||||
import nu.marginalia.index.results.model.QuerySearchTerms;
|
import nu.marginalia.index.results.model.QuerySearchTerms;
|
||||||
import nu.marginalia.index.results.model.TermMetadataForCombinedDocumentIds;
|
|
||||||
import nu.marginalia.model.id.UrlIdCodec;
|
import nu.marginalia.model.id.UrlIdCodec;
|
||||||
import nu.marginalia.model.idx.WordFlags;
|
import nu.marginalia.model.idx.WordFlags;
|
||||||
import nu.marginalia.model.idx.WordMetadata;
|
|
||||||
import nu.marginalia.index.query.limit.QueryStrategy;
|
import nu.marginalia.index.query.limit.QueryStrategy;
|
||||||
import nu.marginalia.ranking.results.ResultValuator;
|
import nu.marginalia.ranking.results.ResultValuator;
|
||||||
|
import nu.marginalia.sequence.GammaCodedSequence;
|
||||||
|
|
||||||
import javax.annotation.Nullable;
|
import javax.annotation.Nullable;
|
||||||
import java.util.List;
|
|
||||||
|
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.*;
|
||||||
|
|
||||||
/** This class is responsible for calculating the score of a search result.
|
/** This class is responsible for calculating the score of a search result.
|
||||||
* It holds the data required to perform the scoring, as there is strong
|
* It holds the data required to perform the scoring, as there is strong
|
||||||
@ -28,94 +25,74 @@ public class IndexResultValuationContext {
|
|||||||
private final CombinedIndexReader index;
|
private final CombinedIndexReader index;
|
||||||
private final QueryParams queryParams;
|
private final QueryParams queryParams;
|
||||||
|
|
||||||
private final TermMetadataForCombinedDocumentIds termMetadataForCombinedDocumentIds;
|
|
||||||
private final QuerySearchTerms searchTerms;
|
|
||||||
|
|
||||||
private final ResultRankingContext rankingContext;
|
private final ResultRankingContext rankingContext;
|
||||||
private final ResultValuator searchResultValuator;
|
private final ResultValuator searchResultValuator;
|
||||||
private final CompiledQuery<String> compiledQuery;
|
private final CompiledQuery<String> compiledQuery;
|
||||||
private final CompiledQueryLong compiledQueryIds;
|
|
||||||
|
|
||||||
public IndexResultValuationContext(IndexMetadataService metadataService,
|
public IndexResultValuationContext(ResultValuator searchResultValuator,
|
||||||
ResultValuator searchResultValuator,
|
|
||||||
CombinedDocIdList ids,
|
|
||||||
StatefulIndex statefulIndex,
|
StatefulIndex statefulIndex,
|
||||||
ResultRankingContext rankingContext,
|
ResultRankingContext rankingContext,
|
||||||
SearchParameters params
|
SearchParameters params)
|
||||||
) {
|
{
|
||||||
this.index = statefulIndex.get();
|
this.index = statefulIndex.get();
|
||||||
this.rankingContext = rankingContext;
|
this.rankingContext = rankingContext;
|
||||||
this.searchResultValuator = searchResultValuator;
|
this.searchResultValuator = searchResultValuator;
|
||||||
|
|
||||||
this.queryParams = params.queryParams;
|
this.queryParams = params.queryParams;
|
||||||
this.compiledQuery = params.compiledQuery;
|
this.compiledQuery = params.compiledQuery;
|
||||||
this.compiledQueryIds = params.compiledQueryIds;
|
|
||||||
|
|
||||||
this.searchTerms = metadataService.getSearchTerms(params.compiledQuery, params.query);
|
|
||||||
|
|
||||||
this.termMetadataForCombinedDocumentIds = metadataService.getTermMetadataForDocuments(ids,
|
|
||||||
searchTerms.termIdsAll);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private final long flagsFilterMask =
|
private final long flagsFilterMask = WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit();
|
||||||
WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit();
|
|
||||||
|
|
||||||
@Nullable
|
@Nullable
|
||||||
public SearchResultItem calculatePreliminaryScore(long combinedId) {
|
public SearchResultItem calculatePreliminaryScore(long combinedId,
|
||||||
|
QuerySearchTerms searchTerms,
|
||||||
|
long[] wordFlags,
|
||||||
|
GammaCodedSequence[] positions)
|
||||||
|
{
|
||||||
|
|
||||||
|
|
||||||
|
// FIXME: Reconsider coherence logic with the new position data
|
||||||
|
// if (!searchTerms.coherences.test(termMetadataForCombinedDocumentIds, combinedId))
|
||||||
|
// return null;
|
||||||
|
|
||||||
|
CompiledQuery<GammaCodedSequence> positionsQuery = compiledQuery.root.newQuery(positions);
|
||||||
|
CompiledQueryLong wordFlagsQuery = compiledQuery.root.newQuery(wordFlags);
|
||||||
|
int[] counts = new int[compiledQuery.size()];
|
||||||
|
for (int i = 0; i < counts.length; i++) {
|
||||||
|
if (positions[i] != null) {
|
||||||
|
counts[i] = positions[i].valueCount();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
CompiledQueryInt positionsCountQuery = compiledQuery.root.newQuery(counts);
|
||||||
|
|
||||||
|
// If the document is not relevant to the query, abort early to reduce allocations and
|
||||||
|
// avoid unnecessary calculations
|
||||||
|
if (testRelevance(wordFlagsQuery, positionsCountQuery)) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
long docId = UrlIdCodec.removeRank(combinedId);
|
long docId = UrlIdCodec.removeRank(combinedId);
|
||||||
|
|
||||||
if (!searchTerms.coherences.test(termMetadataForCombinedDocumentIds, combinedId))
|
|
||||||
return null;
|
|
||||||
|
|
||||||
long docMetadata = index.getDocumentMetadata(docId);
|
long docMetadata = index.getDocumentMetadata(docId);
|
||||||
int htmlFeatures = index.getHtmlFeatures(docId);
|
int htmlFeatures = index.getHtmlFeatures(docId);
|
||||||
|
int docSize = index.getDocumentSize(docId);
|
||||||
SearchResultItem searchResult = new SearchResultItem(docId,
|
|
||||||
docMetadata,
|
|
||||||
htmlFeatures,
|
|
||||||
hasPrioTerm(combinedId));
|
|
||||||
|
|
||||||
long[] wordMetas = new long[compiledQuery.size()];
|
|
||||||
SearchResultKeywordScore[] scores = new SearchResultKeywordScore[compiledQuery.size()];
|
|
||||||
|
|
||||||
for (int i = 0; i < wordMetas.length; i++) {
|
|
||||||
final long termId = compiledQueryIds.at(i);
|
|
||||||
final String term = compiledQuery.at(i);
|
|
||||||
|
|
||||||
wordMetas[i] = termMetadataForCombinedDocumentIds.getTermMetadata(termId, combinedId);
|
|
||||||
scores[i] = new SearchResultKeywordScore(term, termId, wordMetas[i]);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// DANGER: IndexResultValuatorService assumes that searchResult.keywordScores has this specific order, as it needs
|
|
||||||
// to be able to re-construct its own CompiledQuery<SearchResultKeywordScore> for re-ranking the results. This is
|
|
||||||
// a very flimsy assumption.
|
|
||||||
searchResult.keywordScores.addAll(List.of(scores));
|
|
||||||
|
|
||||||
CompiledQueryLong wordMetasQuery = new CompiledQueryLong(compiledQuery.root, new CqDataLong(wordMetas));
|
|
||||||
|
|
||||||
|
|
||||||
boolean allSynthetic = CompiledQueryAggregates.booleanAggregate(wordMetasQuery, WordFlags.Synthetic::isPresent);
|
|
||||||
int flagsCount = CompiledQueryAggregates.intMaxMinAggregate(wordMetasQuery, wordMeta -> Long.bitCount(wordMeta & flagsFilterMask));
|
|
||||||
int positionsCount = CompiledQueryAggregates.intMaxMinAggregate(wordMetasQuery, wordMeta -> Long.bitCount(WordMetadata.decodePositions(wordMeta)));
|
|
||||||
|
|
||||||
if (!meetsQueryStrategyRequirements(wordMetasQuery, queryParams.queryStrategy())) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (flagsCount == 0 && !allSynthetic && positionsCount == 0)
|
|
||||||
return null;
|
|
||||||
|
|
||||||
double score = searchResultValuator.calculateSearchResultValue(
|
double score = searchResultValuator.calculateSearchResultValue(
|
||||||
wordMetasQuery,
|
wordFlagsQuery,
|
||||||
|
positionsCountQuery,
|
||||||
|
positionsQuery,
|
||||||
docMetadata,
|
docMetadata,
|
||||||
htmlFeatures,
|
htmlFeatures,
|
||||||
5000, // use a dummy value here as it's not present in the index
|
docSize,
|
||||||
rankingContext,
|
rankingContext,
|
||||||
null);
|
null);
|
||||||
|
|
||||||
if (searchResult.hasPrioTerm) {
|
SearchResultItem searchResult = new SearchResultItem(docId,
|
||||||
|
docMetadata,
|
||||||
|
htmlFeatures);
|
||||||
|
|
||||||
|
if (hasPrioTerm(searchTerms, positions)) {
|
||||||
score = 0.75 * score;
|
score = 0.75 * score;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -124,12 +101,31 @@ public class IndexResultValuationContext {
|
|||||||
return searchResult;
|
return searchResult;
|
||||||
}
|
}
|
||||||
|
|
||||||
private boolean hasPrioTerm(long combinedId) {
|
private boolean testRelevance(CompiledQueryLong wordFlagsQuery, CompiledQueryInt countsQuery) {
|
||||||
for (var term : searchTerms.termIdsPrio.array()) {
|
boolean allSynthetic = booleanAggregate(wordFlagsQuery, WordFlags.Synthetic::isPresent);
|
||||||
if (termMetadataForCombinedDocumentIds.hasTermMeta(term, combinedId)) {
|
int flagsCount = intMaxMinAggregate(wordFlagsQuery, flags -> Long.bitCount(flags & flagsFilterMask));
|
||||||
|
int positionsCount = intMaxMinAggregate(countsQuery, p -> p);
|
||||||
|
|
||||||
|
if (!meetsQueryStrategyRequirements(wordFlagsQuery, queryParams.queryStrategy())) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
if (flagsCount == 0 && !allSynthetic && positionsCount == 0) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
private boolean hasPrioTerm(QuerySearchTerms searchTerms, GammaCodedSequence[] positions) {
|
||||||
|
var allTerms = searchTerms.termIdsAll;
|
||||||
|
var prioTerms = searchTerms.termIdsPrio;
|
||||||
|
|
||||||
|
for (int i = 0; i < allTerms.size(); i++) {
|
||||||
|
if (positions[i] != null && prioTerms.contains(allTerms.at(i))) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -142,7 +138,7 @@ public class IndexResultValuationContext {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
return CompiledQueryAggregates.booleanAggregate(queryGraphScores,
|
return booleanAggregate(queryGraphScores,
|
||||||
docs -> meetsQueryStrategyRequirements(docs, queryParams.queryStrategy()));
|
docs -> meetsQueryStrategyRequirements(docs, queryParams.queryStrategy()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -7,8 +7,6 @@ import gnu.trove.list.array.TLongArrayList;
|
|||||||
import it.unimi.dsi.fastutil.longs.LongSet;
|
import it.unimi.dsi.fastutil.longs.LongSet;
|
||||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||||
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
|
|
||||||
import nu.marginalia.api.searchquery.model.compiled.CqDataLong;
|
|
||||||
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
|
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
|
||||||
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
import nu.marginalia.api.searchquery.model.results.DecoratedSearchResultItem;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||||
@ -21,12 +19,13 @@ import nu.marginalia.linkdb.docs.DocumentDbReader;
|
|||||||
import nu.marginalia.linkdb.model.DocdbUrlDetail;
|
import nu.marginalia.linkdb.model.DocdbUrlDetail;
|
||||||
import nu.marginalia.model.idx.WordMetadata;
|
import nu.marginalia.model.idx.WordMetadata;
|
||||||
import nu.marginalia.ranking.results.ResultValuator;
|
import nu.marginalia.ranking.results.ResultValuator;
|
||||||
|
import nu.marginalia.sequence.GammaCodedSequence;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import java.lang.foreign.Arena;
|
||||||
import java.sql.SQLException;
|
import java.sql.SQLException;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.function.Consumer;
|
|
||||||
|
|
||||||
@Singleton
|
@Singleton
|
||||||
public class IndexResultValuatorService {
|
public class IndexResultValuatorService {
|
||||||
@ -53,12 +52,42 @@ public class IndexResultValuatorService {
|
|||||||
ResultRankingContext rankingContext,
|
ResultRankingContext rankingContext,
|
||||||
CombinedDocIdList resultIds)
|
CombinedDocIdList resultIds)
|
||||||
{
|
{
|
||||||
final var evaluator = createValuationContext(params, rankingContext, resultIds);
|
IndexResultValuationContext evaluator =
|
||||||
|
new IndexResultValuationContext(resultValuator, statefulIndex, rankingContext, params);
|
||||||
|
|
||||||
List<SearchResultItem> results = new ArrayList<>(resultIds.size());
|
List<SearchResultItem> results = new ArrayList<>(resultIds.size());
|
||||||
|
|
||||||
for (long id : resultIds.array()) {
|
try (var arena = Arena.ofConfined()) {
|
||||||
var score = evaluator.calculatePreliminaryScore(id);
|
// Batch-fetch the word metadata for the documents
|
||||||
|
|
||||||
|
var searchTerms = metadataService.getSearchTerms(params.compiledQuery, params.query);
|
||||||
|
var termsForDocs = metadataService.getTermMetadataForDocuments(arena, resultIds, searchTerms.termIdsAll);
|
||||||
|
|
||||||
|
// Prepare data for the document. We do this outside of the calculation function to avoid
|
||||||
|
// hash lookups in the inner loop, as it's very hot code and we don't want thrashing in there;
|
||||||
|
// out here we can rely on implicit array ordering to match up the data.
|
||||||
|
|
||||||
|
var ra = resultIds.array();
|
||||||
|
long[] flags = new long[searchTerms.termIdsAll.size()];
|
||||||
|
GammaCodedSequence[] positions = new GammaCodedSequence[searchTerms.termIdsAll.size()];
|
||||||
|
|
||||||
|
for (int i = 0; i < ra.length; i++) {
|
||||||
|
long id = ra[i];
|
||||||
|
|
||||||
|
// Prepare term-level data for the document
|
||||||
|
for (int ti = 0; ti < flags.length; ti++) {
|
||||||
|
long tid = searchTerms.termIdsAll.at(ti);
|
||||||
|
var tfd = termsForDocs.get(tid);
|
||||||
|
|
||||||
|
assert tfd != null : "No term data for term " + ti;
|
||||||
|
|
||||||
|
flags[ti] = tfd.flag(i);
|
||||||
|
positions[ti] = tfd.position(i);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Calculate the preliminary score
|
||||||
|
|
||||||
|
var score = evaluator.calculatePreliminaryScore(id, searchTerms, flags, positions);
|
||||||
if (score != null) {
|
if (score != null) {
|
||||||
results.add(score);
|
results.add(score);
|
||||||
}
|
}
|
||||||
@ -66,22 +95,10 @@ public class IndexResultValuatorService {
|
|||||||
|
|
||||||
return results;
|
return results;
|
||||||
}
|
}
|
||||||
|
|
||||||
private IndexResultValuationContext createValuationContext(SearchParameters params,
|
|
||||||
ResultRankingContext rankingContext,
|
|
||||||
CombinedDocIdList resultIds)
|
|
||||||
{
|
|
||||||
return new IndexResultValuationContext(metadataService,
|
|
||||||
resultValuator,
|
|
||||||
resultIds,
|
|
||||||
statefulIndex,
|
|
||||||
rankingContext,
|
|
||||||
params);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public List<DecoratedSearchResultItem> selectBestResults(SearchParameters params,
|
public List<DecoratedSearchResultItem> selectBestResults(SearchParameters params,
|
||||||
ResultRankingContext rankingContext,
|
|
||||||
Collection<SearchResultItem> results) throws SQLException {
|
Collection<SearchResultItem> results) throws SQLException {
|
||||||
|
|
||||||
var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain);
|
var domainCountFilter = new IndexResultDomainDeduplicator(params.limitByDomain);
|
||||||
@ -101,14 +118,13 @@ public class IndexResultValuatorService {
|
|||||||
item.resultsFromDomain = domainCountFilter.getCount(item);
|
item.resultsFromDomain = domainCountFilter.getCount(item);
|
||||||
}
|
}
|
||||||
|
|
||||||
return decorateAndRerank(resultsList, params.compiledQuery, rankingContext);
|
return decorateResults(resultsList, params.compiledQuery);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Decorate the result items with additional information from the link database
|
/** Decorate the result items with additional information from the link database
|
||||||
* and calculate an updated ranking with the additional information */
|
* and calculate an updated ranking with the additional information */
|
||||||
public List<DecoratedSearchResultItem> decorateAndRerank(List<SearchResultItem> rawResults,
|
public List<DecoratedSearchResultItem> decorateResults(List<SearchResultItem> rawResults,
|
||||||
CompiledQuery<String> compiledQuery,
|
CompiledQuery<String> compiledQuery)
|
||||||
ResultRankingContext rankingContext)
|
|
||||||
throws SQLException
|
throws SQLException
|
||||||
{
|
{
|
||||||
TLongList idsList = new TLongArrayList(rawResults.size());
|
TLongList idsList = new TLongArrayList(rawResults.size());
|
||||||
@ -131,42 +147,18 @@ public class IndexResultValuatorService {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Reconstruct the compiledquery for re-valuation
|
|
||||||
//
|
|
||||||
// CAVEAT: This hinges on a very fragile that IndexResultValuationContext puts them in the same
|
|
||||||
// order as the data for the CompiledQuery<String>.
|
|
||||||
long[] wordMetas = new long[compiledQuery.size()];
|
|
||||||
|
|
||||||
for (int i = 0; i < compiledQuery.size(); i++) {
|
|
||||||
var score = result.keywordScores.get(i);
|
|
||||||
wordMetas[i] = score.encodedWordMetadata();
|
|
||||||
}
|
|
||||||
|
|
||||||
CompiledQueryLong metaQuery = new CompiledQueryLong(compiledQuery.root, new CqDataLong(wordMetas));
|
|
||||||
|
|
||||||
resultItems.add(createCombinedItem(
|
resultItems.add(createCombinedItem(
|
||||||
result,
|
result,
|
||||||
docData,
|
docData));
|
||||||
metaQuery,
|
|
||||||
rankingContext));
|
|
||||||
}
|
}
|
||||||
return resultItems;
|
return resultItems;
|
||||||
}
|
}
|
||||||
|
|
||||||
private DecoratedSearchResultItem createCombinedItem(SearchResultItem result,
|
private DecoratedSearchResultItem createCombinedItem(SearchResultItem result,
|
||||||
DocdbUrlDetail docData,
|
DocdbUrlDetail docData) {
|
||||||
CompiledQueryLong wordMetas,
|
|
||||||
ResultRankingContext rankingContext) {
|
|
||||||
|
|
||||||
ResultRankingDetailsExtractor detailsExtractor = new ResultRankingDetailsExtractor();
|
ResultRankingDetailsExtractor detailsExtractor = new ResultRankingDetailsExtractor();
|
||||||
Consumer<ResultRankingDetails> detailConsumer = rankingContext.params.exportDebugData ? detailsExtractor::set : null;
|
// Consumer<ResultRankingDetails> detailConsumer = rankingContext.params.exportDebugData ? detailsExtractor::set : null;
|
||||||
|
|
||||||
double score = resultValuator.calculateSearchResultValue(wordMetas,
|
|
||||||
result.encodedDocMetadata,
|
|
||||||
result.htmlFeatures,
|
|
||||||
docData.wordsTotal(),
|
|
||||||
rankingContext,
|
|
||||||
detailConsumer);
|
|
||||||
|
|
||||||
return new DecoratedSearchResultItem(
|
return new DecoratedSearchResultItem(
|
||||||
result,
|
result,
|
||||||
@ -179,8 +171,8 @@ public class IndexResultValuatorService {
|
|||||||
docData.pubYear(),
|
docData.pubYear(),
|
||||||
docData.dataHash(),
|
docData.dataHash(),
|
||||||
docData.wordsTotal(),
|
docData.wordsTotal(),
|
||||||
bestPositions(wordMetas),
|
0L, //bestPositions(wordMetas),
|
||||||
score,
|
result.getScore(),
|
||||||
detailsExtractor.get()
|
detailsExtractor.get()
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
@ -1,26 +1,38 @@
|
|||||||
package nu.marginalia.index.results.model;
|
package nu.marginalia.index.results.model;
|
||||||
|
|
||||||
import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
|
|
||||||
import it.unimi.dsi.fastutil.longs.Long2ObjectArrayMap;
|
import it.unimi.dsi.fastutil.longs.Long2ObjectArrayMap;
|
||||||
|
import it.unimi.dsi.fastutil.longs.Long2ObjectOpenHashMap;
|
||||||
|
import nu.marginalia.index.positions.TermData;
|
||||||
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
|
||||||
import nu.marginalia.index.results.model.ids.DocMetadataList;
|
import nu.marginalia.index.results.model.ids.TermMetadataList;
|
||||||
import org.slf4j.Logger;
|
import nu.marginalia.sequence.GammaCodedSequence;
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
|
import javax.annotation.Nullable;
|
||||||
|
|
||||||
public class TermMetadataForCombinedDocumentIds {
|
public class TermMetadataForCombinedDocumentIds {
|
||||||
private static final Logger logger = LoggerFactory.getLogger(TermMetadataForCombinedDocumentIds.class);
|
|
||||||
private final Long2ObjectArrayMap<DocumentsWithMetadata> termdocToMeta;
|
private final Long2ObjectArrayMap<DocumentsWithMetadata> termdocToMeta;
|
||||||
|
|
||||||
public TermMetadataForCombinedDocumentIds(Long2ObjectArrayMap<DocumentsWithMetadata> termdocToMeta) {
|
public TermMetadataForCombinedDocumentIds(Long2ObjectArrayMap<DocumentsWithMetadata> termdocToMeta) {
|
||||||
this.termdocToMeta = termdocToMeta;
|
this.termdocToMeta = termdocToMeta;
|
||||||
}
|
}
|
||||||
|
|
||||||
public long getTermMetadata(long termId, long combinedId) {
|
public byte getTermMetadata(long termId, long combinedId) {
|
||||||
var metaByCombinedId = termdocToMeta.get(termId);
|
var metaByCombinedId = termdocToMeta.get(termId);
|
||||||
if (metaByCombinedId == null) {
|
if (metaByCombinedId == null) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
return metaByCombinedId.get(combinedId);
|
return metaByCombinedId.get(combinedId).flags();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Nullable
|
||||||
|
public GammaCodedSequence getPositions(long termId, long combinedId) {
|
||||||
|
var metaByCombinedId = termdocToMeta.get(termId);
|
||||||
|
|
||||||
|
if (metaByCombinedId == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
return metaByCombinedId.get(combinedId).positions();
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean hasTermMeta(long termId, long combinedId) {
|
public boolean hasTermMeta(long termId, long combinedId) {
|
||||||
@ -30,16 +42,25 @@ public class TermMetadataForCombinedDocumentIds {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
return metaByCombinedId.get(combinedId) != 0;
|
return metaByCombinedId.data().containsKey(combinedId);
|
||||||
}
|
}
|
||||||
|
|
||||||
public record DocumentsWithMetadata(Long2LongOpenHashMap data) {
|
public record DocumentsWithMetadata(Long2ObjectOpenHashMap<TermData> data) {
|
||||||
public DocumentsWithMetadata(CombinedDocIdList combinedDocIdsAll, DocMetadataList metadata) {
|
public DocumentsWithMetadata(CombinedDocIdList combinedDocIdsAll, TermMetadataList metadata) {
|
||||||
this(new Long2LongOpenHashMap(combinedDocIdsAll.array(), metadata.array()));
|
this(new Long2ObjectOpenHashMap<>(combinedDocIdsAll.size()));
|
||||||
|
|
||||||
|
long[] ids = combinedDocIdsAll.array();
|
||||||
|
TermData[] data = metadata.array();
|
||||||
|
|
||||||
|
for (int i = 0; i < combinedDocIdsAll.size(); i++) {
|
||||||
|
if (data[i] != null) {
|
||||||
|
this.data.put(ids[i], data[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public long get(long combinedId) {
|
public TermData get(long combinedId) {
|
||||||
return data.getOrDefault(combinedId, 0);
|
return data.get(combinedId);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -15,6 +15,10 @@ import java.util.stream.LongStream;
|
|||||||
public final class CombinedDocIdList {
|
public final class CombinedDocIdList {
|
||||||
private final long[] data;
|
private final long[] data;
|
||||||
|
|
||||||
|
public CombinedDocIdList(long... data) {
|
||||||
|
this.data = Arrays.copyOf(data, data.length);
|
||||||
|
}
|
||||||
|
|
||||||
public CombinedDocIdList(LongArrayList data) {
|
public CombinedDocIdList(LongArrayList data) {
|
||||||
this.data = data.toLongArray();
|
this.data = data.toLongArray();
|
||||||
}
|
}
|
||||||
|
@ -1,45 +0,0 @@
|
|||||||
package nu.marginalia.index.results.model.ids;
|
|
||||||
|
|
||||||
import it.unimi.dsi.fastutil.longs.LongArrayList;
|
|
||||||
|
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.Objects;
|
|
||||||
import java.util.stream.LongStream;
|
|
||||||
|
|
||||||
public final class DocMetadataList {
|
|
||||||
private final long[] array;
|
|
||||||
|
|
||||||
public DocMetadataList(long[] array) {
|
|
||||||
this.array = array;
|
|
||||||
}
|
|
||||||
|
|
||||||
public DocMetadataList(LongArrayList list) {
|
|
||||||
this(list.toLongArray());
|
|
||||||
}
|
|
||||||
|
|
||||||
public int size() {
|
|
||||||
return array.length;
|
|
||||||
}
|
|
||||||
|
|
||||||
public LongStream stream() {
|
|
||||||
return LongStream.of(array);
|
|
||||||
}
|
|
||||||
|
|
||||||
public long[] array() {
|
|
||||||
return array;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean equals(Object obj) {
|
|
||||||
if (obj == this) return true;
|
|
||||||
if (obj == null || obj.getClass() != this.getClass()) return false;
|
|
||||||
var that = (DocMetadataList) obj;
|
|
||||||
return Arrays.equals(this.array, that.array);
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int hashCode() {
|
|
||||||
return Arrays.hashCode(array);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
@ -11,6 +11,7 @@ public final class TermIdList {
|
|||||||
|
|
||||||
public TermIdList(long[] array) {
|
public TermIdList(long[] array) {
|
||||||
this.array = array;
|
this.array = array;
|
||||||
|
Arrays.sort(this.array);
|
||||||
}
|
}
|
||||||
|
|
||||||
public TermIdList(LongArrayList list) {
|
public TermIdList(LongArrayList list) {
|
||||||
@ -29,6 +30,15 @@ public final class TermIdList {
|
|||||||
return array;
|
return array;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public long at(int i) {
|
||||||
|
return array[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
public boolean contains(long id) {
|
||||||
|
// Implicitly sorted
|
||||||
|
return Arrays.binarySearch(array, id) >= 0;
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public boolean equals(Object obj) {
|
public boolean equals(Object obj) {
|
||||||
if (obj == this) return true;
|
if (obj == this) return true;
|
||||||
|
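Illustrative only: because the TermIdList constructor above now sorts the backing array, positional access reflects sorted order and contains() can rely on binary search, e.g.:

    TermIdList ids = new TermIdList(new long[] { 30L, 10L, 20L });
    long first = ids.at(0);      // 10 after sorting
    boolean a = ids.contains(20L);  // true, via Arrays.binarySearch
    boolean b = ids.contains(40L);  // false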
@ -0,0 +1,55 @@
|
|||||||
|
package nu.marginalia.index.results.model.ids;
|
||||||
|
|
||||||
|
import nu.marginalia.index.positions.TermData;
|
||||||
|
import nu.marginalia.sequence.GammaCodedSequence;
|
||||||
|
|
||||||
|
import javax.annotation.Nullable;
|
||||||
|
import java.util.Arrays;
|
||||||
|
|
||||||
|
public final class TermMetadataList {
|
||||||
|
private final TermData[] array;
|
||||||
|
|
||||||
|
public TermMetadataList(TermData[] array) {
|
||||||
|
this.array = array;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int size() {
|
||||||
|
return array.length;
|
||||||
|
}
|
||||||
|
|
||||||
|
public long flag(int i) {
|
||||||
|
if (array[i] == null)
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
return array[i].flags();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns the position data for the given document index,
|
||||||
|
* may be null if the term is not in the document
|
||||||
|
*/
|
||||||
|
@Nullable
|
||||||
|
public GammaCodedSequence position(int i) {
|
||||||
|
if (array[i] == null)
|
||||||
|
return null;
|
||||||
|
|
||||||
|
return array[i].positions();
|
||||||
|
}
|
||||||
|
|
||||||
|
public TermData[] array() {
|
||||||
|
return array;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object obj) {
|
||||||
|
if (obj == this) return true;
|
||||||
|
if (obj == null || obj.getClass() != this.getClass()) return false;
|
||||||
|
var that = (TermMetadataList) obj;
|
||||||
|
return Arrays.equals(this.array, that.array);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
return Arrays.hashCode(array);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -1,5 +1,7 @@
|
|||||||
package nu.marginalia.ranking.results;
|
package nu.marginalia.ranking.results;
|
||||||
|
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||||
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt;
|
||||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
|
||||||
@ -14,6 +16,7 @@ import nu.marginalia.ranking.results.factors.*;
|
|||||||
|
|
||||||
import com.google.inject.Inject;
|
import com.google.inject.Inject;
|
||||||
import com.google.inject.Singleton;
|
import com.google.inject.Singleton;
|
||||||
|
import nu.marginalia.sequence.GammaCodedSequence;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
@ -33,15 +36,15 @@ public class ResultValuator {
|
|||||||
this.termCoherenceFactor = termCoherenceFactor;
|
this.termCoherenceFactor = termCoherenceFactor;
|
||||||
}
|
}
|
||||||
|
|
||||||
public double calculateSearchResultValue(CompiledQueryLong wordMeta,
|
public double calculateSearchResultValue(CompiledQueryLong wordFlagsQuery,
|
||||||
long documentMetadata,
|
CompiledQueryInt positionsCountQuery, CompiledQuery<GammaCodedSequence> positionsQuery, long documentMetadata,
|
||||||
int features,
|
int features,
|
||||||
int length,
|
int length,
|
||||||
ResultRankingContext ctx,
|
ResultRankingContext ctx,
|
||||||
@Nullable Consumer<ResultRankingDetails> detailsConsumer
|
@Nullable Consumer<ResultRankingDetails> detailsConsumer
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
if (wordMeta.isEmpty())
|
if (wordFlagsQuery.isEmpty())
|
||||||
return Double.MAX_VALUE;
|
return Double.MAX_VALUE;
|
||||||
|
|
||||||
if (length < 0) {
|
if (length < 0) {
|
||||||
@ -82,12 +85,11 @@ public class ResultValuator {
|
|||||||
+ temporalBias
|
+ temporalBias
|
||||||
+ flagsPenalty;
|
+ flagsPenalty;
|
||||||
|
|
||||||
double tcfOverlap = rankingParams.tcfOverlapWeight * termCoherenceFactor.calculateOverlap(wordMeta);
|
// FIXME: need a weighting factor here
|
||||||
double tcfJaccard = rankingParams.tcfJaccardWeight * termCoherenceFactor.calculateAvgMutualJaccard(wordMeta, ctx);
|
double tcfAvgDist = 25. / termCoherenceFactor.calculateAvgMinDistance(positionsQuery, ctx);
|
||||||
|
|
||||||
double bM25F = rankingParams.bm25FullWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forRegular(rankingParams.fullParams, wordMeta.data, length, ctx));
|
double bM25F = rankingParams.bm25FullWeight * wordFlagsQuery.root.visit(new Bm25FullGraphVisitor(rankingParams.fullParams, positionsCountQuery.data, length, ctx));
|
||||||
double bM25N = rankingParams.bm25NgramWeight * wordMeta.root.visit(Bm25FullGraphVisitor.forNgrams(rankingParams.fullParams, wordMeta.data, length, ctx));
|
double bM25P = rankingParams.bm25PrioWeight * wordFlagsQuery.root.visit(new Bm25PrioGraphVisitor(rankingParams.prioParams, wordFlagsQuery.data, ctx));
|
||||||
double bM25P = rankingParams.bm25PrioWeight * wordMeta.root.visit(new Bm25PrioGraphVisitor(rankingParams.prioParams, wordMeta.data, ctx));
|
|
||||||
|
|
||||||
double overallPartPositive = Math.max(0, overallPart);
|
double overallPartPositive = Math.max(0, overallPart);
|
||||||
double overallPartNegative = -Math.min(0, overallPart);
|
double overallPartNegative = -Math.min(0, overallPart);
|
||||||
@ -112,10 +114,10 @@ public class ResultValuator {
|
|||||||
temporalBias,
|
temporalBias,
|
||||||
flagsPenalty,
|
flagsPenalty,
|
||||||
overallPart,
|
overallPart,
|
||||||
tcfOverlap,
|
0,
|
||||||
tcfJaccard,
|
0,
|
||||||
bM25F,
|
bM25F,
|
||||||
bM25N,
|
0, // FIXME: Remove from model
|
||||||
bM25P)
|
bM25P)
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -125,8 +127,8 @@ public class ResultValuator {
|
|||||||
// Renormalize to 0...15, where 0 is the best possible score;
|
// Renormalize to 0...15, where 0 is the best possible score;
|
||||||
// this is a historical artifact of the original ranking function
|
// this is a historical artifact of the original ranking function
|
||||||
double ret = normalize(
|
double ret = normalize(
|
||||||
tcfOverlap + tcfJaccard
|
tcfAvgDist
|
||||||
+ bM25F + bM25P + bM25N
|
+ bM25F + bM25P
|
||||||
+ overallPartPositive,
|
+ overallPartPositive,
|
||||||
overallPartNegative);
|
overallPartNegative);
|
||||||
|
|
||||||
|
@ -13,7 +13,7 @@ import java.util.List;
|
|||||||
public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor {
|
public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor {
|
||||||
private static final long AVG_LENGTH = 5000;
|
private static final long AVG_LENGTH = 5000;
|
||||||
|
|
||||||
private final CqDataLong wordMetaData;
|
private final CqDataInt counts;
|
||||||
private final CqDataInt frequencies;
|
private final CqDataInt frequencies;
|
||||||
private final Bm25Parameters bm25Parameters;
|
private final Bm25Parameters bm25Parameters;
|
||||||
|
|
||||||
@ -22,31 +22,16 @@ public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor {
|
|||||||
|
|
||||||
private final BitSet mask;
|
private final BitSet mask;
|
||||||
|
|
||||||
private Bm25FullGraphVisitor(Bm25Parameters bm25Parameters,
|
public Bm25FullGraphVisitor(Bm25Parameters bm25Parameters,
|
||||||
CqDataLong wordMetaData,
|
CqDataInt counts,
|
||||||
int length,
|
int length,
|
||||||
BitSet mask,
|
|
||||||
ResultRankingContext ctx) {
|
ResultRankingContext ctx) {
|
||||||
this.length = length;
|
this.length = length;
|
||||||
this.bm25Parameters = bm25Parameters;
|
this.bm25Parameters = bm25Parameters;
|
||||||
this.docCount = ctx.termFreqDocCount();
|
this.docCount = ctx.termFreqDocCount();
|
||||||
this.wordMetaData = wordMetaData;
|
this.counts = counts;
|
||||||
this.frequencies = ctx.fullCounts;
|
this.frequencies = ctx.fullCounts;
|
||||||
this.mask = mask;
|
this.mask = ctx.regularMask;
|
||||||
}
|
|
||||||
|
|
||||||
public static Bm25FullGraphVisitor forRegular(Bm25Parameters bm25Parameters,
|
|
||||||
CqDataLong wordMetaData,
|
|
||||||
int length,
|
|
||||||
ResultRankingContext ctx) {
|
|
||||||
return new Bm25FullGraphVisitor(bm25Parameters, wordMetaData, length, ctx.regularMask, ctx);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static Bm25FullGraphVisitor forNgrams(Bm25Parameters bm25Parameters,
|
|
||||||
CqDataLong wordMetaData,
|
|
||||||
int length,
|
|
||||||
ResultRankingContext ctx) {
|
|
||||||
return new Bm25FullGraphVisitor(bm25Parameters, wordMetaData, length, ctx.ngramsMask, ctx);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
@ -73,7 +58,7 @@ public class Bm25FullGraphVisitor implements CqExpression.DoubleVisitor {
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
double count = Long.bitCount(WordMetadata.decodePositions(wordMetaData.get(idx)));
|
double count = counts.get(idx);
|
||||||
|
|
||||||
int freq = frequencies.get(idx);
|
int freq = frequencies.get(idx);
|
||||||
|
|
||||||
|
@ -1,66 +1,44 @@
|
|||||||
package nu.marginalia.ranking.results.factors;
|
package nu.marginalia.ranking.results.factors;
|
||||||
|
|
||||||
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
|
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
|
||||||
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
|
|
||||||
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
|
||||||
import nu.marginalia.model.idx.WordMetadata;
|
import nu.marginalia.sequence.GammaCodedSequence;
|
||||||
|
import nu.marginalia.sequence.SequenceOperations;
|
||||||
|
|
||||||
/** Rewards documents where terms appear frequently within the same sentences
|
/** Rewards documents where terms appear frequently within the same sentences
|
||||||
*/
|
*/
|
||||||
public class TermCoherenceFactor {
|
public class TermCoherenceFactor {
|
||||||
|
|
||||||
/** Calculate a factor that rewards the best total position overlap
|
public double calculateAvgMinDistance(CompiledQuery<GammaCodedSequence> positions, ResultRankingContext ctx) {
|
||||||
* between the terms in the query. This is high when all the terms
|
|
||||||
* found in the same sentences.
|
|
||||||
*/
|
|
||||||
public double calculateOverlap(CompiledQueryLong wordMetadataQuery) {
|
|
||||||
if (wordMetadataQuery.size() < 2)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
long mask = CompiledQueryAggregates.longBitmaskAggregate(wordMetadataQuery,
|
|
||||||
score -> score >>> WordMetadata.POSITIONS_SHIFT);
|
|
||||||
|
|
||||||
return bitsSetFactor(mask);
|
|
||||||
}
|
|
||||||
|
|
||||||
/** Calculate a factor that rewards the best average mutual Jaccard index
|
|
||||||
* between the terms in the query. This is high when the several terms are frequently
|
|
||||||
* found in the same sentences.
|
|
||||||
*/
|
|
||||||
public double calculateAvgMutualJaccard(CompiledQueryLong wordMetadataQuery, ResultRankingContext ctx) {
|
|
||||||
double sum = 0;
|
double sum = 0;
|
||||||
int cnt = 0;
|
int cnt = 0;
|
||||||
|
|
||||||
for (int i = 0; i < wordMetadataQuery.size(); i++) {
|
for (int i = 0; i < positions.size(); i++) {
|
||||||
|
|
||||||
// Skip terms that are not in the regular mask
|
// Skip terms that are not in the regular mask
|
||||||
if (!ctx.regularMask.get(i))
|
if (!ctx.regularMask.get(i))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
long imask = WordMetadata.decodePositions(wordMetadataQuery.at(i));
|
var posi = positions.at(i);
|
||||||
|
|
||||||
// Skip terms that are not in the document
|
// Skip terms that are not in the document
|
||||||
if (imask == 0L)
|
if (posi == null)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
for (int j = i + 1; j < wordMetadataQuery.size(); j++) {
|
for (int j = i + 1; j < positions.size(); j++) {
|
||||||
|
|
||||||
// Skip terms that are not in the regular mask
|
// Skip terms that are not in the regular mask
|
||||||
if (!ctx.regularMask.get(j))
|
if (!ctx.regularMask.get(j))
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
long jmask = WordMetadata.decodePositions(wordMetadataQuery.at(j));
|
var posj = positions.at(j);
|
||||||
|
|
||||||
// Skip terms that are not in the document
|
// Skip terms that are not in the document
|
||||||
if (jmask == 0L)
|
if (posj == null)
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
long quot = Long.bitCount(imask & jmask);
|
int distance = SequenceOperations.minDistance(posi.iterator(), posj.iterator());
|
||||||
long rem = Long.bitCount(imask | jmask);
|
sum += distance;
|
||||||
|
|
||||||
// rem is always > 0 because imask and jmask are not both 0
|
|
||||||
|
|
||||||
sum += quot/(double) rem;
|
|
||||||
cnt++;
|
cnt++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -68,15 +46,8 @@ public class TermCoherenceFactor {
|
|||||||
if (cnt > 0) {
|
if (cnt > 0) {
|
||||||
return sum / cnt;
|
return sum / cnt;
|
||||||
} else {
|
} else {
|
||||||
return 0;
|
return 1000.;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
double bitsSetFactor(long mask) {
|
|
||||||
final int bitsSetInMask = Long.bitCount(mask);
|
|
||||||
|
|
||||||
return Math.pow(bitsSetInMask/(double) WordMetadata.POSITIONS_COUNT, 0.25);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
code/index/test/nu/marginalia/index/CombinedIndexReaderTest.java (new file, 382 lines)
@ -0,0 +1,382 @@
package nu.marginalia.index;

import com.google.inject.Guice;
import com.google.inject.Inject;
import it.unimi.dsi.fastutil.ints.IntList;
import it.unimi.dsi.fastutil.longs.LongArrayList;
import it.unimi.dsi.fastutil.longs.LongList;
import nu.marginalia.IndexLocations;
import nu.marginalia.array.page.LongQueryBuffer;
import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.ReverseIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.model.IndexJournalEntryData;
import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
import nu.marginalia.index.journal.reader.IndexJournalReader;
import nu.marginalia.index.journal.writer.IndexJournalWriter;
import nu.marginalia.index.positions.TermData;
import nu.marginalia.index.results.model.ids.CombinedDocIdList;
import nu.marginalia.linkdb.docs.DocumentDbReader;
import nu.marginalia.linkdb.docs.DocumentDbWriter;
import nu.marginalia.linkdb.model.DocdbUrlDetail;
import nu.marginalia.model.EdgeUrl;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.process.control.FakeProcessHeartbeat;
import nu.marginalia.process.control.ProcessHeartbeat;
import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.service.control.ServiceHeartbeat;
import nu.marginalia.service.server.Initialization;
import nu.marginalia.storage.FileStorageService;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.parallel.Execution;

import java.io.IOException;
import java.lang.foreign.Arena;
import java.net.URISyntaxException;
import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.sql.SQLException;
import java.util.*;

import static nu.marginalia.linkdb.LinkdbFileNames.DOCDB_FILE_NAME;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.parallel.ExecutionMode.SAME_THREAD;

@Execution(SAME_THREAD)
public class CombinedIndexReaderTest {

    @Inject
    Initialization initialization;

    IndexQueryServiceIntegrationTestModule testModule;

    @Inject
    StatefulIndex statefulIndex;

    @Inject
    IndexJournalWriter indexJournalWriter;

    @Inject
    FileStorageService fileStorageService;

    @Inject
    DomainRankings domainRankings;

    @Inject
    ProcessHeartbeat processHeartbeat;
    @Inject
    DocumentDbReader documentDbReader;

    @Inject
    IndexFactory indexFactory;

    @BeforeEach
    public void setUp() throws IOException {

        testModule = new IndexQueryServiceIntegrationTestModule();
        Guice.createInjector(testModule).injectMembers(this);

        initialization.setReady();
    }

    @AfterEach
    public void tearDown() throws IOException {
        testModule.cleanUp();
    }

    private final MockDocumentMeta anyMetadata = new MockDocumentMeta(0, new DocumentMetadata(2, 0, 14, EnumSet.noneOf(DocumentFlags.class)));

    @Test
    public void testSimpleRetrieval() throws Exception {
        new MockData().add(
                d(1, 1),
                anyMetadata,
                w("hello", WordFlags.Title, 33, 55),
                w("world", WordFlags.Subjects, 34)
        ).load();

        var reader = indexFactory.getCombinedIndexReader();
        var query = reader.findFullWord(kw("hello")).build();

        var buffer = new LongQueryBuffer(32);
        query.getMoreResults(buffer);

        assertEquals(
                List.of(d(1, 1)),
                decode(buffer)
        );

        var helloMeta = td(reader, kw("hello"), d(1, 1));
        assertEquals(helloMeta.flags(), WordFlags.Title.asBit());
        assertEquals(IntList.of(33, 55), helloMeta.positions().values());

        var worldMeta = td(reader, kw("world"), d(1, 1));
        assertEquals(worldMeta.flags(), WordFlags.Subjects.asBit());
        assertEquals(IntList.of(34), worldMeta.positions().values());
    }

    TermData td(CombinedIndexReader reader, long wordId, MockDataDocument docId) {
        return (reader.getTermMetadata(Arena.global(), wordId, new CombinedDocIdList(docId.docId())).array())[0];
    }

    @Test
    public void testUnionRetrieval() throws Exception {
        new MockData()
                .add(
                        d(1, 1),
                        anyMetadata,
                        w("hello", WordFlags.Title),
                        w("world", WordFlags.Title)
                )
                .add(
                        d(1, 2),
                        anyMetadata,
                        w("world", WordFlags.Title)
                )
                .add(
                        d(1, 3),
                        anyMetadata,
                        w("world", WordFlags.Title)
                )
                .add(
                        d(2, 4),
                        anyMetadata,
                        w("hello", WordFlags.Title),
                        w("world", WordFlags.Title)
                )
                .load();

        var reader = indexFactory.getCombinedIndexReader();
        var query = reader
                .findFullWord(kw("hello"))
                .also(kw("world"))
                .build();

        var buffer = new LongQueryBuffer(32);
        query.getMoreResults(buffer);

        assertEquals(
                List.of(d(1, 1), d(2, 4)),
                decode(buffer)
        );
    }

    @Test
    public void testNotFilterRetrieval() throws Exception {
        new MockData()
                .add(
                        d(1, 1),
                        anyMetadata,
                        w("hello", WordFlags.Title),
                        w("world", WordFlags.Title),
                        w("goodbye", WordFlags.Title)
                )
                .add(
                        d(1, 2),
                        anyMetadata,
                        w("world", WordFlags.Title)
                )
                .add(
                        d(1, 3),
                        anyMetadata,
                        w("world", WordFlags.Title)
                )
                .add(
                        d(2, 4),
                        anyMetadata,
                        w("hello", WordFlags.Title),
                        w("world", WordFlags.Title)
                )
                .load();

        var reader = indexFactory.getCombinedIndexReader();
        var query = reader.findFullWord(kw("hello"))
                .also(kw("world"))
                .not(kw("goodbye"))
                .build();

        var buffer = new LongQueryBuffer(32);
        query.getMoreResults(buffer);

        assertEquals(
                List.of(d(2, 4)),
                decode(buffer)
        );
    }

    List<MockDataDocument> decode(LongQueryBuffer buffer) {
        List<MockDataDocument> result = new ArrayList<>();
        for (int i = 0; i < buffer.size(); i++) {
            result.add(new MockDataDocument(buffer.data.get(i)));
        }
        return result;
    }

    private MockDataDocument d(int domainId, int ordinal) {
        return new MockDataDocument(domainId, ordinal);
    }

    private void constructIndex() throws IOException {
        createForwardIndex();
        createFullReverseIndex();
        createPrioReverseIndex();
    }

    private void createFullReverseIndex() throws IOException {

        Path outputFileDocs = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.DOCS, ReverseIndexFullFileNames.FileVersion.NEXT);
        Path outputFileWords = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.WORDS, ReverseIndexFullFileNames.FileVersion.NEXT);
        Path outputFilePositions = ReverseIndexFullFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexFullFileNames.FileIdentifier.POSITIONS, ReverseIndexFullFileNames.FileVersion.NEXT);

        Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
        Path tmpDir = workDir.resolve("tmp");

        if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);

        var constructor =
                new ReverseIndexConstructor(
                        outputFileDocs,
                        outputFileWords,
                        outputFilePositions,
                        IndexJournalReader::singleFile,
                        DocIdRewriter.identity(),
                        tmpDir);
        constructor.createReverseIndex(new FakeProcessHeartbeat(), "name", workDir);
    }

    private void createPrioReverseIndex() throws IOException {

        Path outputFileDocs = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.DOCS, ReverseIndexPrioFileNames.FileVersion.NEXT);
        Path outputFileWords = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.WORDS, ReverseIndexPrioFileNames.FileVersion.NEXT);
        Path outputFilePositions = ReverseIndexPrioFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ReverseIndexPrioFileNames.FileIdentifier.POSITIONS, ReverseIndexPrioFileNames.FileVersion.NEXT);
        Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
        Path tmpDir = workDir.resolve("tmp");

        if (!Files.isDirectory(tmpDir)) Files.createDirectories(tmpDir);

        var constructor = new ReverseIndexConstructor(
                outputFileDocs,
                outputFileWords,
                outputFilePositions,
                IndexJournalReader::singleFile,
                DocIdRewriter.identity(),
                tmpDir);

        constructor.createReverseIndex(new FakeProcessHeartbeat(), "name", workDir);
    }

    private void createForwardIndex() throws IOException {

        Path workDir = IndexLocations.getIndexConstructionArea(fileStorageService);
        Path outputFileDocsId = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_ID, ForwardIndexFileNames.FileVersion.NEXT);
        Path outputFileDocsData = ForwardIndexFileNames.resolve(IndexLocations.getCurrentIndex(fileStorageService), ForwardIndexFileNames.FileIdentifier.DOC_DATA, ForwardIndexFileNames.FileVersion.NEXT);

        ForwardIndexConverter converter = new ForwardIndexConverter(processHeartbeat,
                IndexJournalReader.paging(workDir),
                outputFileDocsId,
                outputFileDocsData,
                domainRankings
        );

        converter.convert();
    }

    MurmurHash3_128 hasher = new MurmurHash3_128();

    long kw(String s) {
        return hasher.hashKeyword(s);
    }

    class MockData {
        private final Map<Long, List<MockDataKeyword>> allData = new HashMap<>();
        private final Map<Long, MockDocumentMeta> metaByDoc = new HashMap<>();

        public MockData add(MockDataDocument document,
                            MockDocumentMeta meta,
                            MockDataKeyword... words)
        {
            long id = UrlIdCodec.encodeId(document.domainId, document.ordinal);

            allData.computeIfAbsent(id, l -> new ArrayList<>()).addAll(List.of(words));
            metaByDoc.put(id, meta);

            return this;
        }

        void load() throws IOException, SQLException, URISyntaxException {
            allData.forEach((doc, words) -> {

                var meta = metaByDoc.get(doc);

                var header = new IndexJournalEntryHeader(
                        doc,
                        meta.features,
                        100,
                        meta.documentMetadata.encode()
                );

                String[] keywords = words.stream().map(w -> w.keyword).toArray(String[]::new);
                long[] metadata = words.stream().map(w -> w.termMetadata).mapToLong(Long::longValue).toArray();
                var positions = words.stream().map(w -> w.positions).map(pos -> GammaCodedSequence.generate(ByteBuffer.allocate(1024), pos.toIntArray())).toArray(GammaCodedSequence[]::new);

                indexJournalWriter.put(header,
                        new IndexJournalEntryData(keywords, metadata, positions));
            });

            var linkdbWriter = new DocumentDbWriter(
                    IndexLocations.getLinkdbLivePath(fileStorageService).resolve(DOCDB_FILE_NAME)
            );
            for (Long key : allData.keySet()) {
                linkdbWriter.add(new DocdbUrlDetail(
                        key,
                        new EdgeUrl("https://www.example.com"),
                        "test",
                        "test",
                        0.,
                        "HTML5",
                        0,
                        null,
                        0,
                        5
                ));
            }
            linkdbWriter.close();

            indexJournalWriter.close();
            constructIndex();
            documentDbReader.reconnect();
            statefulIndex.switchIndex();
        }
    }

    record MockDataDocument(int domainId, int ordinal) {
        public MockDataDocument(long encodedId) {
            this(UrlIdCodec.getDomainId(encodedId), UrlIdCodec.getDocumentOrdinal(encodedId));
        }

        public long docId() {
            return UrlIdCodec.encodeId(domainId, ordinal);
        }

    }
    record MockDocumentMeta(int features, DocumentMetadata documentMetadata) {}
    record MockDataKeyword(String keyword, long termMetadata, IntList positions) {}

    MockDataKeyword w(String keyword, WordFlags flags, int... positions) {
        return new MockDataKeyword(keyword, new WordMetadata(0L, EnumSet.of(flags)).encode(), IntList.of(positions));
    }
}
@ -13,7 +13,6 @@ import nu.marginalia.process.control.FakeProcessHeartbeat;
 import nu.marginalia.process.control.ProcessHeartbeat;
 import nu.marginalia.sequence.GammaCodedSequence;
 import nu.marginalia.storage.FileStorageService;
-import nu.marginalia.hash.MurmurHash3_128;
 import nu.marginalia.index.construction.DocIdRewriter;
 import nu.marginalia.index.construction.ReverseIndexConstructor;
 import nu.marginalia.index.forward.ForwardIndexConverter;
@ -142,6 +141,53 @@ public class IndexQueryServiceIntegrationSmokeTest {
         Assertions.assertArrayEquals(ids, actual);
     }
 
+    @Test
+    public void testSimple() throws Exception {
+        var linkdbWriter = new DocumentDbWriter(
+                IndexLocations.getLinkdbLivePath(fileStorageService)
+                        .resolve(DOCDB_FILE_NAME)
+        );
+        for (int i = 1; i < 512; i++) {
+            loadData(linkdbWriter, i);
+        }
+        linkdbWriter.close();
+        documentDbReader.reconnect();
+
+        indexJournalWriter.close();
+        constructIndex();
+        statefulIndex.switchIndex();
+
+        var rsp = queryService.justQuery(
+                SearchSpecification.builder()
+                        .queryLimits(new QueryLimits(10, 10, Integer.MAX_VALUE, 4000))
+                        .queryStrategy(QueryStrategy.SENTENCE)
+                        .year(SpecificationLimit.none())
+                        .quality(SpecificationLimit.none())
+                        .size(SpecificationLimit.none())
+                        .rank(SpecificationLimit.none())
+                        .rankingParams(ResultRankingParameters.sensibleDefaults())
+                        .domains(new ArrayList<>())
+                        .searchSetIdentifier("NONE")
+                        .query(
+                                SearchQuery.builder("2")
+                                        .include("2")
+                                        .build()
+                        ).build()
+        );
+
+        int[] idxes = new int[] { 62, 222, 382, 60, 124, 220, 284, 380, 444, 122 };
+        long[] ids = IntStream.of(idxes).mapToLong(Long::valueOf).toArray();
+        long[] actual = rsp.results
+                .stream()
+                .mapToLong(i -> i.rawIndexResult.getDocumentId())
+                .map(UrlIdCodec::getDocumentOrdinal)
+                .toArray();
+
+        System.out.println(Arrays.toString(actual));
+        System.out.println(Arrays.toString(ids));
+        Assertions.assertArrayEquals(ids, actual);
+    }
+
     @Test
     public void testDomainQuery() throws Exception {
 
@ -297,7 +343,6 @@ public class IndexQueryServiceIntegrationSmokeTest {
         return UrlIdCodec.encodeId((32 - (id % 32)), id);
     }
 
-    MurmurHash3_128 hasher = new MurmurHash3_128();
     @SneakyThrows
     public void loadData(DocumentDbWriter ldbw, int id) {
         int[] factors = IntStream
@ -305,22 +350,44 @@ public class IndexQueryServiceIntegrationSmokeTest {
                 .filter(v -> (id % v) == 0)
                 .toArray();
+
+        System.out.println("id:" + id + " factors: " + Arrays.toString(factors));
+
         long fullId = fullId(id);
 
-        var header = new IndexJournalEntryHeader(factors.length, 0, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
+        var header = new IndexJournalEntryHeader(factors.length, 0, 100, fullId, new DocumentMetadata(0, 0, 0, 0, id % 5, id, id % 20, (byte) 0).encode());
 
-        long[] data = new long[factors.length * 2];
-        for (int i = 0; i < factors.length; i++) {
-            data[2 * i] = hasher.hashNearlyASCII(Integer.toString(factors[i]));
-            data[2 * i + 1] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
-        }
 
         ldbw.add(new DocdbUrlDetail(
                 fullId, new EdgeUrl("https://www.example.com/"+id),
                 "test", "test", 0., "HTML5", 0, null, 0, 10
         ));
 
-        String[] keywords = IntStream.range(0, factors.length).mapToObj(Integer::toString).toArray(String[]::new);
+        String[] keywords = IntStream.of(factors).mapToObj(Integer::toString).toArray(String[]::new);
+        long[] metadata = new long[factors.length];
+        for (int i = 0; i < factors.length; i++) {
+            metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
+        }
+        GammaCodedSequence[] positions = new GammaCodedSequence[factors.length];
+        ByteBuffer wa = ByteBuffer.allocate(32);
+        for (int i = 0; i < factors.length; i++) {
+            positions[i] = GammaCodedSequence.generate(wa, factors);
+        }
+
+        indexJournalWriter.put(header, new IndexJournalEntryData(keywords, metadata, positions));
+    }
+
+    @SneakyThrows
+    public void loadDataWithDomain(DocumentDbWriter ldbw, int domain, int id) {
+        int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
+        long fullId = UrlIdCodec.encodeId(domain, id);
+        var header = new IndexJournalEntryHeader(factors.length, 0, 100, fullId, DocumentMetadata.defaultValue());
+
+        ldbw.add(new DocdbUrlDetail(
+                fullId, new EdgeUrl("https://www.example.com/"+id),
+                "test", "test", 0., "HTML5", 0, null, 0, 10
+        ));
+
+        String[] keywords = IntStream.of(factors).mapToObj(Integer::toString).toArray(String[]::new);
         long[] metadata = new long[factors.length];
         for (int i = 0; i < factors.length; i++) {
             metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
@ -334,30 +401,4 @@ public class IndexQueryServiceIntegrationSmokeTest {
         indexJournalWriter.put(header, new IndexJournalEntryData(keywords, metadata, positions));
     }
 
-    @SneakyThrows
-    public void loadDataWithDomain(DocumentDbWriter ldbw, int domain, int id) {
-        int[] factors = IntStream.rangeClosed(1, id).filter(v -> (id % v) == 0).toArray();
-        long fullId = UrlIdCodec.encodeId(domain, id);
-        var header = new IndexJournalEntryHeader(factors.length, 0, fullId, DocumentMetadata.defaultValue());
-
-        ldbw.add(new DocdbUrlDetail(
-                fullId, new EdgeUrl("https://www.example.com/"+id),
-                "test", "test", 0., "HTML5", 0, null, 0, 10
-        ));
-
-        String[] keywords = IntStream.range(0, factors.length).mapToObj(Integer::toString).toArray(String[]::new);
-        long[] metadata = new long[factors.length];
-        for (int i = 0; i < factors.length; i++) {
-            metadata[i] = new WordMetadata(i, EnumSet.of(WordFlags.Title)).encode();
-        }
-        GammaCodedSequence[] positions = new GammaCodedSequence[factors.length];
-        ByteBuffer wa = ByteBuffer.allocate(16);
-        for (int i = 0; i < factors.length; i++) {
-            positions[i] = GammaCodedSequence.generate(wa, i);
-        }
-
-        indexJournalWriter.put(header, new IndexJournalEntryData(keywords, metadata, positions));
-    }
-
 }
@ -565,6 +565,7 @@ public class IndexQueryServiceIntegrationTest {
         var header = new IndexJournalEntryHeader(
                 doc,
                 meta.features,
+                100,
                 meta.documentMetadata.encode()
         );
 
@ -1,100 +0,0 @@ (deleted file)
package nu.marginalia.ranking.results;

import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqDataInt;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
import nu.marginalia.model.idx.DocumentFlags;
import nu.marginalia.model.idx.WordFlags;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.idx.DocumentMetadata;
import nu.marginalia.model.idx.WordMetadata;
import nu.marginalia.ranking.results.factors.*;
import nu.marginalia.term_frequency_dict.TermFrequencyDict;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.mockito.Mockito;

import java.util.*;

import static org.mockito.Mockito.when;

class ResultValuatorTest {

    TermFrequencyDict dict;
    ResultValuator valuator;

    @BeforeEach
    public void setUp() {

        dict = Mockito.mock(TermFrequencyDict.class);
        when(dict.docCount()).thenReturn(100_000);

        valuator = new ResultValuator(
                new TermCoherenceFactor()
        );

    }

    CqDataInt frequencyData = new CqDataInt(new int[] { 10 });

    CompiledQueryLong titleOnlyLowCountSet = CompiledQuery.just(
            new SearchResultKeywordScore("bob", 1,
                    wordMetadata(Set.of(1), EnumSet.of(WordFlags.Title)))
    ).mapToLong(SearchResultKeywordScore::encodedWordMetadata);

    CompiledQueryLong highCountNoTitleSet = CompiledQuery.just(
            new SearchResultKeywordScore("bob", 1,
                    wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh)))
    ).mapToLong(SearchResultKeywordScore::encodedWordMetadata);;

    CompiledQueryLong highCountSubjectSet = CompiledQuery.just(
            new SearchResultKeywordScore("bob", 1,
                    wordMetadata(Set.of(1,3,4,6,7,9,10,11,12,14,15,16), EnumSet.of(WordFlags.TfIdfHigh, WordFlags.Subjects)))
    ).mapToLong(SearchResultKeywordScore::encodedWordMetadata);;


    @Test
    void evaluateTerms() {

        when(dict.getTermFreq("bob")).thenReturn(10);
        ResultRankingContext context = new ResultRankingContext(100000,
                ResultRankingParameters.sensibleDefaults(),
                new BitSet(),
                new BitSet(),
                frequencyData,
                frequencyData);

        long docMeta = docMetadata(0, 2010, 5, EnumSet.noneOf(DocumentFlags.class));
        int features = 0;

        double titleOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, docMeta, features, 10_000, context, null);
        double titleLongOnlyLowCount = valuator.calculateSearchResultValue(titleOnlyLowCountSet, docMeta, features, 10_000, context, null);
        double highCountNoTitle = valuator.calculateSearchResultValue(highCountNoTitleSet, docMeta, features, 10_000, context, null);
        double highCountSubject = valuator.calculateSearchResultValue(highCountSubjectSet, docMeta, features, 10_000, context, null);

        System.out.println(titleOnlyLowCount);
        System.out.println(titleLongOnlyLowCount);
        System.out.println(highCountNoTitle);
        System.out.println(highCountSubject);
    }

    private long docMetadata(int topology,
                             int year,
                             int quality,
                             EnumSet<DocumentFlags> flags) {
        return new DocumentMetadata(topology, PubDate.toYearByte(year), quality, flags).encode();
    }

    private long wordMetadata(Set<Integer> positions, Set<WordFlags> wordFlags) {
        long posBits = positions.stream()
                .mapToLong(i -> ((1L << i) & 0xFF_FFFF_FFFF_FFFFL))
                .reduce((a,b) -> a|b)
                .orElse(0L);

        return new WordMetadata(posBits, wordFlags).encode();
    }

}
@ -1,107 +0,0 @@ (deleted file)
package nu.marginalia.ranking.results.factors;

import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates;
import nu.marginalia.api.searchquery.model.results.SearchResultKeywordScore;
import nu.marginalia.bbpc.BrailleBlockPunchCards;
import nu.marginalia.model.idx.WordMetadata;
import org.junit.jupiter.api.Test;

import java.util.ArrayList;
import java.util.List;

import static org.junit.jupiter.api.Assertions.*;

class TermCoherenceFactorTest {

    TermCoherenceFactor termCoherenceFactor = new TermCoherenceFactor();
    @Test
    public void testAllBitsSet() {
        var allPositionsSet = createSet(
                ~0L,
                ~0L
        );

        long mask = CompiledQueryAggregates.longBitmaskAggregate(
                allPositionsSet,
                SearchResultKeywordScore::positions
        );

        assertEquals(1.0, termCoherenceFactor.bitsSetFactor(mask), 0.01);

        assertEquals(1.0,
                termCoherenceFactor.calculateOverlap(
                        allPositionsSet.mapToLong(SearchResultKeywordScore::encodedWordMetadata)
                )
        );

    }

    @Test
    public void testNoBitsSet() {
        var allPositionsSet = createSet(
                0, 0
        );

        long mask = CompiledQueryAggregates.longBitmaskAggregate(allPositionsSet, score -> score.positions() & WordMetadata.POSITIONS_MASK);

        assertEquals(0, termCoherenceFactor.bitsSetFactor(mask), 0.01);

        assertEquals(0, termCoherenceFactor.calculateOverlap(allPositionsSet.mapToLong(SearchResultKeywordScore::encodedWordMetadata)));
    }

    @Test @SuppressWarnings("unchecked")
    public void testLowPosMatches() {
        var positions = createSet(
                List.of(0, 1, 2, 3), List.of(0, 1, 2, 3)
        );

        long mask = CompiledQueryAggregates.longBitmaskAggregate(positions, score -> score.positions() & WordMetadata.POSITIONS_MASK);
        printMask(mask);

    }

    @Test @SuppressWarnings("unchecked")
    public void testHiPosMatches() {
        var positions = createSet(
                List.of(55, 54, 53, 52), List.of(55, 54, 53, 52)
        );

        long mask = CompiledQueryAggregates.longBitmaskAggregate(positions, score -> score.positions() & WordMetadata.POSITIONS_MASK);
        printMask(mask);
    }

    @Test
    public void testBitMatchScaling() {
        for (int i = 1; i < 48; i++) {
            System.out.println(i + ":" + termCoherenceFactor.bitsSetFactor((1L << i) - 1));
        }
    }

    void printMask(long mask) {
        System.out.println(BrailleBlockPunchCards.printBits(mask, 48));
    }

    CompiledQuery<SearchResultKeywordScore> createSet(List<Integer>... maskPositions) {
        long[] positions = new long[maskPositions.length];

        for (int i = 0; i < maskPositions.length; i++) {
            for (long pos : maskPositions[i]) {
                positions[i] |= (1L<<pos);
            }
        }

        return createSet(positions);
    }

    CompiledQuery<SearchResultKeywordScore> createSet(long... positionMasks) {
        List<SearchResultKeywordScore> keywords = new ArrayList<>();

        for (int i = 0; i < positionMasks.length; i++) {
            keywords.add(new SearchResultKeywordScore("", 0,
                    new WordMetadata(positionMasks[i] & WordMetadata.POSITIONS_MASK, (byte) 0).encode()));
        }

        return CompiledQuery.just(keywords.toArray(SearchResultKeywordScore[]::new));
    }
}
@ -17,12 +17,13 @@ public class EliasGammaCodec implements IntIterator {
 
     private final BitReader reader;
     int rem = 0;
-    private int last = 0;
+    private int last;
     private int next = 0;
 
-    private EliasGammaCodec(ByteBuffer buffer) {
+    private EliasGammaCodec(ByteBuffer buffer, int zero) {
         reader = new BitReader(buffer);
 
+        last = zero;
         int bits = reader.takeWhileZero();
 
         if (!reader.hasMore()) {
@ -33,9 +34,24 @@ public class EliasGammaCodec implements IntIterator {
         }
     }
 
+    public static int readCount(ByteBuffer buffer) {
+        var reader = new BitReader(buffer);
+
+        if (reader.getCurrentValue() > 0) {
+            int bits = reader.takeWhileZero();
+            return reader.get(bits);
+        }
+        else {
+            return 0;
+        }
+    }
+
     /** Decode a sequence of integers from a ByteBuffer using the Elias Gamma code */
     public static IntIterator decode(ByteBuffer buffer) {
-        return new EliasGammaCodec(buffer);
+        return new EliasGammaCodec(buffer, 0);
+    }
+    public static IntIterator decodeWithOffset(ByteBuffer buffer, int offset) {
+        return new EliasGammaCodec(buffer, offset);
     }
 
     /** Encode a sequence of integers into a ByteBuffer using the Elias Gamma code.
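As background on the codec these changes extend (a textbook illustration, not the project's exact bit-level conventions): Elias gamma writes a positive integer as a unary run of zeros giving its length, followed by the value's binary digits. That prefix structure is what lets the new readCount apparently read just a leading count field, and lets the new zero constructor argument shift the decoded values without re-reading the stream. A minimal sketch of the classic code over bit strings:

class EliasGammaSketch {
    // Encode n >= 1 as (bitLength - 1) zeros followed by n's binary digits, e.g. 5 -> "00101".
    static String encode(int n) {
        String bin = Integer.toBinaryString(n);
        return "0".repeat(bin.length() - 1) + bin;
    }

    // Count the leading zeros, then interpret the next zeros + 1 bits as the value.
    static int decode(String bits) {
        int zeros = 0;
        while (bits.charAt(zeros) == '0') zeros++;
        return Integer.parseInt(bits.substring(zeros, 2 * zeros + 1), 2);
    }

    public static void main(String[] args) {
        System.out.println(encode(5));           // 00101
        System.out.println(decode(encode(12)));  // 12
    }
}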
@ -16,6 +16,7 @@ import java.util.StringJoiner;
  * */
 public class GammaCodedSequence implements BinarySerializable, Iterable<Integer> {
     private final ByteBuffer raw;
 
     int startPos = 0;
     int startLimit = 0;
 
@ -43,6 +44,12 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
         startLimit = bytes.limit();
     }
 
+    public GammaCodedSequence(ByteBuffer bytes, int startPos, int startLimit) {
+        this.raw = bytes;
+        this.startPos = startPos;
+        this.startLimit = startLimit;
+    }
+
     public GammaCodedSequence(byte[] bytes) {
         raw = ByteBuffer.allocate(bytes.length);
         raw.put(bytes);
@ -72,6 +79,18 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
         return EliasGammaCodec.decode(raw);
     }
 
+    /** Return an iterator over the sequence with a constant offset applied to each value.
+     * This is useful for comparing sequences with different offsets, and adds zero
+     * extra cost to the decoding process which is already based on adding
+     * relative differences.
+     * */
+    public IntIterator offsetIterator(int offset) {
+        raw.position(startPos);
+        raw.limit(startLimit);
+
+        return EliasGammaCodec.decodeWithOffset(raw, offset);
+    }
+
     public IntList values() {
         var intItr = iterator();
         IntArrayList ret = new IntArrayList(8);
@ -81,18 +100,6 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
         return ret;
     }
 
-    /** Decode the sequence into an IntList;
-     * this is a somewhat slow operation,
-     * iterating over the data directly more performant */
-    public IntList decode() {
-        IntArrayList ret = new IntArrayList(8);
-        var iter = iterator();
-        while (iter.hasNext()) {
-            ret.add(iter.nextInt());
-        }
-        return ret;
-    }
-
     public int hashCode() {
         return raw.hashCode();
     }
@ -116,7 +123,11 @@ public class GammaCodedSequence implements BinarySerializable, Iterable<Integer>
         return raw;
     }
 
-    public int size() {
+    public int bufferSize() {
         return raw.capacity();
     }
 
+    public int valueCount() {
+        return EliasGammaCodec.readCount(buffer());
+    }
 }
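A hypothetical usage sketch of the new offsetIterator (class name and position values invented here; the pattern mirrors intersectSequencesOffsetMatch in the test added later in this commit): shifting one term's positions by a constant turns an adjacency check into a plain intersection.

import java.nio.ByteBuffer;

import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.SequenceOperations;

class OffsetIteratorSketch {
    public static void main(String[] args) {
        ByteBuffer work = ByteBuffer.allocate(256);

        // "hello" at word positions 3 and 17, "world" at 4 and 29
        GammaCodedSequence hello = GammaCodedSequence.generate(work, 3, 17);
        GammaCodedSequence world = GammaCodedSequence.generate(work, 4, 29);

        // Shift "world" back by one position; 4 - 1 == 3, which "hello" also contains,
        // so the phrase "hello world" registers as an ordinary intersection.
        boolean phraseMatch = SequenceOperations.intersectSequences(
                hello.offsetIterator(0),
                world.offsetIterator(-1));

        System.out.println(phraseMatch); // expected: true
    }
}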
@ -0,0 +1,86 @@ (new file)
package nu.marginalia.sequence;

import it.unimi.dsi.fastutil.ints.IntIterator;

public class SequenceOperations {

    /** Return true if the sequences intersect, false otherwise.
     * */
    public static boolean intersectSequences(IntIterator... sequences) {

        if (sequences.length <= 1)
            return true;

        // Initialize values and find the maximum value
        int[] values = new int[sequences.length];

        for (int i = 0; i < sequences.length; i++) {
            if (sequences[i].hasNext())
                values[i] = sequences[i].nextInt();
            else
                return false;
        }

        // Intersect the sequences by advancing all values smaller than the maximum seen so far
        // until they are equal to the maximum value, or until the end of the sequence is reached
        int max = Integer.MIN_VALUE;
        int successes = 0;
        for (int i = 0; successes < sequences.length; i = (i + 1) % sequences.length)
        {
            if (values[i] == max) {
                successes++;
            } else {
                successes = 0;

                // Discard values until we reach the maximum value seen so far,
                // or until the end of the sequence is reached
                while (values[i] < max) {
                    if (sequences[i].hasNext())
                        values[i] = sequences[i].nextInt();
                    else
                        return false;
                }

                // Update the maximum value, if necessary
                max = Math.max(max, values[i]);
            }
        }

        return true;
    }

    /** Return the minimum word distance between two sequences, or a negative value if either sequence is empty.
     * */
    public static int minDistance(IntIterator seqA, IntIterator seqB)
    {
        int minDistance = Integer.MAX_VALUE;

        if (!seqA.hasNext() || !seqB.hasNext())
            return -1;

        int a = seqA.nextInt();
        int b = seqB.nextInt();

        while (true) {
            int distance = Math.abs(a - b);
            if (distance < minDistance)
                minDistance = distance;

            if (a <= b) {
                if (seqA.hasNext()) {
                    a = seqA.nextInt();
                } else {
                    break;
                }
            } else {
                if (seqB.hasNext()) {
                    b = seqB.nextInt();
                } else {
                    break;
                }
            }
        }

        return minDistance;
    }
}
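A companion sketch for minDistance (again with invented values): it walks the two sorted position lists in a single merge-like pass and reports how close the two terms ever get, which is the quantity the reworked TermCoherenceFactor code earlier in this commit averages over term pairs.

import java.nio.ByteBuffer;

import nu.marginalia.sequence.GammaCodedSequence;
import nu.marginalia.sequence.SequenceOperations;

class MinDistanceSketch {
    public static void main(String[] args) {
        ByteBuffer work = ByteBuffer.allocate(256);

        GammaCodedSequence a = GammaCodedSequence.generate(work, 1, 9, 40);
        GammaCodedSequence b = GammaCodedSequence.generate(work, 12, 44);

        // Closest pair across the two lists is (9, 12), so this prints 3; if either
        // sequence were empty, minDistance returns a negative sentinel instead.
        int proximity = SequenceOperations.minDistance(a.iterator(), b.iterator());
        System.out.println(proximity);
    }
}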
@ -20,6 +20,10 @@ public class BitReader {
         this.currentValue = 0;
     }
 
+    public long getCurrentValue() {
+        return currentValue;
+    }
+
     /** Read the next bit from the buffer */
     public boolean getBit() {
         if (bitPosition <= 0) {
@ -0,0 +1,75 @@ (new file)
package nu.marginalia.sequence;

import it.unimi.dsi.fastutil.ints.IntIterator;
import org.junit.jupiter.api.Test;

import java.nio.ByteBuffer;

import static org.junit.jupiter.api.Assertions.*;

class SequenceOperationsTest {

    @Test
    void intersectSequencesSingle() {
        ByteBuffer wa = ByteBuffer.allocate(1024);
        GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1);

        assertTrue(SequenceOperations.intersectSequences(seq1.iterator()));
    }

    @Test
    void intersectSequencesTrivialMatch() {
        ByteBuffer wa = ByteBuffer.allocate(1024);
        GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1);
        GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 1);

        assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator()));
    }

    @Test
    void intersectSequencesTrivialMismatch() {
        ByteBuffer wa = ByteBuffer.allocate(1024);
        GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1);
        GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2);

        assertFalse(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator()));
    }

    @Test
    void intersectSequencesOffsetMatch() {
        ByteBuffer wa = ByteBuffer.allocate(1024);
        GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1);
        GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 3);

        assertTrue(SequenceOperations.intersectSequences(seq1.offsetIterator(0), seq2.offsetIterator(-2)));
    }

    @Test
    void intersectSequencesDeepMatch() {
        ByteBuffer wa = ByteBuffer.allocate(1024);
        GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1, 3, 4, 7, 8, 9, 11);
        GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2, 5, 8, 14);

        assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator()));
    }

    @Test
    void intersectSequencesDeepMatch3() {
        ByteBuffer wa = ByteBuffer.allocate(1024);
        GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1, 3, 4, 7, 8, 9, 11);
        GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2, 5, 8, 14);
        GammaCodedSequence seq3 = GammaCodedSequence.generate(wa, 1, 5, 8, 9);

        assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator(), seq3.iterator()));
    }

    @Test
    void intersectSequencesDeepMismatch() {
        ByteBuffer wa = ByteBuffer.allocate(1024);
        GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1, 3, 4, 7, 8, 9, 11);
        GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2, 5, 14);

        assertFalse(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator()));
    }

}
@ -26,6 +26,8 @@ public class DocumentRecordKeywordsProjection {
     public int htmlFeatures;
     public long documentMetadata;
 
+    public int length;
+
     public List<String> words;
     public TLongList metas;
     public List<GammaCodedSequence> positions;
@ -39,13 +41,14 @@ public class DocumentRecordKeywordsProjection {
     }
 
     public static Collection<String> requiredColumns() {
-        return List.of("domain", "ordinal", "htmlFeatures", "word", "wordMeta", "documentMetadata");
+        return List.of("domain", "ordinal", "htmlFeatures", "word", "wordMeta", "documentMetadata", "length");
     }
 
     @SneakyThrows
     public DocumentRecordKeywordsProjection add(String heading, Object value) {
         switch (heading) {
             case "domain" -> domain = (String) value;
+            case "length" -> length = (Integer) value;
             case "ordinal" -> ordinal = (Integer) value;
             case "htmlFeatures" -> htmlFeatures = (Integer) value;
             case "documentMetadata" -> documentMetadata = (Long) value;
@ -6,12 +6,10 @@ import lombok.SneakyThrows;
 import nu.marginalia.IndexLocations;
 import nu.marginalia.index.journal.model.IndexJournalEntryData;
 import nu.marginalia.storage.FileStorageService;
-import nu.marginalia.hash.MurmurHash3_128;
 import nu.marginalia.index.journal.model.IndexJournalEntryHeader;
 import nu.marginalia.index.journal.writer.IndexJournalWriterPagingImpl;
 import nu.marginalia.index.journal.writer.IndexJournalWriter;
 import nu.marginalia.keyword.model.DocumentKeywords;
-import nu.marginalia.model.idx.DocumentMetadata;
 import nu.marginalia.index.journal.IndexJournalFileNames;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@ -41,18 +39,11 @@ public class LoaderIndexJournalWriter {
         indexWriter = new IndexJournalWriterPagingImpl(indexArea);
     }
 
-    public void putWords(long combinedId,
-                         int features,
-                         DocumentMetadata metadata,
-                         DocumentKeywords wordSet) {
-
-        putWords(combinedId, features, metadata.encode(), wordSet);
-    }
-
     @SneakyThrows
     public void putWords(long combinedId,
                          int features,
                          long metadata,
+                         int length,
                          DocumentKeywords wordSet) {
 
         if (wordSet.isEmpty()) {
@ -65,7 +56,7 @@ public class LoaderIndexJournalWriter {
             return;
         }
 
-        var header = new IndexJournalEntryHeader(combinedId, features, metadata);
+        var header = new IndexJournalEntryHeader(combinedId, features, length, metadata);
         var data = new IndexJournalEntryData(wordSet.keywords, wordSet.metadata, wordSet.positions);
 
         indexWriter.put(header, data);
@ -75,6 +75,7 @@ public class KeywordLoaderService {
         writer.putWords(combinedId,
                 projection.htmlFeatures,
                 projection.documentMetadata,
+                projection.length,
                 words);
     }
 }
@ -91,7 +91,7 @@ public class SearchServicePaperDoll extends AbstractModule {
                            long positions)
     {
         results.add(new DecoratedSearchResultItem(
-                new SearchResultItem(url.hashCode(), 2, 3, false),
+                new SearchResultItem(url.hashCode(), 2, 3),
                 new EdgeUrl(url),
                 title,
                 description,