(index) Experimental initial integration of document spans into index

This commit is contained in:
Viktor Lofgren 2024-07-30 12:01:53 +02:00
parent 80900107f7
commit b316b55be9
29 changed files with 394 additions and 162 deletions

View File

@ -59,13 +59,4 @@ public class CompiledQueryAggregates {
return new ArrayList<>(query.root().visit(new CqQueryPathsOperator(query))); return new ArrayList<>(query.root().visit(new CqQueryPathsOperator(query)));
} }
/** Using the bitwise AND operator, aggregate all possible combined values of the long generated by the provided operator */
public static <T> LongSet positionsAggregate(CompiledQuery<T> query, ToLongFunction<T> operator) {
return query.root().visit(new CqPositionsOperator(query, operator));
}
/** Using the bitwise AND operator, aggregate all possible combined values of the long generated by the provided operator */
public static <T> LongSet positionsAggregate(CompiledQueryLong query, LongUnaryOperator operator) {
return query.root().visit(new CqPositionsOperator(query, operator));
}
} }

View File

@ -1,85 +0,0 @@
package nu.marginalia.api.searchquery.model.compiled.aggregate;
import it.unimi.dsi.fastutil.longs.LongArraySet;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongSet;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;
import java.util.List;
import java.util.function.IntToLongFunction;
import java.util.function.LongUnaryOperator;
import java.util.function.ToLongFunction;
/**
 * Visitor that reduces a compiled query to a set of long values.
 * <p>
 * Each leaf contributes a single value obtained by applying the supplied operator
 * to the query element at the leaf's index. OR nodes produce the union of their
 * children's sets, and AND nodes produce the bitwise AND of every pairing of values
 * drawn from their children's sets.
 */
public class CqPositionsOperator implements CqExpression.ObjectVisitor<LongSet> {

    /** Maps a leaf index to the long value associated with that leaf */
    private final IntToLongFunction valueAtIndex;

    public <T> CqPositionsOperator(CompiledQuery<T> query, ToLongFunction<T> operator) {
        valueAtIndex = leafIdx -> operator.applyAsLong(query.at(leafIdx));
    }

    public CqPositionsOperator(CompiledQueryLong query, LongUnaryOperator operator) {
        valueAtIndex = leafIdx -> operator.applyAsLong(query.at(leafIdx));
    }

    /** A leaf evaluates to a one-element set holding the value for that leaf */
    @Override
    public LongSet onLeaf(int idx) {
        LongSet singleton = newSet(1);
        singleton.add(valueAtIndex.applyAsLong(idx));
        return singleton;
    }

    /** An OR node evaluates to the union of the sets of all its children */
    @Override
    public LongSet onOr(List<? extends CqExpression> parts) {
        LongSet union = newSet(parts.size());
        for (CqExpression child : parts) {
            union.addAll(child.visit(this));
        }
        return union;
    }

    /** An AND node folds the sets of its children together, left to right, with {@link #combineSets} */
    @Override
    public LongSet onAnd(List<? extends CqExpression> parts) {
        LongSet accumulated = new LongArraySet();
        for (CqExpression child : parts) {
            accumulated = combineSets(accumulated, child.visit(this));
        }
        return accumulated;
    }

    /**
     * Returns the set of {@code x & y} for every x in {@code a} and every y in {@code b}.
     * If either set is empty, the other set is returned as-is, so an empty set
     * acts as a neutral element rather than wiping out the result.
     */
    private LongSet combineSets(LongSet a, LongSet b) {
        if (a.isEmpty()) {
            return b;
        }
        if (b.isEmpty()) {
            return a;
        }

        LongSet combined = newSet(a.size() * b.size());
        for (var outer = a.longIterator(); outer.hasNext(); ) {
            long left = outer.nextLong();
            for (var inner = b.longIterator(); inner.hasNext(); ) {
                combined.add(left & inner.nextLong());
            }
        }
        return combined;
    }

    /** Allocate a new set suitable for a collection with the provided cardinality */
    private LongSet newSet(int cardinality) {
        // Small sets are array-backed, larger ones are hash-backed
        return cardinality < 8
                ? new LongArraySet(cardinality)
                : new LongOpenHashSet(cardinality);
    }
}

View File

@ -17,6 +17,7 @@ dependencies {
implementation project(':code:libraries:btree') implementation project(':code:libraries:btree')
implementation project(':code:libraries:slop') implementation project(':code:libraries:slop')
implementation project(':code:libraries:coded-sequence') implementation project(':code:libraries:coded-sequence')
implementation project(':code:libraries:language-processing')
implementation project(':code:index:query') implementation project(':code:index:query')
implementation project(':code:index:index-journal') implementation project(':code:index:index-journal')
implementation project(':code:common:model') implementation project(':code:common:model')

View File

@ -1,6 +1,6 @@
package nu.marginalia.index.forward; package nu.marginalia.index.forward;
class ForwardIndexParameters { public class ForwardIndexParameters {
public static final int ENTRY_SIZE = 3; public static final int ENTRY_SIZE = 3;
public static final int METADATA_OFFSET = 0; public static final int METADATA_OFFSET = 0;
public static final int FEATURES_OFFSET = 1; public static final int FEATURES_OFFSET = 1;

View File

@ -3,11 +3,14 @@ package nu.marginalia.index.forward;
import gnu.trove.map.hash.TLongIntHashMap; import gnu.trove.map.hash.TLongIntHashMap;
import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory; import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.forward.spans.DocumentSpans;
import nu.marginalia.index.forward.spans.ForwardIndexSpansReader;
import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.id.UrlIdCodec;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import java.io.IOException; import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
@ -30,6 +33,7 @@ public class ForwardIndexReader {
private final LongArray data; private final LongArray data;
private final ForwardIndexSpansReader spansReader; private final ForwardIndexSpansReader spansReader;
private final Logger logger = LoggerFactory.getLogger(getClass()); private final Logger logger = LoggerFactory.getLogger(getClass());
public ForwardIndexReader(Path idsFile, public ForwardIndexReader(Path idsFile,
@ -121,6 +125,21 @@ public class ForwardIndexReader {
return idToOffset.get(docId); return idToOffset.get(docId);
} }
/**
 * Reads the document spans for the given document.
 * <p>
 * An empty {@link DocumentSpans} is returned when the document has no entry in
 * the index (negative lookup result), or when reading the spans fails with an
 * IOException; the failure is logged in the latter case.
 *
 * @param arena arena handed to the spans reader for its buffer allocation
 * @param docId id of the document to look up
 */
public DocumentSpans getDocumentSpans(Arena arena, long docId) {
    final long entryIdx = idxForDoc(docId);

    if (entryIdx >= 0) {
        // The spans slot of the entry holds the encoded location of the span data
        final long encodedOffset = data.get(ENTRY_SIZE * entryIdx + SPANS_OFFSET);

        try {
            return spansReader.readSpans(arena, encodedOffset);
        }
        catch (IOException ex) {
            logger.error("Failed to read spans for doc " + docId, ex);
        }
    }

    return new DocumentSpans();
}
public int totalDocCount() { public int totalDocCount() {
return idToOffset.size(); return idToOffset.size();

View File

@ -1,9 +1,11 @@
package nu.marginalia.index.forward; package nu.marginalia.index.forward.construction;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap; import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory; import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.domainrankings.DomainRankings; import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexParameters;
import nu.marginalia.index.forward.spans.ForwardIndexSpansWriter;
import nu.marginalia.index.journal.IndexJournal; import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata; import nu.marginalia.model.idx.DocumentMetadata;

View File

@ -0,0 +1,77 @@
package nu.marginalia.index.forward.spans;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.SequenceOperations;
/**
 * The spans of a single type within a document, stored as a sequence of
 * interlaced start and end positions.  An instance created with the no-arg
 * constructor has no backing sequence and represents "no spans of this type".
 */
public class DocumentSpan {

    /** A list of the interlaced start and end positions of each span in the document of this type */
    private final CodedSequence startsEnds;

    public DocumentSpan(CodedSequence startsEnds) {
        this.startsEnds = startsEnds;
    }

    /** Creates an empty span list with no backing sequence */
    public DocumentSpan() {
        this.startsEnds = null;
    }

    /**
     * Returns true if some span satisfies {@code start <= position < end}.
     * The scan stops at the first span that starts after the position, so it
     * relies on the spans being stored in ascending order.
     */
    public boolean containsPosition(int position) {
        if (startsEnds == null) {
            return false;
        }

        var iter = startsEnds.iterator();
        while (iter.hasNext()) {
            int start = iter.nextInt();
            if (start > position) {
                return false;
            }
            int end = iter.nextInt();
            if (end > position) {
                return true;
            }
        }

        return false;
    }

    /**
     * Returns true if some span satisfies {@code start <= rangeStart} and
     * {@code end > rangeStart + len}.  Like {@link #containsPosition(int)},
     * the scan stops at the first span that starts after {@code rangeStart}.
     * <p>
     * NOTE(review): the end comparison is strict, so a range with
     * {@code rangeStart + len == end} is not reported as contained;
     * confirm this is the intended boundary behavior.
     */
    public boolean containsRange(int rangeStart, int len) {
        if (startsEnds == null) {
            return false;
        }

        var iter = startsEnds.iterator();
        while (iter.hasNext()) {
            int start = iter.nextInt();
            if (start > rangeStart) {
                return false;
            }
            int end = iter.nextInt();
            if (end > rangeStart + len) {
                return true;
            }
        }

        return false;
    }

    /**
     * Delegates to {@link SequenceOperations#intersectSequences} with this span
     * list's start/end iterator and the iterator of the provided sequence.
     * <p>
     * NOTE(review): the values fed into the intersection are the raw span
     * boundaries, not every position inside the spans; confirm this gives
     * the intended notion of overlap.
     */
    public boolean overlapsRange(CodedSequence sequence) {
        return SequenceOperations.intersectSequences(iterator(), sequence.iterator());
    }

    /** Returns an iterator over the start and end positions of each span in the document of this type */
    public IntIterator iterator() {
        if (null == startsEnds) {
            return IntList.of().iterator();
        }

        return startsEnds.iterator();
    }

    /**
     * Returns the number of spans, i.e. half the number of stored start/end values.
     * An instance without a backing sequence has size 0.
     */
    public int size() {
        // The empty instance has no backing sequence; previously this dereferenced
        // null and threw a NullPointerException
        if (startsEnds == null) {
            return 0;
        }

        return startsEnds.valueCount() / 2;
    }
}

View File

@ -0,0 +1,35 @@
package nu.marginalia.index.forward.spans;
import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.sequence.CodedSequence;
/**
 * Holder for the spans of a document, one {@link DocumentSpan} per tracked
 * HTML tag type.  Every field starts out as a shared empty span and is replaced
 * when {@link #accept(byte, CodedSequence)} is called with the matching tag code.
 */
public class DocumentSpans {
    /** Shared placeholder for tag types that have no span data; never reassigned */
    private static final DocumentSpan EMPTY_SPAN = new DocumentSpan();

    public DocumentSpan title = EMPTY_SPAN;
    public DocumentSpan heading = EMPTY_SPAN;

    public DocumentSpan nav = EMPTY_SPAN;
    public DocumentSpan pageHeader = EMPTY_SPAN;
    public DocumentSpan pageFooter = EMPTY_SPAN;

    public DocumentSpan code = EMPTY_SPAN;
    public DocumentSpan pre = EMPTY_SPAN;

    /**
     * Stores the given positions as the span list of the tag type identified by
     * {@code code}.  Codes that match none of the tracked tags are ignored and
     * leave every field untouched.
     *
     * @param code      tag code, compared against the {@code code} field of the {@link HtmlTag} constants
     * @param positions sequence wrapped in the new {@link DocumentSpan}
     */
    void accept(byte code, CodedSequence positions) {
        if (code == HtmlTag.HEADING.code)
            this.heading = new DocumentSpan(positions);
        else if (code == HtmlTag.TITLE.code)
            this.title = new DocumentSpan(positions);
        else if (code == HtmlTag.NAV.code)
            this.nav = new DocumentSpan(positions);
        else if (code == HtmlTag.PAGE_HEADER.code)
            this.pageHeader = new DocumentSpan(positions);
        else if (code == HtmlTag.PAGE_FOOTER.code)
            this.pageFooter = new DocumentSpan(positions);
        else if (code == HtmlTag.CODE.code)
            this.code = new DocumentSpan(positions);
        else if (code == HtmlTag.PRE.code)
            this.pre = new DocumentSpan(positions);
    }
}

View File

@ -1,6 +1,5 @@
package nu.marginalia.index.forward; package nu.marginalia.index.forward.spans;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.GammaCodedSequence;
import java.io.IOException; import java.io.IOException;
@ -9,8 +8,6 @@ import java.nio.channels.FileChannel;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import java.nio.file.StandardOpenOption; import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.List;
@SuppressWarnings("preview") @SuppressWarnings("preview")
public class ForwardIndexSpansReader implements AutoCloseable { public class ForwardIndexSpansReader implements AutoCloseable {
@ -20,9 +17,9 @@ public class ForwardIndexSpansReader implements AutoCloseable {
this.spansFileChannel = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ); this.spansFileChannel = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ);
} }
public List<SpanData> readSpans(Arena arena, long encodedOffset) throws IOException { public DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException {
long size = encodedOffset & 0xFFF_FFFF; long size = SpansCodec.decodeSize(encodedOffset);
long offset = encodedOffset >>> 28; long offset = SpansCodec.decodeStartOffset(encodedOffset);
var buffer = arena.allocate(size).asByteBuffer(); var buffer = arena.allocate(size).asByteBuffer();
buffer.clear(); buffer.clear();
@ -33,22 +30,16 @@ public class ForwardIndexSpansReader implements AutoCloseable {
int count = buffer.get(); int count = buffer.get();
List<SpanData> ret = new ArrayList<>(); DocumentSpans ret = new DocumentSpans();
while (count-- > 0) { while (count-- > 0) {
byte code = buffer.get(); byte code = buffer.get();
short len = buffer.getShort(); short len = buffer.getShort();
final int pos = buffer.position(); ret.accept(code, new GammaCodedSequence(buffer.slice(buffer.position(), len)));
// Decode the gamma-coded sequence; this will advance the buffer position
// in a not entirely predictable way, so we need to save the position
buffer.limit(buffer.position() + len);
var sequence = new GammaCodedSequence(buffer).values();
ret.add(new SpanData(code, sequence));
// Reset the buffer position to the end of the span // Reset the buffer position to the end of the span
buffer.position(pos + len); buffer.position(buffer.position() + len);
buffer.limit(buffer.capacity());
} }
return ret; return ret;
@ -59,5 +50,4 @@ public class ForwardIndexSpansReader implements AutoCloseable {
spansFileChannel.close(); spansFileChannel.close();
} }
public record SpanData(byte code, IntList data) {}
} }

View File

@ -1,4 +1,4 @@
package nu.marginalia.index.forward; package nu.marginalia.index.forward.spans;
import java.io.IOException; import java.io.IOException;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
@ -42,8 +42,7 @@ public class ForwardIndexSpansWriter implements AutoCloseable {
} }
public long endRecord() { public long endRecord() {
return stateStartOffset << 28 | stateLength; return SpansCodec.encode(stateStartOffset, stateLength);
} }
@Override @Override

View File

@ -0,0 +1,17 @@
package nu.marginalia.index.forward.spans;
/**
 * Packs the start offset and size of a spans record into a single long.
 * The size occupies the low 28 bits and the start offset the remaining
 * 36 high bits.
 */
public class SpansCodec {
    /** Number of low bits holding the size */
    private static final int SIZE_BITS = 28;

    /** Mask selecting the size bits; also the largest encodable size */
    private static final long SIZE_MASK = (1L << SIZE_BITS) - 1;

    /** Largest start offset that survives the shift without losing high bits */
    private static final long MAX_START_OFFSET = (1L << (Long.SIZE - SIZE_BITS)) - 1;

    /**
     * Encodes a start offset and a size into one long.
     *
     * @param startOffset start offset, expected to be in [0, 2^36)
     * @param size        size, expected to be in [0, 2^28)
     * @return the packed value, decodable with {@link #decodeStartOffset(long)} and {@link #decodeSize(long)}
     */
    public static long encode(long startOffset, long size) {
        // Out-of-range inputs cannot round-trip: a negative or oversized size is
        // truncated by the mask, and an oversized or negative offset loses or
        // gains high bits through the shift.
        assert size >= 0 && size <= SIZE_MASK : "Size must be non-negative and less than 2^28";
        assert startOffset >= 0 && startOffset <= MAX_START_OFFSET : "Start offset must be non-negative and less than 2^36";

        return (startOffset << SIZE_BITS) | (size & SIZE_MASK);
    }

    /** Extracts the start offset from a value produced by {@link #encode(long, long)} */
    public static long decodeStartOffset(long encoded) {
        return encoded >>> SIZE_BITS;
    }

    /** Extracts the size from a value produced by {@link #encode(long, long)} */
    public static long decodeSize(long encoded) {
        return encoded & SIZE_MASK;
    }
}

View File

@ -2,6 +2,7 @@ package nu.marginalia.index.forward;
import lombok.SneakyThrows; import lombok.SneakyThrows;
import nu.marginalia.index.domainrankings.DomainRankings; import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
import nu.marginalia.index.journal.IndexJournal; import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.journal.IndexJournalSlopWriter; import nu.marginalia.index.journal.IndexJournalSlopWriter;
import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.id.UrlIdCodec;

View File

@ -1,6 +1,7 @@
package nu.marginalia.index.forward; package nu.marginalia.index.forward;
import it.unimi.dsi.fastutil.ints.IntList; import nu.marginalia.index.forward.spans.ForwardIndexSpansReader;
import nu.marginalia.index.forward.spans.ForwardIndexSpansWriter;
import nu.marginalia.sequence.GammaCodedSequence; import nu.marginalia.sequence.GammaCodedSequence;
import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
@ -11,7 +12,7 @@ import java.nio.ByteBuffer;
import java.nio.file.Files; import java.nio.file.Files;
import java.nio.file.Path; import java.nio.file.Path;
import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.*;
class ForwardIndexSpansReaderTest { class ForwardIndexSpansReaderTest {
Path testFile = Files.createTempFile("test", ".idx"); Path testFile = Files.createTempFile("test", ".idx");
@ -32,12 +33,12 @@ class ForwardIndexSpansReaderTest {
long offset2; long offset2;
try (var writer = new ForwardIndexSpansWriter(testFile)) { try (var writer = new ForwardIndexSpansWriter(testFile)) {
writer.beginRecord(1); writer.beginRecord(1);
writer.writeSpan((byte) 'a', GammaCodedSequence.generate(wa, 1, 3, 5).buffer()); writer.writeSpan((byte) 'h', GammaCodedSequence.generate(wa, 1, 3, 5, 8).buffer());
offset1 = writer.endRecord(); offset1 = writer.endRecord();
writer.beginRecord(2); writer.beginRecord(2);
writer.writeSpan((byte) 'b', GammaCodedSequence.generate(wa, 2, 4, 6).buffer()); writer.writeSpan((byte) 'c', GammaCodedSequence.generate(wa, 2, 4, 6, 7).buffer());
writer.writeSpan((byte) 'c', GammaCodedSequence.generate(wa, 3, 5, 7).buffer()); writer.writeSpan((byte) 'p', GammaCodedSequence.generate(wa, 3, 5).buffer());
offset2 = writer.endRecord(); offset2 = writer.endRecord();
} }
@ -47,17 +48,21 @@ class ForwardIndexSpansReaderTest {
var spans1 = reader.readSpans(arena, offset1); var spans1 = reader.readSpans(arena, offset1);
var spans2 = reader.readSpans(arena, offset2); var spans2 = reader.readSpans(arena, offset2);
assertEquals(1, spans1.size()); assertEquals(2, spans1.heading.size());
assertEquals('a', spans1.get(0).code()); assertEquals(2, spans2.code.size());
assertEquals(IntList.of(1, 3, 5), spans1.get(0).data());
assertEquals(2, spans2.size()); assertFalse(spans2.code.containsPosition(1));
assertTrue(spans2.code.containsPosition(3));
assertFalse(spans2.code.containsPosition(5));
assertTrue(spans2.code.containsPosition(6));
assertFalse(spans2.code.containsPosition(7));
assertFalse(spans2.code.containsPosition(8));
assertEquals('b', spans2.get(0).code()); assertEquals(1, spans2.pre.size());
assertEquals(IntList.of(2, 4, 6), spans2.get(0).data());
assertEquals('c', spans2.get(1).code()); assertEquals(0, spans2.pageFooter.size());
assertEquals(IntList.of(3, 5, 7), spans2.get(1).data()); assertFalse(spans2.pageFooter.containsPosition(8));
} }
} }
} }

View File

@ -317,6 +317,9 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
try { try {
executeSearch(); executeSearch();
} }
catch (Exception ex) {
logger.error("Error in index lookup", ex);
}
finally { finally {
synchronized (remainingIndexTasks) { synchronized (remainingIndexTasks) {
if (remainingIndexTasks.decrementAndGet() == 0) { if (remainingIndexTasks.decrementAndGet() == 0) {

View File

@ -8,6 +8,7 @@ import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggre
import nu.marginalia.index.FullReverseIndexReader; import nu.marginalia.index.FullReverseIndexReader;
import nu.marginalia.index.PrioReverseIndexReader; import nu.marginalia.index.PrioReverseIndexReader;
import nu.marginalia.index.forward.ForwardIndexReader; import nu.marginalia.index.forward.ForwardIndexReader;
import nu.marginalia.index.forward.spans.DocumentSpans;
import nu.marginalia.index.model.QueryParams; import nu.marginalia.index.model.QueryParams;
import nu.marginalia.index.model.SearchTerms; import nu.marginalia.index.model.SearchTerms;
import nu.marginalia.index.query.IndexQuery; import nu.marginalia.index.query.IndexQuery;
@ -186,11 +187,17 @@ public class CombinedIndexReader {
/** Retrieves the HTML features for the specified document */ /** Retrieves the HTML features for the specified document */
public int getHtmlFeatures(long docId) { public int getHtmlFeatures(long docId) {
return forwardIndexReader.getHtmlFeatures(docId); return forwardIndexReader.getHtmlFeatures(docId);
} /** Retrieves the HTML features for the specified document */ }
/** Retrieves the document size for the specified document */
public int getDocumentSize(long docId) { public int getDocumentSize(long docId) {
return forwardIndexReader.getDocumentSize(docId); return forwardIndexReader.getDocumentSize(docId);
} }
/**
 * Fetches the document spans of the given document from the forward index.
 *
 * @param arena arena passed through to the forward index reader
 * @param docId id of the document whose spans are requested
 */
public DocumentSpans getDocumentSpans(Arena arena, long docId) {
    final DocumentSpans spans = forwardIndexReader.getDocumentSpans(arena, docId);
    return spans;
}
/** Close the indexes (this is not done immediately) /** Close the indexes (this is not done immediately)
* */ * */

View File

@ -98,7 +98,7 @@ public class IndexResultRankingService {
} }
// Calculate the preliminary score // Calculate the preliminary score
var score = resultRanker.calculateScore(resultIds.at(i), searchTerms, flags, positions); var score = resultRanker.calculateScore(arena, resultIds.at(i), searchTerms, flags, positions);
if (score != null) { if (score != null) {
results.add(score); results.add(score);
} }

View File

@ -3,15 +3,18 @@ package nu.marginalia.index.results;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt; import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CqDoubleSumOperator;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.api.searchquery.model.results.SearchResultItem; import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.index.forward.spans.DocumentSpans;
import nu.marginalia.index.index.CombinedIndexReader; import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.QueryParams; import nu.marginalia.index.model.QueryParams;
import nu.marginalia.index.model.SearchParameters; import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.results.model.QuerySearchTerms; import nu.marginalia.index.results.model.QuerySearchTerms;
import nu.marginalia.index.results.model.TermCoherenceGroupList;
import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.id.UrlIdCodec;
@ -22,6 +25,7 @@ import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.SequenceOperations; import nu.marginalia.sequence.SequenceOperations;
import javax.annotation.Nullable; import javax.annotation.Nullable;
import java.lang.foreign.Arena;
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.booleanAggregate; import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.booleanAggregate;
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.intMaxMinAggregate; import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.intMaxMinAggregate;
@ -50,7 +54,8 @@ public class IndexResultScoreCalculator {
private final long flagsFilterMask = WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit(); private final long flagsFilterMask = WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit();
@Nullable @Nullable
public SearchResultItem calculateScore(long combinedId, public SearchResultItem calculateScore(Arena arena,
long combinedId,
QuerySearchTerms searchTerms, QuerySearchTerms searchTerms,
long[] wordFlags, long[] wordFlags,
CodedSequence[] positions) CodedSequence[] positions)
@ -78,8 +83,7 @@ public class IndexResultScoreCalculator {
long docMetadata = index.getDocumentMetadata(docId); long docMetadata = index.getDocumentMetadata(docId);
int htmlFeatures = index.getHtmlFeatures(docId); int htmlFeatures = index.getHtmlFeatures(docId);
int docSize = index.getDocumentSize(docId); int docSize = index.getDocumentSize(docId);
DocumentSpans spans = index.getDocumentSpans(arena, docId);
int bestCoherence = searchTerms.coherences.testOptional(positions);
double score = calculateSearchResultValue( double score = calculateSearchResultValue(
wordFlagsQuery, wordFlagsQuery,
@ -88,7 +92,9 @@ public class IndexResultScoreCalculator {
docMetadata, docMetadata,
htmlFeatures, htmlFeatures,
docSize, docSize,
bestCoherence, spans,
positions,
searchTerms.coherences,
rankingContext); rankingContext);
SearchResultItem searchResult = new SearchResultItem(docId, SearchResultItem searchResult = new SearchResultItem(docId,
@ -169,10 +175,13 @@ public class IndexResultScoreCalculator {
public double calculateSearchResultValue(CompiledQueryLong wordFlagsQuery, public double calculateSearchResultValue(CompiledQueryLong wordFlagsQuery,
CompiledQueryInt positionsCountQuery, CompiledQueryInt positionsCountQuery,
CompiledQuery<CodedSequence> positionsQuery, long documentMetadata, CompiledQuery<CodedSequence> positionsQuery,
long documentMetadata,
int features, int features,
int length, int length,
int bestCoherence, DocumentSpans spans,
CodedSequence[] positions,
TermCoherenceGroupList coherences,
ResultRankingContext ctx) ResultRankingContext ctx)
{ {
if (length < 0) { if (length < 0) {
@ -205,6 +214,33 @@ public class IndexResultScoreCalculator {
temporalBias = 0; temporalBias = 0;
} }
int numCoherenceAll = coherences.countOptional(positions);
int bestCoherenceAll = coherences.testOptional(positions);
int bestCoherenceTitle = coherences.testOptional(positions, spans.title);
int bestCoherenceHeading = coherences.testOptional(positions, spans.heading);
double spanWeightedScore = positionsQuery.root.visit(new CqDoubleSumOperator(positionsQuery, termPos -> {
if (termPos == null)
return 0;
if (spans.title.overlapsRange(termPos))
return 5.0;
if (spans.heading.overlapsRange(termPos))
return 2.5;
if (spans.code.overlapsRange(termPos))
return 0.25;
if (spans.pre.overlapsRange(termPos))
return 0.25;
if (spans.nav.overlapsRange(termPos))
return 0.25;
if (spans.pageHeader.overlapsRange(termPos))
return 0.25;
if (spans.pageFooter.overlapsRange(termPos))
return 0.25;
return 1.0;
}));
double overallPart = averageSentenceLengthPenalty double overallPart = averageSentenceLengthPenalty
+ documentLengthPenalty + documentLengthPenalty
+ qualityPenalty + qualityPenalty
@ -212,7 +248,11 @@ public class IndexResultScoreCalculator {
+ topologyBonus + topologyBonus
+ temporalBias + temporalBias
+ flagsPenalty + flagsPenalty
+ bestCoherence; + bestCoherenceAll
+ bestCoherenceTitle
+ bestCoherenceHeading
+ numCoherenceAll / 4.
+ spanWeightedScore;
double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / calculateAvgMinDistance(positionsQuery, ctx)); double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / calculateAvgMinDistance(positionsQuery, ctx));
double tcfFirstPosition = 0.; double tcfFirstPosition = 0.;

View File

@ -2,6 +2,7 @@ package nu.marginalia.index.results.model;
import it.unimi.dsi.fastutil.ints.IntIterator; import it.unimi.dsi.fastutil.ints.IntIterator;
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint; import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
import nu.marginalia.index.forward.spans.DocumentSpan;
import nu.marginalia.index.model.SearchTermsUtil; import nu.marginalia.index.model.SearchTermsUtil;
import nu.marginalia.index.results.model.ids.TermIdList; import nu.marginalia.index.results.model.ids.TermIdList;
import nu.marginalia.sequence.CodedSequence; import nu.marginalia.sequence.CodedSequence;
@ -40,7 +41,7 @@ public class TermCoherenceGroupList {
public int testOptional(CodedSequence[] positions) { public int testOptional(CodedSequence[] positions) {
int best = 0; int best = 0;
for (var coherenceSet : mandatoryGroups) { for (var coherenceSet : optionalGroups) {
if (coherenceSet.test(positions)) { if (coherenceSet.test(positions)) {
best = Math.max(coherenceSet.size, best); best = Math.max(coherenceSet.size, best);
} }
@ -48,6 +49,25 @@ public class TermCoherenceGroupList {
return best; return best;
} }
/**
 * Counts how many of the optional coherence groups pass their
 * test against the provided term positions.
 */
public int countOptional(CodedSequence[] positions) {
    int matching = 0;

    for (var group : optionalGroups) {
        matching += group.test(positions) ? 1 : 0;
    }

    return matching;
}
/**
 * Returns the size of the largest optional coherence group that passes its
 * span-restricted test for the provided positions, or 0 if none does.
 */
public int testOptional(CodedSequence[] positions, DocumentSpan span) {
    int largest = 0;

    for (var group : optionalGroups) {
        // The group is always tested first; its size only matters if it matched
        if (group.test(span, positions) && group.size > largest) {
            largest = group.size;
        }
    }

    return largest;
}
public static final class TermCoherenceGroup { public static final class TermCoherenceGroup {
private final int[] offsets; private final int[] offsets;
@ -92,5 +112,37 @@ public class TermCoherenceGroupList {
return SequenceOperations.intersectSequences(sequences); return SequenceOperations.intersectSequences(sequences);
} }
/**
 * Tests this group against the given positions, restricted to a span.
 * <p>
 * Returns false as soon as a present term has a negative offset. Otherwise the
 * position iterators of the present terms are intersected, and the result is
 * true if the span reports containing a range that starts at one of the
 * intersections and has the number of present terms as its length.
 */
public boolean test(DocumentSpan span, CodedSequence[] positions) {
    final IntIterator[] shifted = new IntIterator[present.cardinality()];

    int filled = 0;
    for (int termIdx = 0; termIdx < offsets.length; termIdx++) {
        if (present.get(termIdx)) {
            final int positionsIdx = offsets[termIdx];
            if (positionsIdx < 0) {
                return false;
            }

            // Shift each term's positions back by the term's index within the group
            // (the shift is negative), so that terms occurring in the right order
            // line up on the same value when the iterators are intersected.
            shifted[filled] = positions[positionsIdx].offsetIterator(-termIdx);
            filled++;
        }
    }

    final var matches = SequenceOperations.findIntersections(shifted);

    for (var matchIter = matches.iterator(); matchIter.hasNext(); ) {
        if (span.containsRange(matchIter.nextInt(), shifted.length)) {
            return true;
        }
    }

    return false;
}
} }
} }

View File

@ -9,8 +9,8 @@ import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.full.FullIndexConstructor; import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings; import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames; import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
import nu.marginalia.index.index.CombinedIndexReader; import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.IndexJournal; import nu.marginalia.index.journal.IndexJournal;

View File

@ -11,8 +11,8 @@ import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.full.FullIndexConstructor; import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.construction.prio.PrioIndexConstructor; import nu.marginalia.index.construction.prio.PrioIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings; import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames; import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.IndexJournal; import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.journal.IndexJournalSlopWriter; import nu.marginalia.index.journal.IndexJournalSlopWriter;

View File

@ -13,8 +13,8 @@ import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.full.FullIndexConstructor; import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.construction.prio.PrioIndexConstructor; import nu.marginalia.index.construction.prio.PrioIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings; import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames; import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.IndexJournal; import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.journal.IndexJournalSlopWriter; import nu.marginalia.index.journal.IndexJournalSlopWriter;

View File

@ -1,6 +1,8 @@
package nu.marginalia.sequence; package nu.marginalia.sequence;
import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntIterator; import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntList;
public class SequenceOperations { public class SequenceOperations {
@ -30,7 +32,7 @@ public class SequenceOperations {
if (values[i] == max) { if (values[i] == max) {
successes++; successes++;
} else { } else {
successes = 0; successes = 1;
// Discard values until we reach the maximum value seen so far, // Discard values until we reach the maximum value seen so far,
// or until the end of the sequence is reached // or until the end of the sequence is reached
@ -49,6 +51,63 @@ public class SequenceOperations {
return true; return true;
} }
/** Find the values that are present in every one of the provided sequences.
 * <p>
 * The iterators are consumed as a side effect.  The algorithm only ever moves
 * forward through each sequence, discarding values smaller than the largest
 * value seen so far, so each sequence is expected to yield its values in
 * strictly ascending order.
 *
 * @param sequences the sequences to intersect
 * @return the common values in the order they were found; an empty list if
 *         fewer than two sequences are provided, or if any sequence is empty
 */
public static IntList findIntersections(IntIterator... sequences) {

    if (sequences.length <= 1)
        return IntList.of();

    // Initialize values with the first element of each sequence;
    // if any sequence is empty there can be no intersection at all
    int[] values = new int[sequences.length];

    for (int i = 0; i < sequences.length; i++) {
        if (sequences[i].hasNext())
            values[i] = sequences[i].nextInt();
        else
            return IntList.of();
    }

    // Intersect the sequences by advancing all values smaller than the maximum seen so far
    // until they are equal to the maximum value, or until the end of the sequence is reached
    int max = Integer.MIN_VALUE;
    int successes = 0;

    IntList ret = new IntArrayList();

    outer:
    for (int i = 0;; i = (i + 1) % sequences.length)
    {
        if (successes == sequences.length) {
            // Every sequence is currently positioned on max, so it is a common value
            ret.add(max);
            successes = 1;

            if (sequences[i].hasNext()) {
                // Record the newly read element in values[i] as well as in max.
                // If values[i] were left at the old maximum, the next visit to this
                // sequence would see a stale value and discard the element that was
                // just consumed, losing any intersection at that element.
                values[i] = sequences[i].nextInt();
                max = values[i];
            } else {
                // This sequence is exhausted, no further common values are possible
                break;
            }
        } else if (values[i] == max) {
            successes++;
        } else {
            successes = 1;

            // Discard values until we reach the maximum value seen so far,
            // or until the end of the sequence is reached
            while (values[i] < max) {
                if (sequences[i].hasNext()) {
                    values[i] = sequences[i].nextInt();
                } else {
                    break outer;
                }
            }

            // Update the maximum value, if necessary
            max = Math.max(max, values[i]);
        }
    }

    return ret;
}
/** Return the minimum word distance between two sequences, or a negative value if either sequence is empty. /** Return the minimum word distance between two sequences, or a negative value if either sequence is empty.
* */ * */
public static int minDistance(IntIterator seqA, IntIterator seqB) public static int minDistance(IntIterator seqA, IntIterator seqB)

View File

@ -162,7 +162,15 @@ public class BitReader {
} }
else { // There's no more data to read! else { // There's no more data to read!
refillCallback.run(); refillCallback.run();
readNext(); if (underlying.hasRemaining()) {
readNext();
}
else {
// We've attempted to re-fill the buffer, but there's still no data to read, so we fail to avoid
// blowing up the stack with recursion
throw new IllegalStateException("No more data to read after attempted re-fill of underlying buffer");
}
} }
} }
} }

View File

@ -1,6 +1,6 @@
package nu.marginalia.sequence; package nu.marginalia.sequence;
import it.unimi.dsi.fastutil.ints.IntIterator; import it.unimi.dsi.fastutil.ints.IntList;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import java.nio.ByteBuffer; import java.nio.ByteBuffer;
@ -63,6 +63,17 @@ class SequenceOperationsTest {
assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator(), seq3.iterator())); assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator(), seq3.iterator()));
} }
/** Three sequences of differing lengths that all contain 8 and 10, and have no
 *  other value in common; findIntersections is expected to report exactly those two. */
@Test
void intersectSequencesDeepMatch3findIntersections() {
    ByteBuffer workArea = ByteBuffer.allocate(1024);

    GammaCodedSequence first = GammaCodedSequence.generate(workArea, 1, 3, 4, 7, 8, 9, 10, 11);
    GammaCodedSequence second = GammaCodedSequence.generate(workArea, 2, 5, 8, 10, 14);
    GammaCodedSequence third = GammaCodedSequence.generate(workArea, 1, 5, 8, 9, 10);

    IntList expected = IntList.of(8, 10);
    IntList actual = SequenceOperations.findIntersections(
            first.iterator(),
            second.iterator(),
            third.iterator());

    assertEquals(expected, actual);
}
@Test @Test
void intersectSequencesDeepMismatch() { void intersectSequencesDeepMismatch() {
ByteBuffer wa = ByteBuffer.allocate(1024); ByteBuffer wa = ByteBuffer.allocate(1024);

View File

@ -54,8 +54,8 @@ public class HtmlStringTagger implements NodeVisitor {
case "code" -> pushTag(HtmlTag.CODE, el); case "code" -> pushTag(HtmlTag.CODE, el);
case "title" -> pushTag(HtmlTag.TITLE, el); case "title" -> pushTag(HtmlTag.TITLE, el);
case "nav" -> pushTag(HtmlTag.NAV, el); case "nav" -> pushTag(HtmlTag.NAV, el);
case "header" -> pushTag(HtmlTag.HEADER, el); case "header" -> pushTag(HtmlTag.PAGE_HEADER, el);
case "footer" -> pushTag(HtmlTag.FOOTER, el); case "footer" -> pushTag(HtmlTag.PAGE_FOOTER, el);
case "h1", "h2", "h3", "h4", "h5", "h6" -> pushTag(HtmlTag.HEADING, el); case "h1", "h2", "h3", "h4", "h5", "h6" -> pushTag(HtmlTag.HEADING, el);
} }
} }

View File

@ -1,21 +1,21 @@
package nu.marginalia.language.sentence.tag; package nu.marginalia.language.sentence.tag;
public enum HtmlTag { public enum HtmlTag {
SCRIPT('s', true, false), SCRIPT((byte) 's', true, false),
STYLE('S', true, false), STYLE((byte) 'S', true, false),
CODE('c', false, true), CODE((byte) 'c', false, true),
PRE('p', false, true), PRE((byte) 'p', false, true),
TITLE('t', false, false), TITLE((byte) 't', false, false),
HEADING('h', false, false), HEADING((byte) 'h', false, false),
NAV('n', false, false), NAV((byte) 'n', false, false),
HEADER('H',false, false), PAGE_HEADER((byte) 'H',false, false),
FOOTER('f', false, false); PAGE_FOOTER((byte) 'f', false, false);
public char code; public byte code;
public boolean exclude; public boolean exclude;
public boolean nonLanguage; public boolean nonLanguage;
HtmlTag(char code, boolean exclude, boolean nonLanguage) { HtmlTag(byte code, boolean exclude, boolean nonLanguage) {
this.code = code; this.code = code;
this.exclude = exclude; this.exclude = exclude;
this.nonLanguage = nonLanguage; this.nonLanguage = nonLanguage;

View File

@ -144,7 +144,7 @@ public class DocumentKeywordsBuilder {
public void addSpans(List<DocumentWordSpan> newSpans) { public void addSpans(List<DocumentWordSpan> newSpans) {
for (var span : newSpans) { for (var span : newSpans) {
wordSpans.computeIfAbsent(span.tag().code, k -> new ArrayList<>()).add(span); wordSpans.computeIfAbsent((char) span.tag().code, k -> new ArrayList<>()).add(span);
} }
} }

View File

@ -9,8 +9,8 @@ import nu.marginalia.ProcessConfigurationModule;
import nu.marginalia.index.construction.full.FullIndexConstructor; import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.construction.prio.PrioIndexConstructor; import nu.marginalia.index.construction.prio.PrioIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings; import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames; import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
import nu.marginalia.index.journal.IndexJournal; import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.id.UrlIdCodec;

View File

@ -18,8 +18,8 @@ import nu.marginalia.index.ReverseIndexPrioFileNames;
import nu.marginalia.index.construction.full.FullIndexConstructor; import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.construction.prio.PrioIndexConstructor; import nu.marginalia.index.construction.prio.PrioIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings; import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames; import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.IndexJournal; import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.model.SearchParameters; import nu.marginalia.index.model.SearchParameters;