mirror of https://github.com/MarginaliaSearch/MarginaliaSearch.git
synced 2025-02-23 13:09:00 +00:00

(index) Experimental initial integration of document spans into index

This commit is contained in:
parent 80900107f7
commit b316b55be9
@@ -59,13 +59,4 @@ public class CompiledQueryAggregates {
        return new ArrayList<>(query.root().visit(new CqQueryPathsOperator(query)));
    }

    /** Using the bitwise AND operator, aggregate all possible combined values of the long generated by the provided operator */
    public static <T> LongSet positionsAggregate(CompiledQuery<T> query, ToLongFunction<T> operator) {
        return query.root().visit(new CqPositionsOperator(query, operator));
    }

    /** Using the bitwise AND operator, aggregate all possible combined values of the long generated by the provided operator */
    public static <T> LongSet positionsAggregate(CompiledQueryLong query, LongUnaryOperator operator) {
        return query.root().visit(new CqPositionsOperator(query, operator));
    }
}
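Editorial note (not part of the diff): the javadoc above describes an AND-aggregation in which every
possible pairing of values is combined with the bitwise AND operator, which is what the CqPositionsOperator
deleted in the next hunk implemented for AND nodes of a query tree. A minimal standalone sketch of that
idea, assuming only that fastutil is on the classpath; class and method names here are illustrative:

import it.unimi.dsi.fastutil.longs.LongArraySet;
import it.unimi.dsi.fastutil.longs.LongSet;

class AndAggregateSketch {
    /** Combines two sets by taking the bitwise AND of every pair of values. */
    static LongSet combine(LongSet a, LongSet b) {
        if (a.isEmpty()) return b;
        if (b.isEmpty()) return a;

        LongSet ret = new LongArraySet(a.size() * b.size());
        for (long av : a) {
            for (long bv : b) {
                ret.add(av & bv);
            }
        }
        return ret;
    }

    public static void main(String[] args) {
        LongSet a = new LongArraySet(new long[] {0b1100L, 0b1010L});
        LongSet b = new LongArraySet(new long[] {0b0110L});
        System.out.println(combine(a, b)); // {4, 2}, i.e. 0b1100 & 0b0110 and 0b1010 & 0b0110
    }
}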
@@ -1,85 +0,0 @@
package nu.marginalia.api.searchquery.model.compiled.aggregate;

import it.unimi.dsi.fastutil.longs.LongArraySet;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongSet;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.CqExpression;

import java.util.List;
import java.util.function.IntToLongFunction;
import java.util.function.LongUnaryOperator;
import java.util.function.ToLongFunction;

public class CqPositionsOperator implements CqExpression.ObjectVisitor<LongSet> {
    private final IntToLongFunction operator;

    public <T> CqPositionsOperator(CompiledQuery<T> query, ToLongFunction<T> operator) {
        this.operator = idx -> operator.applyAsLong(query.at(idx));
    }

    public CqPositionsOperator(CompiledQueryLong query, LongUnaryOperator operator) {
        this.operator = idx -> operator.applyAsLong(query.at(idx));
    }

    @Override
    public LongSet onAnd(List<? extends CqExpression> parts) {
        LongSet ret = new LongArraySet();

        for (var part : parts) {
            ret = comineSets(ret, part.visit(this));
        }

        return ret;
    }

    private LongSet comineSets(LongSet a, LongSet b) {
        if (a.isEmpty())
            return b;
        if (b.isEmpty())
            return a;

        LongSet ret = newSet(a.size() * b.size());

        var ai = a.longIterator();

        while (ai.hasNext()) {
            long aval = ai.nextLong();

            var bi = b.longIterator();
            while (bi.hasNext()) {
                ret.add(aval & bi.nextLong());
            }
        }

        return ret;
    }

    @Override
    public LongSet onOr(List<? extends CqExpression> parts) {
        LongSet ret = newSet(parts.size());

        for (var part : parts) {
            ret.addAll(part.visit(this));
        }

        return ret;
    }

    @Override
    public LongSet onLeaf(int idx) {
        var set = newSet(1);
        set.add(operator.applyAsLong(idx));
        return set;
    }

    /** Allocate a new set suitable for a collection with the provided cardinality */
    private LongSet newSet(int cardinality) {
        if (cardinality < 8)
            return new LongArraySet(cardinality);
        else
            return new LongOpenHashSet(cardinality);
    }

}

@@ -17,6 +17,7 @@ dependencies {
    implementation project(':code:libraries:btree')
    implementation project(':code:libraries:slop')
    implementation project(':code:libraries:coded-sequence')
    implementation project(':code:libraries:language-processing')
    implementation project(':code:index:query')
    implementation project(':code:index:index-journal')
    implementation project(':code:common:model')
@@ -1,6 +1,6 @@
package nu.marginalia.index.forward;

class ForwardIndexParameters {
public class ForwardIndexParameters {
    public static final int ENTRY_SIZE = 3;
    public static final int METADATA_OFFSET = 0;
    public static final int FEATURES_OFFSET = 1;

@@ -3,11 +3,14 @@ package nu.marginalia.index.forward;
import gnu.trove.map.hash.TLongIntHashMap;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.forward.spans.DocumentSpans;
import nu.marginalia.index.forward.spans.ForwardIndexSpansReader;
import nu.marginalia.model.id.UrlIdCodec;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.lang.foreign.Arena;
import java.nio.file.Files;
import java.nio.file.Path;

@@ -30,6 +33,7 @@ public class ForwardIndexReader {
    private final LongArray data;

    private final ForwardIndexSpansReader spansReader;

    private final Logger logger = LoggerFactory.getLogger(getClass());

    public ForwardIndexReader(Path idsFile,
@@ -121,6 +125,21 @@ public class ForwardIndexReader {
        return idToOffset.get(docId);
    }

    public DocumentSpans getDocumentSpans(Arena arena, long docId) {
        long offset = idxForDoc(docId);
        if (offset < 0) return new DocumentSpans();

        long encodedOffset = data.get(ENTRY_SIZE * offset + SPANS_OFFSET);

        try {
            return spansReader.readSpans(arena, encodedOffset);
        }
        catch (IOException ex) {
            logger.error("Failed to read spans for doc " + docId, ex);
            return new DocumentSpans();
        }
    }


    public int totalDocCount() {
        return idToOffset.size();

@@ -1,9 +1,11 @@
package nu.marginalia.index.forward;
package nu.marginalia.index.forward.construction;

import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import nu.marginalia.array.LongArray;
import nu.marginalia.array.LongArrayFactory;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexParameters;
import nu.marginalia.index.forward.spans.ForwardIndexSpansWriter;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.model.id.UrlIdCodec;
import nu.marginalia.model.idx.DocumentMetadata;
@@ -0,0 +1,77 @@
package nu.marginalia.index.forward.spans;

import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.SequenceOperations;

public class DocumentSpan {

    /** A list of the interlaced start and end positions of each span in the document of this type */
    private final CodedSequence startsEnds;

    public DocumentSpan(CodedSequence startsEnds) {
        this.startsEnds = startsEnds;
    }

    public DocumentSpan() {
        this.startsEnds = null;
    }

    public boolean containsPosition(int position) {
        if (startsEnds == null) {
            return false;
        }

        var iter = startsEnds.iterator();
        while (iter.hasNext()) {
            int start = iter.nextInt();
            if (start > position) {
                return false;
            }
            int end = iter.nextInt();
            if (end > position) {
                return true;
            }
        }

        return false;
    }

    public boolean containsRange(int rangeStart, int len) {
        if (startsEnds == null) {
            return false;
        }

        var iter = startsEnds.iterator();
        while (iter.hasNext()) {
            int start = iter.nextInt();
            if (start > rangeStart) {
                return false;
            }
            int end = iter.nextInt();
            if (end > rangeStart + len) {
                return true;
            }
        }

        return false;
    }

    public boolean overlapsRange(CodedSequence sequence) {
        return SequenceOperations.intersectSequences(iterator(), sequence.iterator());
    }

    /** Returns an iterator over the start and end positions of each span in the document of this type */
    public IntIterator iterator() {
        if (null == startsEnds) {
            return IntList.of().iterator();
        }

        return startsEnds.iterator();
    }

    public int size() {
        return startsEnds.valueCount() / 2;
    }
}
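Editorial note (not part of the diff): DocumentSpan stores each span type as interlaced start/end
positions, so a sequence such as (2, 5, 9, 12) describes two spans, the first starting at 2 and ending
at 5, the second from 9 to 12. A small self-contained sketch of the containsPosition walk over that
representation, assuming fastutil; the literal positions are made up for illustration:

import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntList;

class InterlacedSpanSketch {
    /** Walks (start, end) pairs in order and reports whether one of them brackets the position. */
    static boolean containsPosition(IntList startsEnds, int position) {
        IntIterator iter = startsEnds.iterator();
        while (iter.hasNext()) {
            int start = iter.nextInt();
            if (start > position) return false; // assuming pairs are stored in ascending order
            int end = iter.nextInt();
            if (end > position) return true;    // position falls between start and end
        }
        return false;
    }

    public static void main(String[] args) {
        IntList spans = IntList.of(2, 5, 9, 12);
        System.out.println(containsPosition(spans, 3));  // true  (inside the first span)
        System.out.println(containsPosition(spans, 7));  // false (between the spans)
        System.out.println(containsPosition(spans, 10)); // true  (inside the second span)
    }
}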
@@ -0,0 +1,35 @@
package nu.marginalia.index.forward.spans;

import nu.marginalia.language.sentence.tag.HtmlTag;
import nu.marginalia.sequence.CodedSequence;

public class DocumentSpans {
    private static DocumentSpan EMPTY_SPAN = new DocumentSpan();

    public DocumentSpan title = EMPTY_SPAN;
    public DocumentSpan heading = EMPTY_SPAN;

    public DocumentSpan nav = EMPTY_SPAN;
    public DocumentSpan pageHeader = EMPTY_SPAN;
    public DocumentSpan pageFooter = EMPTY_SPAN;
    public DocumentSpan code = EMPTY_SPAN;
    public DocumentSpan pre = EMPTY_SPAN;

    void accept(byte code, CodedSequence positions) {
        if (code == HtmlTag.HEADING.code)
            this.heading = new DocumentSpan(positions);
        else if (code == HtmlTag.TITLE.code)
            this.title = new DocumentSpan(positions);
        else if (code == HtmlTag.NAV.code)
            this.nav = new DocumentSpan(positions);
        else if (code == HtmlTag.PAGE_HEADER.code)
            this.pageHeader = new DocumentSpan(positions);
        else if (code == HtmlTag.PAGE_FOOTER.code)
            this.pageFooter = new DocumentSpan(positions);
        else if (code == HtmlTag.CODE.code)
            this.code = new DocumentSpan(positions);
        else if (code == HtmlTag.PRE.code)
            this.pre = new DocumentSpan(positions);
    }

}

@@ -1,6 +1,5 @@
package nu.marginalia.index.forward;
package nu.marginalia.index.forward.spans;

import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.sequence.GammaCodedSequence;

import java.io.IOException;
@@ -9,8 +8,6 @@ import java.nio.channels.FileChannel;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.util.ArrayList;
import java.util.List;

@SuppressWarnings("preview")
public class ForwardIndexSpansReader implements AutoCloseable {
@@ -20,9 +17,9 @@ public class ForwardIndexSpansReader implements AutoCloseable {
        this.spansFileChannel = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ);
    }

    public List<SpanData> readSpans(Arena arena, long encodedOffset) throws IOException {
        long size = encodedOffset & 0xFFF_FFFF;
        long offset = encodedOffset >>> 28;
    public DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException {
        long size = SpansCodec.decodeSize(encodedOffset);
        long offset = SpansCodec.decodeStartOffset(encodedOffset);

        var buffer = arena.allocate(size).asByteBuffer();
        buffer.clear();
@@ -33,22 +30,16 @@ public class ForwardIndexSpansReader implements AutoCloseable {

        int count = buffer.get();

        List<SpanData> ret = new ArrayList<>();
        DocumentSpans ret = new DocumentSpans();

        while (count-- > 0) {
            byte code = buffer.get();
            short len = buffer.getShort();

            final int pos = buffer.position();

            // Decode the gamma-coded sequence; this will advance the buffer position
            // in a not entirely predictable way, so we need to save the position
            buffer.limit(buffer.position() + len);
            var sequence = new GammaCodedSequence(buffer).values();
            ret.add(new SpanData(code, sequence));
            ret.accept(code, new GammaCodedSequence(buffer.slice(buffer.position(), len)));

            // Reset the buffer position to the end of the span
            buffer.position(pos + len);
            buffer.limit(buffer.capacity());
            buffer.position(buffer.position() + len);
        }

        return ret;
@@ -59,5 +50,4 @@ public class ForwardIndexSpansReader implements AutoCloseable {
        spansFileChannel.close();
    }

    public record SpanData(byte code, IntList data) {}
}
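Editorial note (not part of the diff): judging by the reads in readSpans above, each record in the spans
file appears to be laid out roughly as follows; this is inferred from the reader, not a format
specification:

//   byte      count                // number of spans in the record
//   repeated count times:
//     byte    code                 // HtmlTag code identifying the span type
//     short   length               // payload length in bytes
//     byte[]  payload (length)     // gamma-coded sequence of interlaced start/end positions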
@@ -1,4 +1,4 @@
package nu.marginalia.index.forward;
package nu.marginalia.index.forward.spans;

import java.io.IOException;
import java.nio.ByteBuffer;
@@ -42,8 +42,7 @@ public class ForwardIndexSpansWriter implements AutoCloseable {
    }

    public long endRecord() {
        return stateStartOffset << 28 | stateLength;

        return SpansCodec.encode(stateStartOffset, stateLength);
    }

    @Override
@@ -0,0 +1,17 @@
package nu.marginalia.index.forward.spans;

public class SpansCodec {
    public static long encode(long startOffset, long size) {
        assert size < 0x1000_0000L : "Size must be less than 2^28";

        return startOffset << 28 | (size & 0xFFF_FFFFL);
    }

    public static long decodeStartOffset(long encoded) {
        return encoded >>> 28;
    }

    public static long decodeSize(long encoded) {
        return encoded & 0x0FFF_FFFFL;
    }
}
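Editorial note (not part of the diff): SpansCodec packs a record's start offset and byte size into a
single long, keeping the size in the low 28 bits and shifting the offset above it. A minimal round-trip
sketch with made-up numbers:

public class SpansCodecSketch {
    public static void main(String[] args) {
        long startOffset = 1_234_567L;
        long size = 4096L;                          // must stay below 2^28

        long encoded = startOffset << 28 | (size & 0x0FFF_FFFFL);

        System.out.println(encoded >>> 28);         // 1234567, as in decodeStartOffset
        System.out.println(encoded & 0x0FFF_FFFFL); // 4096, as in decodeSize
    }
}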
@@ -2,6 +2,7 @@ package nu.marginalia.index.forward;

import lombok.SneakyThrows;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.journal.IndexJournalSlopWriter;
import nu.marginalia.model.id.UrlIdCodec;
@@ -1,6 +1,7 @@
package nu.marginalia.index.forward;

import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.index.forward.spans.ForwardIndexSpansReader;
import nu.marginalia.index.forward.spans.ForwardIndexSpansWriter;
import nu.marginalia.sequence.GammaCodedSequence;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Test;
@@ -11,7 +12,7 @@ import java.nio.ByteBuffer;
import java.nio.file.Files;
import java.nio.file.Path;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.*;

class ForwardIndexSpansReaderTest {
    Path testFile = Files.createTempFile("test", ".idx");
@@ -32,12 +33,12 @@ class ForwardIndexSpansReaderTest {
        long offset2;
        try (var writer = new ForwardIndexSpansWriter(testFile)) {
            writer.beginRecord(1);
            writer.writeSpan((byte) 'a', GammaCodedSequence.generate(wa, 1, 3, 5).buffer());
            writer.writeSpan((byte) 'h', GammaCodedSequence.generate(wa, 1, 3, 5, 8).buffer());
            offset1 = writer.endRecord();

            writer.beginRecord(2);
            writer.writeSpan((byte) 'b', GammaCodedSequence.generate(wa, 2, 4, 6).buffer());
            writer.writeSpan((byte) 'c', GammaCodedSequence.generate(wa, 3, 5, 7).buffer());
            writer.writeSpan((byte) 'c', GammaCodedSequence.generate(wa, 2, 4, 6, 7).buffer());
            writer.writeSpan((byte) 'p', GammaCodedSequence.generate(wa, 3, 5).buffer());
            offset2 = writer.endRecord();
        }

@@ -47,17 +48,21 @@ class ForwardIndexSpansReaderTest {
            var spans1 = reader.readSpans(arena, offset1);
            var spans2 = reader.readSpans(arena, offset2);

            assertEquals(1, spans1.size());
            assertEquals(2, spans1.heading.size());

            assertEquals('a', spans1.get(0).code());
            assertEquals(IntList.of(1, 3, 5), spans1.get(0).data());
            assertEquals(2, spans2.code.size());

            assertEquals(2, spans2.size());
            assertFalse(spans2.code.containsPosition(1));
            assertTrue(spans2.code.containsPosition(3));
            assertFalse(spans2.code.containsPosition(5));
            assertTrue(spans2.code.containsPosition(6));
            assertFalse(spans2.code.containsPosition(7));
            assertFalse(spans2.code.containsPosition(8));

            assertEquals('b', spans2.get(0).code());
            assertEquals(IntList.of(2, 4, 6), spans2.get(0).data());
            assertEquals('c', spans2.get(1).code());
            assertEquals(IntList.of(3, 5, 7), spans2.get(1).data());
            assertEquals(1, spans2.pre.size());

            assertEquals(0, spans2.pageFooter.size());
            assertFalse(spans2.pageFooter.containsPosition(8));
        }
    }
}
@@ -317,6 +317,9 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase {
                try {
                    executeSearch();
                }
                catch (Exception ex) {
                    logger.error("Error in index lookup", ex);
                }
                finally {
                    synchronized (remainingIndexTasks) {
                        if (remainingIndexTasks.decrementAndGet() == 0) {
@@ -8,6 +8,7 @@ import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggre
import nu.marginalia.index.FullReverseIndexReader;
import nu.marginalia.index.PrioReverseIndexReader;
import nu.marginalia.index.forward.ForwardIndexReader;
import nu.marginalia.index.forward.spans.DocumentSpans;
import nu.marginalia.index.model.QueryParams;
import nu.marginalia.index.model.SearchTerms;
import nu.marginalia.index.query.IndexQuery;
@@ -186,11 +187,17 @@ public class CombinedIndexReader {
    /** Retrieves the HTML features for the specified document */
    public int getHtmlFeatures(long docId) {
        return forwardIndexReader.getHtmlFeatures(docId);
    } /** Retrieves the HTML features for the specified document */
    }

    /** Retrieves the HTML features for the specified document */
    public int getDocumentSize(long docId) {
        return forwardIndexReader.getDocumentSize(docId);
    }

    /** Retrieves the document spans for the specified document */
    public DocumentSpans getDocumentSpans(Arena arena, long docId) {
        return forwardIndexReader.getDocumentSpans(arena, docId);
    }

    /** Close the indexes (this is not done immediately)
     * */
@@ -98,7 +98,7 @@ public class IndexResultRankingService {
            }

            // Calculate the preliminary score
            var score = resultRanker.calculateScore(resultIds.at(i), searchTerms, flags, positions);
            var score = resultRanker.calculateScore(arena, resultIds.at(i), searchTerms, flags, positions);
            if (score != null) {
                results.add(score);
            }
@@ -3,15 +3,18 @@ package nu.marginalia.index.results;
import nu.marginalia.api.searchquery.model.compiled.CompiledQuery;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt;
import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong;
import nu.marginalia.api.searchquery.model.compiled.aggregate.CqDoubleSumOperator;
import nu.marginalia.api.searchquery.model.results.ResultRankingContext;
import nu.marginalia.api.searchquery.model.results.ResultRankingParameters;
import nu.marginalia.api.searchquery.model.results.SearchResultItem;
import nu.marginalia.index.forward.spans.DocumentSpans;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.model.QueryParams;
import nu.marginalia.index.model.SearchParameters;
import nu.marginalia.index.query.limit.QueryStrategy;
import nu.marginalia.index.results.model.QuerySearchTerms;
import nu.marginalia.index.results.model.TermCoherenceGroupList;
import nu.marginalia.model.crawl.HtmlFeature;
import nu.marginalia.model.crawl.PubDate;
import nu.marginalia.model.id.UrlIdCodec;
@@ -22,6 +25,7 @@ import nu.marginalia.sequence.CodedSequence;
import nu.marginalia.sequence.SequenceOperations;

import javax.annotation.Nullable;
import java.lang.foreign.Arena;

import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.booleanAggregate;
import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.intMaxMinAggregate;
@@ -50,7 +54,8 @@ public class IndexResultScoreCalculator {
    private final long flagsFilterMask = WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit();

    @Nullable
    public SearchResultItem calculateScore(long combinedId,
    public SearchResultItem calculateScore(Arena arena,
                                           long combinedId,
                                           QuerySearchTerms searchTerms,
                                           long[] wordFlags,
                                           CodedSequence[] positions)
@@ -78,8 +83,7 @@ public class IndexResultScoreCalculator {
        long docMetadata = index.getDocumentMetadata(docId);
        int htmlFeatures = index.getHtmlFeatures(docId);
        int docSize = index.getDocumentSize(docId);

        int bestCoherence = searchTerms.coherences.testOptional(positions);
        DocumentSpans spans = index.getDocumentSpans(arena, docId);

        double score = calculateSearchResultValue(
                wordFlagsQuery,
@@ -88,7 +92,9 @@ public class IndexResultScoreCalculator {
                docMetadata,
                htmlFeatures,
                docSize,
                bestCoherence,
                spans,
                positions,
                searchTerms.coherences,
                rankingContext);

        SearchResultItem searchResult = new SearchResultItem(docId,
@@ -169,10 +175,13 @@ public class IndexResultScoreCalculator {

    public double calculateSearchResultValue(CompiledQueryLong wordFlagsQuery,
                                             CompiledQueryInt positionsCountQuery,
                                             CompiledQuery<CodedSequence> positionsQuery, long documentMetadata,
                                             CompiledQuery<CodedSequence> positionsQuery,
                                             long documentMetadata,
                                             int features,
                                             int length,
                                             int bestCoherence,
                                             DocumentSpans spans,
                                             CodedSequence[] positions,
                                             TermCoherenceGroupList coherences,
                                             ResultRankingContext ctx)
    {
        if (length < 0) {
@@ -205,6 +214,33 @@ public class IndexResultScoreCalculator {
            temporalBias = 0;
        }


        int numCoherenceAll = coherences.countOptional(positions);
        int bestCoherenceAll = coherences.testOptional(positions);
        int bestCoherenceTitle = coherences.testOptional(positions, spans.title);
        int bestCoherenceHeading = coherences.testOptional(positions, spans.heading);

        double spanWeightedScore = positionsQuery.root.visit(new CqDoubleSumOperator(positionsQuery, termPos -> {
            if (termPos == null)
                return 0;

            if (spans.title.overlapsRange(termPos))
                return 5.0;
            if (spans.heading.overlapsRange(termPos))
                return 2.5;
            if (spans.code.overlapsRange(termPos))
                return 0.25;
            if (spans.pre.overlapsRange(termPos))
                return 0.25;
            if (spans.nav.overlapsRange(termPos))
                return 0.25;
            if (spans.pageHeader.overlapsRange(termPos))
                return 0.25;
            if (spans.pageFooter.overlapsRange(termPos))
                return 0.25;
            return 1.0;
        }));

        double overallPart = averageSentenceLengthPenalty
                + documentLengthPenalty
                + qualityPenalty
@@ -212,7 +248,11 @@ public class IndexResultScoreCalculator {
                + topologyBonus
                + temporalBias
                + flagsPenalty
                + bestCoherence;
                + bestCoherenceAll
                + bestCoherenceTitle
                + bestCoherenceHeading
                + numCoherenceAll / 4.
                + spanWeightedScore;

        double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / calculateAvgMinDistance(positionsQuery, ctx));
        double tcfFirstPosition = 0.;
@@ -2,6 +2,7 @@ package nu.marginalia.index.results.model;

import it.unimi.dsi.fastutil.ints.IntIterator;
import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint;
import nu.marginalia.index.forward.spans.DocumentSpan;
import nu.marginalia.index.model.SearchTermsUtil;
import nu.marginalia.index.results.model.ids.TermIdList;
import nu.marginalia.sequence.CodedSequence;
@@ -40,7 +41,7 @@ public class TermCoherenceGroupList {

    public int testOptional(CodedSequence[] positions) {
        int best = 0;
        for (var coherenceSet : mandatoryGroups) {
        for (var coherenceSet : optionalGroups) {
            if (coherenceSet.test(positions)) {
                best = Math.max(coherenceSet.size, best);
            }
@@ -48,6 +49,25 @@ public class TermCoherenceGroupList {
        return best;
    }

    public int countOptional(CodedSequence[] positions) {
        int ct = 0;
        for (var coherenceSet : optionalGroups) {
            if (coherenceSet.test(positions)) {
                ct++;
            }
        }
        return ct;
    }

    public int testOptional(CodedSequence[] positions, DocumentSpan span) {
        int best = 0;
        for (var coherenceSet : optionalGroups) {
            if (coherenceSet.test(span, positions)) {
                best = Math.max(coherenceSet.size, best);
            }
        }
        return best;
    }

    public static final class TermCoherenceGroup {
        private final int[] offsets;
@@ -92,5 +112,37 @@ public class TermCoherenceGroupList {

            return SequenceOperations.intersectSequences(sequences);
        }


        public boolean test(DocumentSpan span, CodedSequence[] positions) {
            IntIterator[] sequences = new IntIterator[present.cardinality()];

            for (int oi = 0, si = 0; oi < offsets.length; oi++) {
                if (!present.get(oi)) {
                    continue;
                }
                int offset = offsets[oi];
                if (offset < 0)
                    return false;

                // Create iterators that are offset by their relative position in the
                // sequence. This is done by subtracting the index from the offset,
                // so that when we intersect them, an overlap means that the terms are
                // in the correct order. Note the offset is negative!

                sequences[si++] = positions[offset].offsetIterator(-oi);
            }

            var intersections = SequenceOperations.findIntersections(sequences);

            for (int idx = 0; idx < intersections.size(); idx++) {
                if (span.containsRange(intersections.getInt(idx), sequences.length)) {
                    return true;
                }
            }

            return false;
        }

    }
}
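Editorial note (not part of the diff): the offset-iterator trick in test(DocumentSpan, CodedSequence[])
can be illustrated with plain position lists. With the first term at positions {3, 17} and the second at
{4, 25}, shifting the second list by -1 turns adjacency into equality, so an intersection at 3 marks the
start of a two-word phrase; span.containsRange(3, 2) then asks whether a span such as the title covers
it. A small sketch with made-up positions, assuming fastutil:

import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntList;

class PhraseInSpanSketch {
    public static void main(String[] args) {
        IntList first = IntList.of(3, 17);   // positions of the first term
        IntList second = IntList.of(4, 25);  // positions of the second term

        // Shift the second term back by its place in the phrase (here -1),
        // so equal values mean the terms occur in order, one position apart.
        IntList shifted = new IntArrayList();
        for (int i = 0; i < second.size(); i++) {
            shifted.add(second.getInt(i) - 1); // {3, 24}
        }

        for (int i = 0; i < first.size(); i++) {
            int p = first.getInt(i);
            if (shifted.contains(p)) {
                System.out.println("phrase starts at position " + p); // prints 3
            }
        }
    }
}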
@@ -9,8 +9,8 @@ import nu.marginalia.hash.MurmurHash3_128;
import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
import nu.marginalia.index.index.CombinedIndexReader;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.IndexJournal;
@@ -11,8 +11,8 @@ import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.journal.IndexJournalSlopWriter;
@@ -13,8 +13,8 @@ import nu.marginalia.index.construction.DocIdRewriter;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.journal.IndexJournalSlopWriter;
@@ -1,6 +1,8 @@
package nu.marginalia.sequence;

import it.unimi.dsi.fastutil.ints.IntArrayList;
import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntList;

public class SequenceOperations {

@@ -30,7 +32,7 @@ public class SequenceOperations {
            if (values[i] == max) {
                successes++;
            } else {
                successes = 0;
                successes = 1;

                // Discard values until we reach the maximum value seen so far,
                // or until the end of the sequence is reached
@@ -49,6 +51,63 @@ public class SequenceOperations {
        return true;
    }

    public static IntList findIntersections(IntIterator... sequences) {

        if (sequences.length <= 1)
            return IntList.of();

        // Initialize values and find the maximum value
        int[] values = new int[sequences.length];

        for (int i = 0; i < sequences.length; i++) {
            if (sequences[i].hasNext())
                values[i] = sequences[i].nextInt();
            else
                return IntList.of();
        }

        // Intersect the sequences by advancing all values smaller than the maximum seen so far
        // until they are equal to the maximum value, or until the end of the sequence is reached
        int max = Integer.MIN_VALUE;
        int successes = 0;

        IntList ret = new IntArrayList();

        outer:
        for (int i = 0;; i = (i + 1) % sequences.length)
        {
            if (successes == sequences.length) {
                ret.add(max);
                successes = 1;

                if (sequences[i].hasNext()) {
                    max = sequences[i].nextInt();
                } else {
                    break;
                }
            } else if (values[i] == max) {
                successes++;
            } else {
                successes = 1;

                // Discard values until we reach the maximum value seen so far,
                // or until the end of the sequence is reached
                while (values[i] < max) {
                    if (sequences[i].hasNext()) {
                        values[i] = sequences[i].nextInt();
                    } else {
                        break outer;
                    }
                }

                // Update the maximum value, if necessary
                max = Math.max(max, values[i]);
            }
        }

        return ret;
    }

    /** Return the minimum word distance between two sequences, or a negative value if either sequence is empty.
     * */
    public static int minDistance(IntIterator seqA, IntIterator seqB)
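Editorial note (not part of the diff): a tiny usage sketch for the findIntersections method added above,
using plain IntList iterators and made-up positions; the test file further down exercises the same method
with gamma-coded sequences:

import it.unimi.dsi.fastutil.ints.IntList;
import nu.marginalia.sequence.SequenceOperations;

class FindIntersectionsSketch {
    public static void main(String[] args) {
        IntList a = IntList.of(1, 3, 8);
        IntList b = IntList.of(2, 8, 14);

        // Positions present in every sequence; expected to print [8]
        System.out.println(SequenceOperations.findIntersections(a.iterator(), b.iterator()));
    }
}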
@@ -162,7 +162,15 @@ public class BitReader {
        }
        else { // There's no more data to read!
            refillCallback.run();
            readNext();
            if (underlying.hasRemaining()) {
                readNext();
            }
            else {
                // We've attempted to re-fill the buffer, but there's still no data to read, so we fail to avoid
                // blowing up the stack with recursion
                throw new IllegalStateException("No more data to read after attempted re-fill of underlying buffer");
            }

        }
    }
}

@@ -1,6 +1,6 @@
package nu.marginalia.sequence;

import it.unimi.dsi.fastutil.ints.IntIterator;
import it.unimi.dsi.fastutil.ints.IntList;
import org.junit.jupiter.api.Test;

import java.nio.ByteBuffer;
@@ -63,6 +63,17 @@ class SequenceOperationsTest {
        assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator(), seq3.iterator()));
    }

    @Test
    void intersectSequencesDeepMatch3findIntersections() {
        ByteBuffer wa = ByteBuffer.allocate(1024);
        GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1, 3, 4, 7, 8, 9, 10, 11);
        GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2, 5, 8, 10, 14);
        GammaCodedSequence seq3 = GammaCodedSequence.generate(wa, 1, 5, 8, 9, 10);

        assertEquals(IntList.of(8, 10), SequenceOperations.findIntersections(seq1.iterator(), seq2.iterator(), seq3.iterator()));
    }


    @Test
    void intersectSequencesDeepMismatch() {
        ByteBuffer wa = ByteBuffer.allocate(1024);

@@ -54,8 +54,8 @@ public class HtmlStringTagger implements NodeVisitor {
            case "code" -> pushTag(HtmlTag.CODE, el);
            case "title" -> pushTag(HtmlTag.TITLE, el);
            case "nav" -> pushTag(HtmlTag.NAV, el);
            case "header" -> pushTag(HtmlTag.HEADER, el);
            case "footer" -> pushTag(HtmlTag.FOOTER, el);
            case "header" -> pushTag(HtmlTag.PAGE_HEADER, el);
            case "footer" -> pushTag(HtmlTag.PAGE_FOOTER, el);
            case "h1", "h2", "h3", "h4", "h5", "h6" -> pushTag(HtmlTag.HEADING, el);
        }
    }

@@ -1,21 +1,21 @@
package nu.marginalia.language.sentence.tag;

public enum HtmlTag {
    SCRIPT('s', true, false),
    STYLE('S', true, false),
    CODE('c', false, true),
    PRE('p', false, true),
    TITLE('t', false, false),
    HEADING('h', false, false),
    NAV('n', false, false),
    HEADER('H',false, false),
    FOOTER('f', false, false);
    SCRIPT((byte) 's', true, false),
    STYLE((byte) 'S', true, false),
    CODE((byte) 'c', false, true),
    PRE((byte) 'p', false, true),
    TITLE((byte) 't', false, false),
    HEADING((byte) 'h', false, false),
    NAV((byte) 'n', false, false),
    PAGE_HEADER((byte) 'H',false, false),
    PAGE_FOOTER((byte) 'f', false, false);

    public char code;
    public byte code;
    public boolean exclude;
    public boolean nonLanguage;

    HtmlTag(char code, boolean exclude, boolean nonLanguage) {
    HtmlTag(byte code, boolean exclude, boolean nonLanguage) {
        this.code = code;
        this.exclude = exclude;
        this.nonLanguage = nonLanguage;

@@ -144,7 +144,7 @@ public class DocumentKeywordsBuilder {

    public void addSpans(List<DocumentWordSpan> newSpans) {
        for (var span : newSpans) {
            wordSpans.computeIfAbsent(span.tag().code, k -> new ArrayList<>()).add(span);
            wordSpans.computeIfAbsent((char) span.tag().code, k -> new ArrayList<>()).add(span);
        }
    }

@@ -9,8 +9,8 @@ import nu.marginalia.ProcessConfigurationModule;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.model.gson.GsonFactory;
import nu.marginalia.model.id.UrlIdCodec;
@@ -18,8 +18,8 @@ import nu.marginalia.index.ReverseIndexPrioFileNames;
import nu.marginalia.index.construction.full.FullIndexConstructor;
import nu.marginalia.index.construction.prio.PrioIndexConstructor;
import nu.marginalia.index.domainrankings.DomainRankings;
import nu.marginalia.index.forward.ForwardIndexConverter;
import nu.marginalia.index.forward.ForwardIndexFileNames;
import nu.marginalia.index.forward.construction.ForwardIndexConverter;
import nu.marginalia.index.index.StatefulIndex;
import nu.marginalia.index.journal.IndexJournal;
import nu.marginalia.index.model.SearchParameters;