From b316b55be99386f0b4e42b600b5160cbc75c2731 Mon Sep 17 00:00:00 2001 From: Viktor Lofgren Date: Tue, 30 Jul 2024 12:01:53 +0200 Subject: [PATCH] (index) Experimental initial integration of document spans into index --- .../aggregate/CompiledQueryAggregates.java | 9 -- .../aggregate/CqPositionsOperator.java | 85 ------------------- code/index/index-forward/build.gradle | 1 + .../index/forward/ForwardIndexParameters.java | 2 +- .../index/forward/ForwardIndexReader.java | 19 +++++ .../ForwardIndexConverter.java | 4 +- .../index/forward/spans/DocumentSpan.java | 77 +++++++++++++++++ .../index/forward/spans/DocumentSpans.java | 35 ++++++++ .../{ => spans}/ForwardIndexSpansReader.java | 26 ++---- .../{ => spans}/ForwardIndexSpansWriter.java | 5 +- .../index/forward/spans/SpansCodec.java | 17 ++++ .../forward/ForwardIndexConverterTest.java | 1 + .../forward/ForwardIndexSpansReaderTest.java | 31 ++++--- .../nu/marginalia/index/IndexGrpcService.java | 3 + .../index/index/CombinedIndexReader.java | 9 +- .../results/IndexResultRankingService.java | 2 +- .../results/IndexResultScoreCalculator.java | 54 ++++++++++-- .../results/model/TermCoherenceGroupList.java | 54 +++++++++++- .../index/CombinedIndexReaderTest.java | 2 +- ...IndexQueryServiceIntegrationSmokeTest.java | 2 +- .../IndexQueryServiceIntegrationTest.java | 2 +- .../sequence/SequenceOperations.java | 61 ++++++++++++- .../nu/marginalia/sequence/io/BitReader.java | 10 ++- .../sequence/SequenceOperationsTest.java | 13 ++- .../sentence/tag/HtmlStringTagger.java | 4 +- .../language/sentence/tag/HtmlTag.java | 22 ++--- .../model/DocumentKeywordsBuilder.java | 2 +- .../index/IndexConstructorMain.java | 2 +- .../test/nu/marginalia/IntegrationTest.java | 2 +- 29 files changed, 394 insertions(+), 162 deletions(-) delete mode 100644 code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java rename code/index/index-forward/java/nu/marginalia/index/forward/{ => construction}/ForwardIndexConverter.java (97%) create mode 100644 code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java create mode 100644 code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpans.java rename code/index/index-forward/java/nu/marginalia/index/forward/{ => spans}/ForwardIndexSpansReader.java (56%) rename code/index/index-forward/java/nu/marginalia/index/forward/{ => spans}/ForwardIndexSpansWriter.java (93%) create mode 100644 code/index/index-forward/java/nu/marginalia/index/forward/spans/SpansCodec.java diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java index 2ca45dca..7dd48394 100644 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java +++ b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CompiledQueryAggregates.java @@ -59,13 +59,4 @@ public class CompiledQueryAggregates { return new ArrayList<>(query.root().visit(new CqQueryPathsOperator(query))); } - /** Using the bitwise AND operator, aggregate all possible combined values of the long generated by the provided operator */ - public static LongSet positionsAggregate(CompiledQuery query, ToLongFunction operator) { - return query.root().visit(new CqPositionsOperator(query, operator)); - } - - /** Using the bitwise AND operator, aggregate all possible combined values of the long generated by the provided operator */ - public static LongSet positionsAggregate(CompiledQueryLong query, LongUnaryOperator operator) { - return query.root().visit(new CqPositionsOperator(query, operator)); - } } diff --git a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java b/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java deleted file mode 100644 index 715c4cb2..00000000 --- a/code/functions/search-query/api/java/nu/marginalia/api/searchquery/model/compiled/aggregate/CqPositionsOperator.java +++ /dev/null @@ -1,85 +0,0 @@ -package nu.marginalia.api.searchquery.model.compiled.aggregate; - -import it.unimi.dsi.fastutil.longs.LongArraySet; -import it.unimi.dsi.fastutil.longs.LongOpenHashSet; -import it.unimi.dsi.fastutil.longs.LongSet; -import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; -import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; -import nu.marginalia.api.searchquery.model.compiled.CqExpression; - -import java.util.List; -import java.util.function.IntToLongFunction; -import java.util.function.LongUnaryOperator; -import java.util.function.ToLongFunction; - -public class CqPositionsOperator implements CqExpression.ObjectVisitor { - private final IntToLongFunction operator; - - public CqPositionsOperator(CompiledQuery query, ToLongFunction operator) { - this.operator = idx -> operator.applyAsLong(query.at(idx)); - } - - public CqPositionsOperator(CompiledQueryLong query, LongUnaryOperator operator) { - this.operator = idx -> operator.applyAsLong(query.at(idx)); - } - - @Override - public LongSet onAnd(List parts) { - LongSet ret = new LongArraySet(); - - for (var part : parts) { - ret = comineSets(ret, part.visit(this)); - } - - return ret; - } - - private LongSet comineSets(LongSet a, LongSet b) { - if (a.isEmpty()) - return b; - if (b.isEmpty()) - return a; - - LongSet ret = newSet(a.size() * b.size()); - - var ai = a.longIterator(); - - while (ai.hasNext()) { - long aval = ai.nextLong(); - - var bi = b.longIterator(); - while (bi.hasNext()) { - ret.add(aval & bi.nextLong()); - } - } - - return ret; - } - - @Override - public LongSet onOr(List parts) { - LongSet ret = newSet(parts.size()); - - for (var part : parts) { - ret.addAll(part.visit(this)); - } - - return ret; - } - - @Override - public LongSet onLeaf(int idx) { - var set = newSet(1); - set.add(operator.applyAsLong(idx)); - return set; - } - - /** Allocate a new set suitable for a collection with the provided cardinality */ - private LongSet newSet(int cardinality) { - if (cardinality < 8) - return new LongArraySet(cardinality); - else - return new LongOpenHashSet(cardinality); - } - -} diff --git a/code/index/index-forward/build.gradle b/code/index/index-forward/build.gradle index 3506281f..cb3a3c19 100644 --- a/code/index/index-forward/build.gradle +++ b/code/index/index-forward/build.gradle @@ -17,6 +17,7 @@ dependencies { implementation project(':code:libraries:btree') implementation project(':code:libraries:slop') implementation project(':code:libraries:coded-sequence') + implementation project(':code:libraries:language-processing') implementation project(':code:index:query') implementation project(':code:index:index-journal') implementation project(':code:common:model') diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexParameters.java b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexParameters.java index cef76eb0..0d9eea61 100644 --- a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexParameters.java +++ b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexParameters.java @@ -1,6 +1,6 @@ package nu.marginalia.index.forward; -class ForwardIndexParameters { +public class ForwardIndexParameters { public static final int ENTRY_SIZE = 3; public static final int METADATA_OFFSET = 0; public static final int FEATURES_OFFSET = 1; diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexReader.java b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexReader.java index 902c7344..c4ab010d 100644 --- a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexReader.java +++ b/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexReader.java @@ -3,11 +3,14 @@ package nu.marginalia.index.forward; import gnu.trove.map.hash.TLongIntHashMap; import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArrayFactory; +import nu.marginalia.index.forward.spans.DocumentSpans; +import nu.marginalia.index.forward.spans.ForwardIndexSpansReader; import nu.marginalia.model.id.UrlIdCodec; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; +import java.lang.foreign.Arena; import java.nio.file.Files; import java.nio.file.Path; @@ -30,6 +33,7 @@ public class ForwardIndexReader { private final LongArray data; private final ForwardIndexSpansReader spansReader; + private final Logger logger = LoggerFactory.getLogger(getClass()); public ForwardIndexReader(Path idsFile, @@ -121,6 +125,21 @@ public class ForwardIndexReader { return idToOffset.get(docId); } + public DocumentSpans getDocumentSpans(Arena arena, long docId) { + long offset = idxForDoc(docId); + if (offset < 0) return new DocumentSpans(); + + long encodedOffset = data.get(ENTRY_SIZE * offset + SPANS_OFFSET); + + try { + return spansReader.readSpans(arena, encodedOffset); + } + catch (IOException ex) { + logger.error("Failed to read spans for doc " + docId, ex); + return new DocumentSpans(); + } + } + public int totalDocCount() { return idToOffset.size(); diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexConverter.java b/code/index/index-forward/java/nu/marginalia/index/forward/construction/ForwardIndexConverter.java similarity index 97% rename from code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexConverter.java rename to code/index/index-forward/java/nu/marginalia/index/forward/construction/ForwardIndexConverter.java index 72bdd71f..a216b584 100644 --- a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexConverter.java +++ b/code/index/index-forward/java/nu/marginalia/index/forward/construction/ForwardIndexConverter.java @@ -1,9 +1,11 @@ -package nu.marginalia.index.forward; +package nu.marginalia.index.forward.construction; import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap; import nu.marginalia.array.LongArray; import nu.marginalia.array.LongArrayFactory; import nu.marginalia.index.domainrankings.DomainRankings; +import nu.marginalia.index.forward.ForwardIndexParameters; +import nu.marginalia.index.forward.spans.ForwardIndexSpansWriter; import nu.marginalia.index.journal.IndexJournal; import nu.marginalia.model.id.UrlIdCodec; import nu.marginalia.model.idx.DocumentMetadata; diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java new file mode 100644 index 00000000..f1f0c6c7 --- /dev/null +++ b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpan.java @@ -0,0 +1,77 @@ +package nu.marginalia.index.forward.spans; + +import it.unimi.dsi.fastutil.ints.IntIterator; +import it.unimi.dsi.fastutil.ints.IntList; +import nu.marginalia.sequence.CodedSequence; +import nu.marginalia.sequence.SequenceOperations; + +public class DocumentSpan { + + /** A list of the interlaced start and end positions of each span in the document of this type */ + private final CodedSequence startsEnds; + + public DocumentSpan(CodedSequence startsEnds) { + this.startsEnds = startsEnds; + } + + public DocumentSpan() { + this.startsEnds = null; + } + + public boolean containsPosition(int position) { + if (startsEnds == null) { + return false; + } + + var iter = startsEnds.iterator(); + while (iter.hasNext()) { + int start = iter.nextInt(); + if (start > position) { + return false; + } + int end = iter.nextInt(); + if (end > position) { + return true; + } + } + + return false; + } + + public boolean containsRange(int rangeStart, int len) { + if (startsEnds == null) { + return false; + } + + var iter = startsEnds.iterator(); + while (iter.hasNext()) { + int start = iter.nextInt(); + if (start > rangeStart) { + return false; + } + int end = iter.nextInt(); + if (end > rangeStart + len) { + return true; + } + } + + return false; + } + + public boolean overlapsRange(CodedSequence sequence) { + return SequenceOperations.intersectSequences(iterator(), sequence.iterator()); + } + + /** Returns an iterator over the start and end positions of each span in the document of this type */ + public IntIterator iterator() { + if (null == startsEnds) { + return IntList.of().iterator(); + } + + return startsEnds.iterator(); + } + + public int size() { + return startsEnds.valueCount() / 2; + } +} diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpans.java b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpans.java new file mode 100644 index 00000000..a8ed94f0 --- /dev/null +++ b/code/index/index-forward/java/nu/marginalia/index/forward/spans/DocumentSpans.java @@ -0,0 +1,35 @@ +package nu.marginalia.index.forward.spans; + +import nu.marginalia.language.sentence.tag.HtmlTag; +import nu.marginalia.sequence.CodedSequence; + +public class DocumentSpans { + private static DocumentSpan EMPTY_SPAN = new DocumentSpan(); + + public DocumentSpan title = EMPTY_SPAN; + public DocumentSpan heading = EMPTY_SPAN; + + public DocumentSpan nav = EMPTY_SPAN; + public DocumentSpan pageHeader = EMPTY_SPAN; + public DocumentSpan pageFooter = EMPTY_SPAN; + public DocumentSpan code = EMPTY_SPAN; + public DocumentSpan pre = EMPTY_SPAN; + + void accept(byte code, CodedSequence positions) { + if (code == HtmlTag.HEADING.code) + this.heading = new DocumentSpan(positions); + else if (code == HtmlTag.TITLE.code) + this.title = new DocumentSpan(positions); + else if (code == HtmlTag.NAV.code) + this.nav = new DocumentSpan(positions); + else if (code == HtmlTag.PAGE_HEADER.code) + this.pageHeader = new DocumentSpan(positions); + else if (code == HtmlTag.PAGE_FOOTER.code) + this.pageFooter = new DocumentSpan(positions); + else if (code == HtmlTag.CODE.code) + this.code = new DocumentSpan(positions); + else if (code == HtmlTag.PRE.code) + this.pre = new DocumentSpan(positions); + } + +} diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexSpansReader.java b/code/index/index-forward/java/nu/marginalia/index/forward/spans/ForwardIndexSpansReader.java similarity index 56% rename from code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexSpansReader.java rename to code/index/index-forward/java/nu/marginalia/index/forward/spans/ForwardIndexSpansReader.java index a670658d..5bbadb08 100644 --- a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexSpansReader.java +++ b/code/index/index-forward/java/nu/marginalia/index/forward/spans/ForwardIndexSpansReader.java @@ -1,6 +1,5 @@ -package nu.marginalia.index.forward; +package nu.marginalia.index.forward.spans; -import it.unimi.dsi.fastutil.ints.IntList; import nu.marginalia.sequence.GammaCodedSequence; import java.io.IOException; @@ -9,8 +8,6 @@ import java.nio.channels.FileChannel; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardOpenOption; -import java.util.ArrayList; -import java.util.List; @SuppressWarnings("preview") public class ForwardIndexSpansReader implements AutoCloseable { @@ -20,9 +17,9 @@ public class ForwardIndexSpansReader implements AutoCloseable { this.spansFileChannel = (FileChannel) Files.newByteChannel(spansFile, StandardOpenOption.READ); } - public List readSpans(Arena arena, long encodedOffset) throws IOException { - long size = encodedOffset & 0xFFF_FFFF; - long offset = encodedOffset >>> 28; + public DocumentSpans readSpans(Arena arena, long encodedOffset) throws IOException { + long size = SpansCodec.decodeSize(encodedOffset); + long offset = SpansCodec.decodeStartOffset(encodedOffset); var buffer = arena.allocate(size).asByteBuffer(); buffer.clear(); @@ -33,22 +30,16 @@ public class ForwardIndexSpansReader implements AutoCloseable { int count = buffer.get(); - List ret = new ArrayList<>(); + DocumentSpans ret = new DocumentSpans(); + while (count-- > 0) { byte code = buffer.get(); short len = buffer.getShort(); - final int pos = buffer.position(); - - // Decode the gamma-coded sequence; this will advance the buffer position - // in a not entirely predictable way, so we need to save the position - buffer.limit(buffer.position() + len); - var sequence = new GammaCodedSequence(buffer).values(); - ret.add(new SpanData(code, sequence)); + ret.accept(code, new GammaCodedSequence(buffer.slice(buffer.position(), len))); // Reset the buffer position to the end of the span - buffer.position(pos + len); - buffer.limit(buffer.capacity()); + buffer.position(buffer.position() + len); } return ret; @@ -59,5 +50,4 @@ public class ForwardIndexSpansReader implements AutoCloseable { spansFileChannel.close(); } - public record SpanData(byte code, IntList data) {} } diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexSpansWriter.java b/code/index/index-forward/java/nu/marginalia/index/forward/spans/ForwardIndexSpansWriter.java similarity index 93% rename from code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexSpansWriter.java rename to code/index/index-forward/java/nu/marginalia/index/forward/spans/ForwardIndexSpansWriter.java index 973257c0..4bdebd59 100644 --- a/code/index/index-forward/java/nu/marginalia/index/forward/ForwardIndexSpansWriter.java +++ b/code/index/index-forward/java/nu/marginalia/index/forward/spans/ForwardIndexSpansWriter.java @@ -1,4 +1,4 @@ -package nu.marginalia.index.forward; +package nu.marginalia.index.forward.spans; import java.io.IOException; import java.nio.ByteBuffer; @@ -42,8 +42,7 @@ public class ForwardIndexSpansWriter implements AutoCloseable { } public long endRecord() { - return stateStartOffset << 28 | stateLength; - + return SpansCodec.encode(stateStartOffset, stateLength); } @Override diff --git a/code/index/index-forward/java/nu/marginalia/index/forward/spans/SpansCodec.java b/code/index/index-forward/java/nu/marginalia/index/forward/spans/SpansCodec.java new file mode 100644 index 00000000..7330f593 --- /dev/null +++ b/code/index/index-forward/java/nu/marginalia/index/forward/spans/SpansCodec.java @@ -0,0 +1,17 @@ +package nu.marginalia.index.forward.spans; + +public class SpansCodec { + public static long encode(long startOffset, long size) { + assert size < 0x1000_0000L : "Size must be less than 2^28"; + + return startOffset << 28 | (size & 0xFFF_FFFFL); + } + + public static long decodeStartOffset(long encoded) { + return encoded >>> 28; + } + + public static long decodeSize(long encoded) { + return encoded & 0x0FFF_FFFFL; + } +} diff --git a/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexConverterTest.java b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexConverterTest.java index 0c5255d5..59026876 100644 --- a/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexConverterTest.java +++ b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexConverterTest.java @@ -2,6 +2,7 @@ package nu.marginalia.index.forward; import lombok.SneakyThrows; import nu.marginalia.index.domainrankings.DomainRankings; +import nu.marginalia.index.forward.construction.ForwardIndexConverter; import nu.marginalia.index.journal.IndexJournal; import nu.marginalia.index.journal.IndexJournalSlopWriter; import nu.marginalia.model.id.UrlIdCodec; diff --git a/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java index b77a0f5a..055a50a4 100644 --- a/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java +++ b/code/index/index-forward/test/nu/marginalia/index/forward/ForwardIndexSpansReaderTest.java @@ -1,6 +1,7 @@ package nu.marginalia.index.forward; -import it.unimi.dsi.fastutil.ints.IntList; +import nu.marginalia.index.forward.spans.ForwardIndexSpansReader; +import nu.marginalia.index.forward.spans.ForwardIndexSpansWriter; import nu.marginalia.sequence.GammaCodedSequence; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Test; @@ -11,7 +12,7 @@ import java.nio.ByteBuffer; import java.nio.file.Files; import java.nio.file.Path; -import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.*; class ForwardIndexSpansReaderTest { Path testFile = Files.createTempFile("test", ".idx"); @@ -32,12 +33,12 @@ class ForwardIndexSpansReaderTest { long offset2; try (var writer = new ForwardIndexSpansWriter(testFile)) { writer.beginRecord(1); - writer.writeSpan((byte) 'a', GammaCodedSequence.generate(wa, 1, 3, 5).buffer()); + writer.writeSpan((byte) 'h', GammaCodedSequence.generate(wa, 1, 3, 5, 8).buffer()); offset1 = writer.endRecord(); writer.beginRecord(2); - writer.writeSpan((byte) 'b', GammaCodedSequence.generate(wa, 2, 4, 6).buffer()); - writer.writeSpan((byte) 'c', GammaCodedSequence.generate(wa, 3, 5, 7).buffer()); + writer.writeSpan((byte) 'c', GammaCodedSequence.generate(wa, 2, 4, 6, 7).buffer()); + writer.writeSpan((byte) 'p', GammaCodedSequence.generate(wa, 3, 5).buffer()); offset2 = writer.endRecord(); } @@ -47,17 +48,21 @@ class ForwardIndexSpansReaderTest { var spans1 = reader.readSpans(arena, offset1); var spans2 = reader.readSpans(arena, offset2); - assertEquals(1, spans1.size()); + assertEquals(2, spans1.heading.size()); - assertEquals('a', spans1.get(0).code()); - assertEquals(IntList.of(1, 3, 5), spans1.get(0).data()); + assertEquals(2, spans2.code.size()); - assertEquals(2, spans2.size()); + assertFalse(spans2.code.containsPosition(1)); + assertTrue(spans2.code.containsPosition(3)); + assertFalse(spans2.code.containsPosition(5)); + assertTrue(spans2.code.containsPosition(6)); + assertFalse(spans2.code.containsPosition(7)); + assertFalse(spans2.code.containsPosition(8)); - assertEquals('b', spans2.get(0).code()); - assertEquals(IntList.of(2, 4, 6), spans2.get(0).data()); - assertEquals('c', spans2.get(1).code()); - assertEquals(IntList.of(3, 5, 7), spans2.get(1).data()); + assertEquals(1, spans2.pre.size()); + + assertEquals(0, spans2.pageFooter.size()); + assertFalse(spans2.pageFooter.containsPosition(8)); } } } \ No newline at end of file diff --git a/code/index/java/nu/marginalia/index/IndexGrpcService.java b/code/index/java/nu/marginalia/index/IndexGrpcService.java index 2b075e58..b16b456d 100644 --- a/code/index/java/nu/marginalia/index/IndexGrpcService.java +++ b/code/index/java/nu/marginalia/index/IndexGrpcService.java @@ -317,6 +317,9 @@ public class IndexGrpcService extends IndexApiGrpc.IndexApiImplBase { try { executeSearch(); } + catch (Exception ex) { + logger.error("Error in index lookup", ex); + } finally { synchronized (remainingIndexTasks) { if (remainingIndexTasks.decrementAndGet() == 0) { diff --git a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java index 01a5fd06..de52d1c5 100644 --- a/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java +++ b/code/index/java/nu/marginalia/index/index/CombinedIndexReader.java @@ -8,6 +8,7 @@ import nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggre import nu.marginalia.index.FullReverseIndexReader; import nu.marginalia.index.PrioReverseIndexReader; import nu.marginalia.index.forward.ForwardIndexReader; +import nu.marginalia.index.forward.spans.DocumentSpans; import nu.marginalia.index.model.QueryParams; import nu.marginalia.index.model.SearchTerms; import nu.marginalia.index.query.IndexQuery; @@ -186,11 +187,17 @@ public class CombinedIndexReader { /** Retrieves the HTML features for the specified document */ public int getHtmlFeatures(long docId) { return forwardIndexReader.getHtmlFeatures(docId); - } /** Retrieves the HTML features for the specified document */ + } + + /** Retrieves the HTML features for the specified document */ public int getDocumentSize(long docId) { return forwardIndexReader.getDocumentSize(docId); } + /** Retrieves the document spans for the specified document */ + public DocumentSpans getDocumentSpans(Arena arena, long docId) { + return forwardIndexReader.getDocumentSpans(arena, docId); + } /** Close the indexes (this is not done immediately) * */ diff --git a/code/index/java/nu/marginalia/index/results/IndexResultRankingService.java b/code/index/java/nu/marginalia/index/results/IndexResultRankingService.java index 3973b016..59fda6f8 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultRankingService.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultRankingService.java @@ -98,7 +98,7 @@ public class IndexResultRankingService { } // Calculate the preliminary score - var score = resultRanker.calculateScore(resultIds.at(i), searchTerms, flags, positions); + var score = resultRanker.calculateScore(arena, resultIds.at(i), searchTerms, flags, positions); if (score != null) { results.add(score); } diff --git a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java index 751839bd..127b1bbb 100644 --- a/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java +++ b/code/index/java/nu/marginalia/index/results/IndexResultScoreCalculator.java @@ -3,15 +3,18 @@ package nu.marginalia.index.results; import nu.marginalia.api.searchquery.model.compiled.CompiledQuery; import nu.marginalia.api.searchquery.model.compiled.CompiledQueryInt; import nu.marginalia.api.searchquery.model.compiled.CompiledQueryLong; +import nu.marginalia.api.searchquery.model.compiled.aggregate.CqDoubleSumOperator; import nu.marginalia.api.searchquery.model.results.ResultRankingContext; import nu.marginalia.api.searchquery.model.results.ResultRankingParameters; import nu.marginalia.api.searchquery.model.results.SearchResultItem; +import nu.marginalia.index.forward.spans.DocumentSpans; import nu.marginalia.index.index.CombinedIndexReader; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.model.QueryParams; import nu.marginalia.index.model.SearchParameters; import nu.marginalia.index.query.limit.QueryStrategy; import nu.marginalia.index.results.model.QuerySearchTerms; +import nu.marginalia.index.results.model.TermCoherenceGroupList; import nu.marginalia.model.crawl.HtmlFeature; import nu.marginalia.model.crawl.PubDate; import nu.marginalia.model.id.UrlIdCodec; @@ -22,6 +25,7 @@ import nu.marginalia.sequence.CodedSequence; import nu.marginalia.sequence.SequenceOperations; import javax.annotation.Nullable; +import java.lang.foreign.Arena; import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.booleanAggregate; import static nu.marginalia.api.searchquery.model.compiled.aggregate.CompiledQueryAggregates.intMaxMinAggregate; @@ -50,7 +54,8 @@ public class IndexResultScoreCalculator { private final long flagsFilterMask = WordFlags.Title.asBit() | WordFlags.Subjects.asBit() | WordFlags.UrlDomain.asBit() | WordFlags.UrlPath.asBit() | WordFlags.ExternalLink.asBit(); @Nullable - public SearchResultItem calculateScore(long combinedId, + public SearchResultItem calculateScore(Arena arena, + long combinedId, QuerySearchTerms searchTerms, long[] wordFlags, CodedSequence[] positions) @@ -78,8 +83,7 @@ public class IndexResultScoreCalculator { long docMetadata = index.getDocumentMetadata(docId); int htmlFeatures = index.getHtmlFeatures(docId); int docSize = index.getDocumentSize(docId); - - int bestCoherence = searchTerms.coherences.testOptional(positions); + DocumentSpans spans = index.getDocumentSpans(arena, docId); double score = calculateSearchResultValue( wordFlagsQuery, @@ -88,7 +92,9 @@ public class IndexResultScoreCalculator { docMetadata, htmlFeatures, docSize, - bestCoherence, + spans, + positions, + searchTerms.coherences, rankingContext); SearchResultItem searchResult = new SearchResultItem(docId, @@ -169,10 +175,13 @@ public class IndexResultScoreCalculator { public double calculateSearchResultValue(CompiledQueryLong wordFlagsQuery, CompiledQueryInt positionsCountQuery, - CompiledQuery positionsQuery, long documentMetadata, + CompiledQuery positionsQuery, + long documentMetadata, int features, int length, - int bestCoherence, + DocumentSpans spans, + CodedSequence[] positions, + TermCoherenceGroupList coherences, ResultRankingContext ctx) { if (length < 0) { @@ -205,6 +214,33 @@ public class IndexResultScoreCalculator { temporalBias = 0; } + + int numCoherenceAll = coherences.countOptional(positions); + int bestCoherenceAll = coherences.testOptional(positions); + int bestCoherenceTitle = coherences.testOptional(positions, spans.title); + int bestCoherenceHeading = coherences.testOptional(positions, spans.heading); + + double spanWeightedScore = positionsQuery.root.visit(new CqDoubleSumOperator(positionsQuery, termPos -> { + if (termPos == null) + return 0; + + if (spans.title.overlapsRange(termPos)) + return 5.0; + if (spans.heading.overlapsRange(termPos)) + return 2.5; + if (spans.code.overlapsRange(termPos)) + return 0.25; + if (spans.pre.overlapsRange(termPos)) + return 0.25; + if (spans.nav.overlapsRange(termPos)) + return 0.25; + if (spans.pageHeader.overlapsRange(termPos)) + return 0.25; + if (spans.pageFooter.overlapsRange(termPos)) + return 0.25; + return 1.0; + })); + double overallPart = averageSentenceLengthPenalty + documentLengthPenalty + qualityPenalty @@ -212,7 +248,11 @@ public class IndexResultScoreCalculator { + topologyBonus + temporalBias + flagsPenalty - + bestCoherence; + + bestCoherenceAll + + bestCoherenceTitle + + bestCoherenceHeading + + numCoherenceAll / 4. + + spanWeightedScore; double tcfAvgDist = rankingParams.tcfAvgDist * (1.0 / calculateAvgMinDistance(positionsQuery, ctx)); double tcfFirstPosition = 0.; diff --git a/code/index/java/nu/marginalia/index/results/model/TermCoherenceGroupList.java b/code/index/java/nu/marginalia/index/results/model/TermCoherenceGroupList.java index b8cce960..9096af7a 100644 --- a/code/index/java/nu/marginalia/index/results/model/TermCoherenceGroupList.java +++ b/code/index/java/nu/marginalia/index/results/model/TermCoherenceGroupList.java @@ -2,6 +2,7 @@ package nu.marginalia.index.results.model; import it.unimi.dsi.fastutil.ints.IntIterator; import nu.marginalia.api.searchquery.model.query.SearchCoherenceConstraint; +import nu.marginalia.index.forward.spans.DocumentSpan; import nu.marginalia.index.model.SearchTermsUtil; import nu.marginalia.index.results.model.ids.TermIdList; import nu.marginalia.sequence.CodedSequence; @@ -40,7 +41,7 @@ public class TermCoherenceGroupList { public int testOptional(CodedSequence[] positions) { int best = 0; - for (var coherenceSet : mandatoryGroups) { + for (var coherenceSet : optionalGroups) { if (coherenceSet.test(positions)) { best = Math.max(coherenceSet.size, best); } @@ -48,6 +49,25 @@ public class TermCoherenceGroupList { return best; } + public int countOptional(CodedSequence[] positions) { + int ct = 0; + for (var coherenceSet : optionalGroups) { + if (coherenceSet.test(positions)) { + ct++; + } + } + return ct; + } + + public int testOptional(CodedSequence[] positions, DocumentSpan span) { + int best = 0; + for (var coherenceSet : optionalGroups) { + if (coherenceSet.test(span, positions)) { + best = Math.max(coherenceSet.size, best); + } + } + return best; + } public static final class TermCoherenceGroup { private final int[] offsets; @@ -92,5 +112,37 @@ public class TermCoherenceGroupList { return SequenceOperations.intersectSequences(sequences); } + + + public boolean test(DocumentSpan span, CodedSequence[] positions) { + IntIterator[] sequences = new IntIterator[present.cardinality()]; + + for (int oi = 0, si = 0; oi < offsets.length; oi++) { + if (!present.get(oi)) { + continue; + } + int offset = offsets[oi]; + if (offset < 0) + return false; + + // Create iterators that are offset by their relative position in the + // sequence. This is done by subtracting the index from the offset, + // so that when we intersect them, an overlap means that the terms are + // in the correct order. Note the offset is negative! + + sequences[si++] = positions[offset].offsetIterator(-oi); + } + + var intersections = SequenceOperations.findIntersections(sequences); + + for (int idx = 0; idx < intersections.size(); idx++) { + if (span.containsRange(intersections.getInt(idx), sequences.length)) { + return true; + } + } + + return false; + } + } } diff --git a/code/index/test/nu/marginalia/index/CombinedIndexReaderTest.java b/code/index/test/nu/marginalia/index/CombinedIndexReaderTest.java index f52d1b99..379ff399 100644 --- a/code/index/test/nu/marginalia/index/CombinedIndexReaderTest.java +++ b/code/index/test/nu/marginalia/index/CombinedIndexReaderTest.java @@ -9,8 +9,8 @@ import nu.marginalia.hash.MurmurHash3_128; import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.full.FullIndexConstructor; import nu.marginalia.index.domainrankings.DomainRankings; -import nu.marginalia.index.forward.ForwardIndexConverter; import nu.marginalia.index.forward.ForwardIndexFileNames; +import nu.marginalia.index.forward.construction.ForwardIndexConverter; import nu.marginalia.index.index.CombinedIndexReader; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.journal.IndexJournal; diff --git a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java index 39c54fa6..60501571 100644 --- a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java +++ b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationSmokeTest.java @@ -11,8 +11,8 @@ import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.full.FullIndexConstructor; import nu.marginalia.index.construction.prio.PrioIndexConstructor; import nu.marginalia.index.domainrankings.DomainRankings; -import nu.marginalia.index.forward.ForwardIndexConverter; import nu.marginalia.index.forward.ForwardIndexFileNames; +import nu.marginalia.index.forward.construction.ForwardIndexConverter; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.journal.IndexJournal; import nu.marginalia.index.journal.IndexJournalSlopWriter; diff --git a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java index 44c73cb8..eb83f714 100644 --- a/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java +++ b/code/index/test/nu/marginalia/index/IndexQueryServiceIntegrationTest.java @@ -13,8 +13,8 @@ import nu.marginalia.index.construction.DocIdRewriter; import nu.marginalia.index.construction.full.FullIndexConstructor; import nu.marginalia.index.construction.prio.PrioIndexConstructor; import nu.marginalia.index.domainrankings.DomainRankings; -import nu.marginalia.index.forward.ForwardIndexConverter; import nu.marginalia.index.forward.ForwardIndexFileNames; +import nu.marginalia.index.forward.construction.ForwardIndexConverter; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.journal.IndexJournal; import nu.marginalia.index.journal.IndexJournalSlopWriter; diff --git a/code/libraries/coded-sequence/java/nu/marginalia/sequence/SequenceOperations.java b/code/libraries/coded-sequence/java/nu/marginalia/sequence/SequenceOperations.java index 7a026862..11df084e 100644 --- a/code/libraries/coded-sequence/java/nu/marginalia/sequence/SequenceOperations.java +++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/SequenceOperations.java @@ -1,6 +1,8 @@ package nu.marginalia.sequence; +import it.unimi.dsi.fastutil.ints.IntArrayList; import it.unimi.dsi.fastutil.ints.IntIterator; +import it.unimi.dsi.fastutil.ints.IntList; public class SequenceOperations { @@ -30,7 +32,7 @@ public class SequenceOperations { if (values[i] == max) { successes++; } else { - successes = 0; + successes = 1; // Discard values until we reach the maximum value seen so far, // or until the end of the sequence is reached @@ -49,6 +51,63 @@ public class SequenceOperations { return true; } + public static IntList findIntersections(IntIterator... sequences) { + + if (sequences.length <= 1) + return IntList.of(); + + // Initialize values and find the maximum value + int[] values = new int[sequences.length]; + + for (int i = 0; i < sequences.length; i++) { + if (sequences[i].hasNext()) + values[i] = sequences[i].nextInt(); + else + return IntList.of(); + } + + // Intersect the sequences by advancing all values smaller than the maximum seen so far + // until they are equal to the maximum value, or until the end of the sequence is reached + int max = Integer.MIN_VALUE; + int successes = 0; + + IntList ret = new IntArrayList(); + + outer: + for (int i = 0;; i = (i + 1) % sequences.length) + { + if (successes == sequences.length) { + ret.add(max); + successes = 1; + + if (sequences[i].hasNext()) { + max = sequences[i].nextInt(); + } else { + break; + } + } else if (values[i] == max) { + successes++; + } else { + successes = 1; + + // Discard values until we reach the maximum value seen so far, + // or until the end of the sequence is reached + while (values[i] < max) { + if (sequences[i].hasNext()) { + values[i] = sequences[i].nextInt(); + } else { + break outer; + } + } + + // Update the maximum value, if necessary + max = Math.max(max, values[i]); + } + } + + return ret; + } + /** Return the minimum word distance between two sequences, or a negative value if either sequence is empty. * */ public static int minDistance(IntIterator seqA, IntIterator seqB) diff --git a/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitReader.java b/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitReader.java index 03f553c2..756ed7ab 100644 --- a/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitReader.java +++ b/code/libraries/coded-sequence/java/nu/marginalia/sequence/io/BitReader.java @@ -162,7 +162,15 @@ public class BitReader { } else { // There's no more data to read! refillCallback.run(); - readNext(); + if (underlying.hasRemaining()) { + readNext(); + } + else { + // We've attempted to re-fill the buffer, but there's still no data to read, so we fail to avoid + // blowing up the stack with recursion + throw new IllegalStateException("No more data to read after attempted re-fill of underlying buffer"); + } + } } } diff --git a/code/libraries/coded-sequence/test/nu/marginalia/sequence/SequenceOperationsTest.java b/code/libraries/coded-sequence/test/nu/marginalia/sequence/SequenceOperationsTest.java index dbae6f29..e77ce0c5 100644 --- a/code/libraries/coded-sequence/test/nu/marginalia/sequence/SequenceOperationsTest.java +++ b/code/libraries/coded-sequence/test/nu/marginalia/sequence/SequenceOperationsTest.java @@ -1,6 +1,6 @@ package nu.marginalia.sequence; -import it.unimi.dsi.fastutil.ints.IntIterator; +import it.unimi.dsi.fastutil.ints.IntList; import org.junit.jupiter.api.Test; import java.nio.ByteBuffer; @@ -63,6 +63,17 @@ class SequenceOperationsTest { assertTrue(SequenceOperations.intersectSequences(seq1.iterator(), seq2.iterator(), seq3.iterator())); } + @Test + void intersectSequencesDeepMatch3findIntersections() { + ByteBuffer wa = ByteBuffer.allocate(1024); + GammaCodedSequence seq1 = GammaCodedSequence.generate(wa, 1, 3, 4, 7, 8, 9, 10, 11); + GammaCodedSequence seq2 = GammaCodedSequence.generate(wa, 2, 5, 8, 10, 14); + GammaCodedSequence seq3 = GammaCodedSequence.generate(wa, 1, 5, 8, 9, 10); + + assertEquals(IntList.of(8, 10), SequenceOperations.findIntersections(seq1.iterator(), seq2.iterator(), seq3.iterator())); + } + + @Test void intersectSequencesDeepMismatch() { ByteBuffer wa = ByteBuffer.allocate(1024); diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlStringTagger.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlStringTagger.java index 283e8959..d6b83f56 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlStringTagger.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlStringTagger.java @@ -54,8 +54,8 @@ public class HtmlStringTagger implements NodeVisitor { case "code" -> pushTag(HtmlTag.CODE, el); case "title" -> pushTag(HtmlTag.TITLE, el); case "nav" -> pushTag(HtmlTag.NAV, el); - case "header" -> pushTag(HtmlTag.HEADER, el); - case "footer" -> pushTag(HtmlTag.FOOTER, el); + case "header" -> pushTag(HtmlTag.PAGE_HEADER, el); + case "footer" -> pushTag(HtmlTag.PAGE_FOOTER, el); case "h1", "h2", "h3", "h4", "h5", "h6" -> pushTag(HtmlTag.HEADING, el); } } diff --git a/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTag.java b/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTag.java index 51396990..f01f8461 100644 --- a/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTag.java +++ b/code/libraries/language-processing/java/nu/marginalia/language/sentence/tag/HtmlTag.java @@ -1,21 +1,21 @@ package nu.marginalia.language.sentence.tag; public enum HtmlTag { - SCRIPT('s', true, false), - STYLE('S', true, false), - CODE('c', false, true), - PRE('p', false, true), - TITLE('t', false, false), - HEADING('h', false, false), - NAV('n', false, false), - HEADER('H',false, false), - FOOTER('f', false, false); + SCRIPT((byte) 's', true, false), + STYLE((byte) 'S', true, false), + CODE((byte) 'c', false, true), + PRE((byte) 'p', false, true), + TITLE((byte) 't', false, false), + HEADING((byte) 'h', false, false), + NAV((byte) 'n', false, false), + PAGE_HEADER((byte) 'H',false, false), + PAGE_FOOTER((byte) 'f', false, false); - public char code; + public byte code; public boolean exclude; public boolean nonLanguage; - HtmlTag(char code, boolean exclude, boolean nonLanguage) { + HtmlTag(byte code, boolean exclude, boolean nonLanguage) { this.code = code; this.exclude = exclude; this.nonLanguage = nonLanguage; diff --git a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java index d73495be..693e94a2 100644 --- a/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java +++ b/code/processes/converting-process/ft-keyword-extraction/java/nu/marginalia/keyword/model/DocumentKeywordsBuilder.java @@ -144,7 +144,7 @@ public class DocumentKeywordsBuilder { public void addSpans(List newSpans) { for (var span : newSpans) { - wordSpans.computeIfAbsent(span.tag().code, k -> new ArrayList<>()).add(span); + wordSpans.computeIfAbsent((char) span.tag().code, k -> new ArrayList<>()).add(span); } } diff --git a/code/processes/index-constructor-process/java/nu/marginalia/index/IndexConstructorMain.java b/code/processes/index-constructor-process/java/nu/marginalia/index/IndexConstructorMain.java index 6c55db6c..ef93b554 100644 --- a/code/processes/index-constructor-process/java/nu/marginalia/index/IndexConstructorMain.java +++ b/code/processes/index-constructor-process/java/nu/marginalia/index/IndexConstructorMain.java @@ -9,8 +9,8 @@ import nu.marginalia.ProcessConfigurationModule; import nu.marginalia.index.construction.full.FullIndexConstructor; import nu.marginalia.index.construction.prio.PrioIndexConstructor; import nu.marginalia.index.domainrankings.DomainRankings; -import nu.marginalia.index.forward.ForwardIndexConverter; import nu.marginalia.index.forward.ForwardIndexFileNames; +import nu.marginalia.index.forward.construction.ForwardIndexConverter; import nu.marginalia.index.journal.IndexJournal; import nu.marginalia.model.gson.GsonFactory; import nu.marginalia.model.id.UrlIdCodec; diff --git a/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java b/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java index 7ec8841b..820525b9 100644 --- a/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java +++ b/code/tools/integration-test/test/nu/marginalia/IntegrationTest.java @@ -18,8 +18,8 @@ import nu.marginalia.index.ReverseIndexPrioFileNames; import nu.marginalia.index.construction.full.FullIndexConstructor; import nu.marginalia.index.construction.prio.PrioIndexConstructor; import nu.marginalia.index.domainrankings.DomainRankings; -import nu.marginalia.index.forward.ForwardIndexConverter; import nu.marginalia.index.forward.ForwardIndexFileNames; +import nu.marginalia.index.forward.construction.ForwardIndexConverter; import nu.marginalia.index.index.StatefulIndex; import nu.marginalia.index.journal.IndexJournal; import nu.marginalia.index.model.SearchParameters;